From a488113062b7231197ace8522ab3cab535c77d0b Mon Sep 17 00:00:00 2001
From: Shangdi Yu
Date: Wed, 31 Jul 2024 23:28:24 +0000
Subject: [PATCH] [AOTI] Fix bfloat16 in CPU (#132150)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes #122986

- Add "typedef at::BFloat16 bfloat16;" to the header of the generated cpp file
- Suppress warning: comparison of integer expressions of different signedness: ‘long unsigned int’ and ‘int64_t’ {aka ‘long int’} [-Wsign-compare]
  436 |   if (tensor.numel() != numel) {

Pull Request resolved: https://github.com/pytorch/pytorch/pull/132150
Approved by: https://github.com/chenyang78, https://github.com/desertfire
---
 test/inductor/test_aot_inductor.py                      | 5 -----
 torch/_inductor/codegen/aoti_runtime/implementation.cpp | 2 +-
 torch/_inductor/codegen/cpp_wrapper_cpu.py              | 2 ++
 torch/_inductor/codegen/cpp_wrapper_cuda.py             | 2 --
 4 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py
index d65e9f2f25595..95829968c6572 100644
--- a/test/inductor/test_aot_inductor.py
+++ b/test/inductor/test_aot_inductor.py
@@ -3252,11 +3252,6 @@ def fail_non_abi_compatible_cuda(is_skip=False):
     "test_index_put_with_none_index": fail_minimal_arrayref_interface(is_skip=True),
     # FIXME: failed with Segfault while exiting the Python runtime
     "test_constant": fail_stack_allocation(is_skip=True),
-    # C++ compile error, need for aoti_torch___scaled_dot_product_flash_attention_for_cpu
-    # https://github.com/pytorch/pytorch/issues/122986
-    "test_sdpa": fail_with_and_without_stack_allocation(is_skip=True),
-    # The same issue as https://github.com/pytorch/pytorch/issues/122986
-    "test_sdpa_2": fail_with_and_without_stack_allocation(is_skip=True),
     # Looks like the same issue as https://github.com/pytorch/pytorch/issues/122978
     "test_shifted_constraint_ranges": fail_with_and_without_stack_allocation(
         is_skip=True
diff --git a/torch/_inductor/codegen/aoti_runtime/implementation.cpp b/torch/_inductor/codegen/aoti_runtime/implementation.cpp
index 4869825cadabd..0273aa9aa8df0 100644
--- a/torch/_inductor/codegen/aoti_runtime/implementation.cpp
+++ b/torch/_inductor/codegen/aoti_runtime/implementation.cpp
@@ -76,7 +76,7 @@ void convert_handles_to_inputs(
 }
 
 template <typename T>
-void assert_numel(const ArrayRefTensor<T>& tensor, int64_t numel) {
+void assert_numel(const ArrayRefTensor<T>& tensor, uint64_t numel) {
   if (tensor.numel() != numel) {
     std::stringstream err;
     err << "incorrect numel for input tensor. expected " << numel << ", got " << tensor.numel();
diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu.py b/torch/_inductor/codegen/cpp_wrapper_cpu.py
index b95772ebc4cd3..7123f3d0ed8d1 100644
--- a/torch/_inductor/codegen/cpp_wrapper_cpu.py
+++ b/torch/_inductor/codegen/cpp_wrapper_cpu.py
@@ -170,6 +170,8 @@ def write_header(self):
                 #include
                 """
             )
+            self.header.splice("typedef at::Half half;")
+            self.header.splice("typedef at::BFloat16 bfloat16;")
         else:
             self.header.splice(
                 """
diff --git a/torch/_inductor/codegen/cpp_wrapper_cuda.py b/torch/_inductor/codegen/cpp_wrapper_cuda.py
index 8eed428de07a9..bb6c508233c9d 100644
--- a/torch/_inductor/codegen/cpp_wrapper_cuda.py
+++ b/torch/_inductor/codegen/cpp_wrapper_cuda.py
@@ -43,8 +43,6 @@ def write_header(self):
         super().write_header()
 
         self.header.splice("#include ")
-        self.header.splice("typedef at::Half half;")
-        self.header.splice("typedef at::BFloat16 bfloat16;")
         if config.abi_compatible:
             self.header.splice(
                 "#include "
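
A minimal standalone sketch of why the emitted typedefs matter (the includes,
main(), and literal values below are illustrative assumptions, not the actual
generated wrapper): Inductor-generated CPU C++ spells the reduced-precision
dtypes as "half" and "bfloat16", while ATen/c10 define them as at::Half and
at::BFloat16, so without the typedefs the generated file fails to compile.

    // Sketch only: mirrors the typedefs the CPU wrapper header now emits.
    #include <c10/util/BFloat16.h>
    #include <c10/util/Half.h>

    typedef c10::Half half;          // wrapper emits "typedef at::Half half;"
    typedef c10::BFloat16 bfloat16;  // wrapper emits "typedef at::BFloat16 bfloat16;"

    int main() {
      bfloat16 x(1.5f);  // constructible from float, as generated kernels expect
      half y(0.25f);
      float sum = static_cast<float>(x) + static_cast<float>(y);
      return sum > 0.0f ? 0 : 1;
    }

Since CppWrapperCuda.write_header() calls super().write_header(), emitting the
typedefs from the CPU wrapper presumably covers the CUDA path as well, which is
why the duplicate splices are dropped there. The assert_numel() change is
independent: taking the expected count as uint64_t keeps both sides of the
comparison with tensor.numel() unsigned, silencing -Wsign-compare.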