Skip to content

Commit

Permalink
Windows: Enable building for latest torch-xpu-ops (#865)
Browse files Browse the repository at this point in the history
In the latest torch-xpu-ops, we have split libtorch_xpu.so into multiple
libraries to avoid a single binary of excessive size. This change to the
build system leads to two issues on Windows, which this PR resolves:
1) cyclic dependency, 2) symbol visibility.

Signed-off-by: Feng Yuan <[email protected]>
  • Loading branch information
fengyuan14 committed Sep 5, 2024
1 parent d49f1b4 commit 6636a06
Show file tree
Hide file tree
Showing 254 changed files with 970 additions and 479 deletions.
6 changes: 6 additions & 0 deletions cmake/Modules/FindSYCL/run_sycl.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,12 @@ endforeach()
# Choose host flags in FindSYCL.cmake
@SYCL_host_flags@

# Add the /permissive- (standards-conformance) flag for MSVC builds to overcome an ambiguous symbol error.
if(WIN32)
string(APPEND SYCL_host_compiler_flags "/permissive- ")
endif()


list(REMOVE_DUPLICATES CMAKE_HOST_FLAGS)
foreach(flag ${CMAKE_HOST_FLAGS})
# Extra quotes are added around each flag to help SYCL parse out flags with spaces.
Expand Down
5 changes: 4 additions & 1 deletion src/ATen/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# ATen XPU sources

file(GLOB xpu_cpp "xpu/*.cpp" "native/xpu/*.cpp" "native/sparse/*.cpp")
file(GLOB xpu_cpp "xpu/*.cpp")
file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp")
file(GLOB xpu_sycl "native/xpu/sycl/*.cpp")

list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp})
list(APPEND ATen_XPU_NATIVE_CPP_SRCS ${xpu_native_cpp})
list(APPEND ATen_XPU_SYCL_SRCS ${xpu_sycl})

set(ATen_XPU_CPP_SRCS ${ATen_XPU_CPP_SRCS} PARENT_SCOPE)
set(ATen_XPU_NATIVE_CPP_SRCS ${ATen_XPU_NATIVE_CPP_SRCS} PARENT_SCOPE)
set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE)
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/RangeFactories.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Tensor& XPUNativeFunctions::arange_out(

TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
TORCH_CHECK(
std::isfinite(xstart) && std::isfinite(xend),
std::isfinite(static_cast<double>(xstart)) && std::isfinite(static_cast<double>(xend)),
"unsupported range: ",
xstart,
" -> ",
Expand Down Expand Up @@ -97,7 +97,7 @@ Tensor& XPUNativeFunctions::range_out(

TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
TORCH_CHECK(
std::isfinite(xstart) && std::isfinite(xend),
std::isfinite(static_cast<double>(xstart)) && std::isfinite(static_cast<double>(xend)),
"unsupported range: ",
xstart,
" -> ",
Expand Down
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/AbsKernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/AbsKernel.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/AbsKernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@

namespace at::native::xpu {

void abs_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void abs_kernel(TensorIteratorBase& iter);

} // namespace at::native::xpu
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationEluKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/ActivationEluKernels.h>

namespace at::native::xpu {

template <typename scalar_t, typename opmath_t>
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationEluKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@

namespace at::native::xpu {

void elu_kernel(
TORCH_XPU_API void elu_kernel(
TensorIteratorBase& iter,
const Scalar& alpha,
const Scalar& scale,
const Scalar& input_scale);

void elu_backward_kernel(
TORCH_XPU_API void elu_backward_kernel(
TensorIteratorBase& iter,
const Scalar& alpha,
const Scalar& scale,
Expand Down
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <ATen/native/xpu/sycl/Loops.h>
#include <comm/XPUMathCompat.h>

#include <ATen/native/xpu/sycl/ActivationGeluKernel.h>

namespace at {
namespace native {
namespace xpu {
Expand Down
6 changes: 4 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationGeluKernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ namespace at {
namespace native {
namespace xpu {

void gelu_kernel(TensorIteratorBase& iter, c10::string_view approximate);
TORCH_XPU_API void gelu_kernel(
TensorIteratorBase& iter,
c10::string_view approximate);

void gelu_backward_kernel(
TORCH_XPU_API void gelu_backward_kernel(
TensorIteratorBase& iter,
c10::string_view approximate);

Expand Down
3 changes: 3 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationGluKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@
#include <ATen/OpMathType.h>
#include <ATen/TensorIterator.h>

#include <ATen/native/xpu/sycl/Loops.h>
#include <ATen/native/xpu/sycl/Loops.h>
#include <comm/SYCLContext.h>

#include <ATen/native/xpu/sycl/ActivationGluKernels.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationGluKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

namespace at::native::xpu {

void glu_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void glu_kernel(TensorIteratorBase& iter);

void glu_backward_kernel(
TORCH_XPU_API void glu_backward_kernel(
const TensorIteratorBase& iter,
int64_t gI_stride,
int64_t I_stride);
Expand Down
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h>

namespace at::native::xpu {

template <typename scalar_t, typename opmath_t>
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

namespace at::native::xpu {

void hardsigmoid_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void hardsigmoid_kernel(TensorIteratorBase& iter);

void hardsigmoid_backward_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void hardsigmoid_backward_kernel(TensorIteratorBase& iter);

} // namespace at::native::xpu
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include <ATen/native/xpu/sycl/Loops.h>
#include <comm/XPUMathCompat.h>

#include <ATen/native/xpu/sycl/ActivationHardswishKernels.h>

namespace at {
namespace native {
namespace xpu {
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationHardswishKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ namespace at {
namespace native {
namespace xpu {

void hardswish_kernel(TensorIterator& iter);
TORCH_XPU_API void hardswish_kernel(TensorIterator& iter);

void hardswish_backward_kernel(TensorIterator& iter);
TORCH_XPU_API void hardswish_backward_kernel(TensorIterator& iter);

} // namespace xpu
} // namespace native
Expand Down
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <ATen/native/xpu/sycl/Loops.h>
#include <comm/XPUMathCompat.h>

#include <ATen/native/xpu/sycl/ActivationHardtanhKernels.h>

namespace at {
namespace native {
namespace xpu {
Expand Down
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ namespace at {
namespace native {
namespace xpu {

void hardtanh_backward_kernel(
TORCH_XPU_API void hardtanh_backward_kernel(
TensorIterator& iter,
const Scalar& min,
const Scalar& max);
Expand Down
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/ActivationLeakyReluKernels.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down
6 changes: 4 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@

namespace at::native::xpu {

void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_);
TORCH_XPU_API void leaky_relu_kernel(
TensorIteratorBase& iter,
const Scalar& negval_);

void leaky_relu_backward_kernel(
TORCH_XPU_API void leaky_relu_backward_kernel(
TensorIteratorBase& iter,
const Scalar& negval_);

Expand Down
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

namespace at::native::xpu {

void log_sigmoid_forward_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void log_sigmoid_forward_kernel(TensorIteratorBase& iter);

void log_sigmoid_backward_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void log_sigmoid_backward_kernel(TensorIteratorBase& iter);

} // namespace at::native::xpu
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationMishKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include <ATen/native/xpu/sycl/Loops.h>
#include <comm/XPUMathCompat.h>

#include <ATen/native/xpu/sycl/ActivationMishKernels.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationMishKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

namespace at::native::xpu {

void mish_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void mish_kernel(TensorIteratorBase& iter);

void mish_backward_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void mish_backward_kernel(TensorIteratorBase& iter);

} // namespace at::native::xpu
4 changes: 3 additions & 1 deletion src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/ActivationPreluKernels.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down Expand Up @@ -40,4 +42,4 @@ void prelu_backward_kernel(TensorIterator& iter) {
});
}

} // namespace at::native::xpu
} // namespace at::native::xpu
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationPreluKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

namespace at::native::xpu {

void prelu_kernel(TensorIterator& iter);
TORCH_XPU_API void prelu_kernel(TensorIterator& iter);

void prelu_backward_kernel(TensorIterator& iter);
TORCH_XPU_API void prelu_backward_kernel(TensorIterator& iter);

} // namespace at::native::xpu
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <ATen/native/xpu/sycl/Loops.h>
#include <comm/XPUMathCompat.h>

#include <ATen/native/xpu/sycl/ActivationSiluKernels.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationSiluKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

namespace at::native::xpu {

void silu_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void silu_kernel(TensorIteratorBase& iter);

void silu_backward_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void silu_backward_kernel(TensorIteratorBase& iter);

} // namespace at::native::xpu
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/ActivationSoftplusKernels.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

namespace at::native::xpu {

void softplus_kernel(
TORCH_XPU_API void softplus_kernel(
TensorIteratorBase& iter,
const Scalar& beta_,
const Scalar& threshold_);

void softplus_backward_kernel(
TORCH_XPU_API void softplus_backward_kernel(
TensorIteratorBase& iter,
const Scalar& beta_,
const Scalar& threshold_);
Expand Down
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down
8 changes: 6 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@

namespace at::native::xpu {

void softshrink_kernel(TensorIteratorBase& iter, const Scalar& value);
TORCH_XPU_API void softshrink_kernel(
TensorIteratorBase& iter,
const Scalar& value);

void softshrink_backward_kernel(TensorIteratorBase& iter, const Scalar& value);
TORCH_XPU_API void softshrink_backward_kernel(
TensorIteratorBase& iter,
const Scalar& value);

} // namespace at::native::xpu
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include <ATen/native/TensorIterator.h>
#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/ActivationThresholdKernel.h>

namespace at {
namespace native {
namespace xpu {
Expand Down
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/ActivationThresholdKernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ namespace at {
namespace native {
namespace xpu {

void threshold_kernel(
TORCH_XPU_API void threshold_kernel(
TensorIteratorBase& iter,
const Scalar& threshold,
const Scalar& value);
Expand Down
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include <comm/MemoryFormat.h>
#include <vector>

#include <ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h>

namespace at::native::xpu {

using namespace at::xpu;
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

namespace at::native::xpu {

void adaptive_avg_pool2d_backward_kernel(
TORCH_XPU_API void adaptive_avg_pool2d_backward_kernel(
Tensor& gradInput,
const Tensor& gradOutput,
const Tensor& input);

void adaptive_avg_pool2d_kernel(
TORCH_XPU_API void adaptive_avg_pool2d_kernel(
Tensor& output,
const Tensor& input,
IntArrayRef output_size);
Expand Down
Loading

0 comments on commit 6636a06

Please sign in to comment.