-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #12 from LoopModels/simd
SIMD
- Loading branch information
Showing
68 changed files
with
7,901 additions
and
1,194 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
cmake_minimum_required(VERSION 3.23)

project(MathBenchmarks LANGUAGES C CXX)

option(ENABLE_NATIVE_COMPILATION "Compile with -march=native" ON)
option(ENABLE_WIDE_VECTORS "Compile with 512bit vectors if available" ON)
option(ENABLE_OPENMP "Use OpenMP for a multithreading benchmark" OFF)

# --- Import tools ----

include(../cmake/tools.cmake)

# ---- Dependencies ----

include(../cmake/CPM.cmake)

# Required for check_cxx_compiler_flag() used below; previously this module was
# never included, so the flag check only worked if tools.cmake pulled it in.
include(CheckCXXCompilerFlag)

# ---- compile_commands.json ----
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

CPMAddPackage(
  NAME benchmark
  GITHUB_REPOSITORY google/benchmark
  VERSION 1.8.3
  OPTIONS "BENCHMARK_ENABLE_TESTING Off" "BENCHMARK_ENABLE_LIBPFM On" "BENCHMARK_ENABLE_WERROR Off"
          "BENCHMARK_ENABLE_EXCEPTIONS Off" SYSTEM TRUE
)
if(benchmark_ADDED)
  # enable c++11 to avoid compilation errors
  set_target_properties(benchmark PROPERTIES CXX_STANDARD 11)
endif()

# FetchContent_Declare( Math GIT_REPOSITORY git@github.com:LoopModels/Math.git GIT_TAG origin/main )
# FetchContent_MakeAvailable(Math)

# file(GLOB_RECURSE headers CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/include/*.hpp)
file(GLOB benchmarks CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)

add_executable(${PROJECT_NAME} ${benchmarks})

message(STATUS "PROJECT_BINARY_DIR: ${PROJECT_BINARY_DIR}")
message(STATUS "PROJECT_SOURCE_DIR: ${PROJECT_SOURCE_DIR}")
message(STATUS "LLVM_INCLUDE_DIRS: ${LLVM_INCLUDE_DIRS}")

target_include_directories(
  ${PROJECT_NAME} PRIVATE ${PROJECT_SOURCE_DIR}/../include # ${PROJECT_SOURCE_DIR}/include
)
if(ENABLE_OPENMP)
  # REQUIRED: we link the imported target unconditionally on the next line, so
  # a silent not-found would only fail later with a confusing error.
  find_package(OpenMP REQUIRED)
  # PRIVATE keyword is mandatory here: mixing the plain signature with the
  # keyword signature used below is a hard CMake error on the same target.
  target_link_libraries(${PROJECT_NAME} PRIVATE OpenMP::OpenMP_CXX)
endif()
target_link_libraries(${PROJECT_NAME} PRIVATE benchmark::benchmark_main)

if((CMAKE_CXX_COMPILER_ID MATCHES "Clang") OR (CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM"))
  target_compile_options(${PROJECT_NAME} PRIVATE -ferror-limit=2 -fcolor-diagnostics)
elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
  target_compile_options(
    ${PROJECT_NAME} PRIVATE -fmax-errors=2 -fconcepts-diagnostics-depth=4
                            -fno-semantic-interposition -fdiagnostics-color=always -fverbose-asm
  )
endif()

if(ENABLE_NATIVE_COMPILATION)
  if(CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")
    target_compile_options(${PROJECT_NAME} PRIVATE -xhost)
    if(ENABLE_WIDE_VECTORS)
      target_compile_options(${PROJECT_NAME} PRIVATE -qopt-zmm-usage=high)
    endif()
  else()
    target_compile_options(${PROJECT_NAME} PRIVATE -march=native) # -fno-unroll-loops)
    if(ENABLE_WIDE_VECTORS)
      check_cxx_compiler_flag("-mprefer-vector-width=512" VEC512)
      if(VEC512)
        target_compile_options(${PROJECT_NAME} PRIVATE -mprefer-vector-width=512)
      endif()
    endif()
  endif()
endif()
set_target_properties(
  ${PROJECT_NAME}
  PROPERTIES CXX_STANDARD 23
             CXX_STANDARD_REQUIRED ON
             CXX_VISIBILITY_PRESET hidden
             VISIBILITY_INLINES_HIDDEN ON
)
# NOTE(review): a previous version set ENVIRONMENT/WORKING_DIRECTORY here via
# set_target_properties; those are *test* properties (set_tests_properties),
# not target properties, and had no effect on this executable, so the call was
# removed.

target_compile_options(
  ${PROJECT_NAME}
  PRIVATE -fno-exceptions
          -fno-rtti
          -fstrict-aliasing
          -fno-plt
          -fstrict-overflow
          -fomit-frame-pointer
          -fno-signed-zeros
          -fassociative-math
          -ffinite-math-only
          -funsafe-math-optimizations
          -fno-trapping-math
          -Wall
          -Wshadow
          -Wextra
          -save-temps
          -Werror
)
if(ENABLE_OPENMP)
  if(CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")
    target_compile_options(${PROJECT_NAME} PRIVATE -fiopenmp)
  else()
    target_compile_options(${PROJECT_NAME} PRIVATE -fopenmp)
  endif()
else()
  # No OpenMP runtime: still honor `#pragma omp simd` vectorization hints.
  target_compile_options(${PROJECT_NAME} PRIVATE -fopenmp-simd)
endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
  target_compile_options(${PROJECT_NAME} PRIVATE -masm=intel)
endif()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,195 @@ | ||
|
||
#include "include/randdual.hpp" | ||
#include <Math/Array.hpp> | ||
#include <Math/Dual.hpp> | ||
#include <Math/LinearAlgebra.hpp> | ||
#include <Math/Matrix.hpp> | ||
#include <Math/StaticArrays.hpp> | ||
#include <Utilities/Invariant.hpp> | ||
#include <algorithm> | ||
#include <array> | ||
#include <benchmark/benchmark.h> | ||
#include <concepts> | ||
#include <cstdint> | ||
#include <random> | ||
#include <ranges> | ||
|
||
using poly::math::Dual, poly::math::SquareMatrix, poly::math::SquareDims, | ||
poly::math::I, poly::math::URand; | ||
|
||
// Compute the product of `x` and `y`, storing it through `out`.
// `noinline` forces a real call boundary so the benchmark loops below measure
// the multiply itself instead of letting it fold into the loop.
[[gnu::noinline]] void prod(auto &out, const auto &x, const auto &y) {
  out = x * y;
}
static void BM_dual8x2prod(benchmark::State &state) { | ||
std::mt19937_64 rng0; | ||
using D = Dual<Dual<double, 8>, 2>; | ||
D a = URand<D>{}(rng0), b = URand<D>{}(rng0), c; | ||
for (auto _ : state) { | ||
prod(c, a, b); | ||
benchmark::DoNotOptimize(c); | ||
} | ||
} | ||
BENCHMARK(BM_dual8x2prod); | ||
|
||
template <typename T, ptrdiff_t N, bool SIMDArry = false> struct ManualDual { | ||
T value; | ||
poly::math::SVector<T, N> partials; | ||
}; | ||
|
||
template <std::floating_point T, ptrdiff_t N> struct ManualDual<T, N, false> { | ||
T value; | ||
poly::simd::Vec<N, T> partials; | ||
}; | ||
template <std::floating_point T, ptrdiff_t N> struct ManualDual<T, N, true> { | ||
T value; | ||
poly::math::StaticArray<T, 1, N, false> partials; | ||
}; | ||
template <typename T, ptrdiff_t N, bool B> | ||
[[gnu::always_inline]] constexpr auto operator*(const ManualDual<T, N, B> &a, | ||
const ManualDual<T, N, B> &b) | ||
-> ManualDual<T, N, B> { | ||
return {a.value * b.value, a.value * b.partials + b.value * a.partials}; | ||
} | ||
template <typename T, ptrdiff_t N, bool B> | ||
[[gnu::always_inline]] constexpr auto operator*(const ManualDual<T, N, B> &a, | ||
const T &b) | ||
-> ManualDual<T, N, B> { | ||
return {a.value * b, b * a.partials}; | ||
} | ||
template <typename T, ptrdiff_t N, bool B> | ||
[[gnu::always_inline]] constexpr auto operator*(const T &a, | ||
const ManualDual<T, N, B> &b) | ||
-> ManualDual<T, N, B> { | ||
return {b.value * a, a * b.partials}; | ||
} | ||
template <typename T, ptrdiff_t N, bool B> | ||
[[gnu::always_inline]] constexpr auto operator+(const ManualDual<T, N, B> &a, | ||
const ManualDual<T, N, B> &b) | ||
-> ManualDual<T, N, B> { | ||
return {a.value + b.value, a.partials + b.partials}; | ||
} | ||
template <typename T, ptrdiff_t N, bool B> | ||
[[gnu::always_inline]] constexpr auto operator+(const ManualDual<T, N, B> &a, | ||
const T &b) | ||
-> ManualDual<T, N, B> { | ||
return {a.value + b, a.partials}; | ||
} | ||
template <typename T, ptrdiff_t N, bool B> | ||
[[gnu::always_inline]] constexpr auto operator+(const T &a, | ||
const ManualDual<T, N, B> &b) | ||
-> ManualDual<T, N, B> { | ||
return {b.value + a, b.partials}; | ||
} | ||
// template <typename T, ptrdiff_t M, ptrdiff_t N> | ||
// [[gnu::noinline]] void prod_manual(ManualDual<ManualDual<T, M>, N> &c, | ||
// const ManualDual<ManualDual<T, M>, N> &a, | ||
// const ManualDual<ManualDual<T, M>, N> &b) | ||
// { | ||
// // return {val * other.val, val * other.partials + other.val * partials}; | ||
// c.value= a.value* b.value; | ||
// c.partials = a.value* b.partials+ b.value* a.partials; | ||
// } | ||
|
||
template <ptrdiff_t M, ptrdiff_t N, bool SIMDArray> auto setup_manual() { | ||
using D = ManualDual<ManualDual<double, M, SIMDArray>, N>; | ||
std::mt19937_64 rng0; | ||
D a{}, b{}, c{}; | ||
a.value.value = URand<double>{}(rng0); | ||
b.value.value = URand<double>{}(rng0); | ||
for (ptrdiff_t j = 0; j < M; ++j) { | ||
a.value.partials[j] = URand<double>{}(rng0); | ||
b.value.partials[j] = URand<double>{}(rng0); | ||
} | ||
for (ptrdiff_t i = 0; i < N; ++i) { | ||
a.partials[i].value = URand<double>{}(rng0); | ||
b.partials[i].value = URand<double>{}(rng0); | ||
for (ptrdiff_t j = 0; j < M; ++j) { | ||
a.partials[i].partials[j] = URand<double>{}(rng0); | ||
b.partials[i].partials[j] = URand<double>{}(rng0); | ||
} | ||
} | ||
return std::array<D, 3>{a, b, c}; | ||
} | ||
|
||
static void BM_dual8x2prod_manual(benchmark::State &state) { | ||
auto [a, b, c] = setup_manual<8, 2, false>(); | ||
for (auto _ : state) { | ||
prod(c, a, b); | ||
benchmark::DoNotOptimize(c); | ||
} | ||
} | ||
BENCHMARK(BM_dual8x2prod_manual); | ||
|
||
static void BM_dual8x2prod_simdarray(benchmark::State &state) { | ||
auto [a, b, c] = setup_manual<8, 2, true>(); | ||
for (auto _ : state) { | ||
prod(c, a, b); | ||
benchmark::DoNotOptimize(c); | ||
} | ||
} | ||
BENCHMARK(BM_dual8x2prod_simdarray); | ||
|
||
static void BM_dual7x2prod(benchmark::State &state) { | ||
std::mt19937_64 rng0; | ||
using D = Dual<Dual<double, 7>, 2>; | ||
static_assert(std::same_as<double, poly::math::scalarize_elt_cast_t< | ||
Dual<Dual<double, 7, true>, 2, true>>>); | ||
// static_assert(sizeof(D) == sizeof(Dual<Dual<double, 8>, 2>)); | ||
static_assert(poly::utils::Compressible<D>); | ||
D a = URand<D>{}(rng0), b = URand<D>{}(rng0), c; | ||
for (auto _ : state) { | ||
prod(c, a, b); | ||
benchmark::DoNotOptimize(c); | ||
} | ||
} | ||
BENCHMARK(BM_dual7x2prod); | ||
|
||
static void BM_dual7x2prod_manual(benchmark::State &state) { | ||
auto [a, b, c] = setup_manual<7, 2, false>(); | ||
for (auto _ : state) { | ||
prod(c, a, b); | ||
benchmark::DoNotOptimize(c); | ||
} | ||
} | ||
BENCHMARK(BM_dual7x2prod_manual); | ||
|
||
static void BM_dual7x2prod_simdarray(benchmark::State &state) { | ||
auto [a, b, c] = setup_manual<7, 2, true>(); | ||
for (auto _ : state) { | ||
prod(c, a, b); | ||
benchmark::DoNotOptimize(c); | ||
} | ||
} | ||
BENCHMARK(BM_dual7x2prod_simdarray); | ||
|
||
static void BM_dual6x2prod(benchmark::State &state) { | ||
std::mt19937_64 rng0; | ||
using D = Dual<Dual<double, 6>, 2>; | ||
// static_assert(sizeof(D) == sizeof(Dual<Dual<double, 8>, 2>)); | ||
static_assert(poly::utils::Compressible<D>); | ||
D a = URand<D>{}(rng0), b = URand<D>{}(rng0), c; | ||
for (auto _ : state) { | ||
prod(c, a, b); | ||
benchmark::DoNotOptimize(c); | ||
} | ||
} | ||
BENCHMARK(BM_dual6x2prod); | ||
|
||
static void BM_dual6x2prod_manual(benchmark::State &state) { | ||
auto [a, b, c] = setup_manual<6, 2, false>(); | ||
for (auto _ : state) { | ||
prod(c, a, b); | ||
benchmark::DoNotOptimize(c); | ||
} | ||
} | ||
BENCHMARK(BM_dual6x2prod_manual); | ||
|
||
static void BM_dual6x2prod_simdarray(benchmark::State &state) { | ||
auto [a, b, c] = setup_manual<6, 2, true>(); | ||
for (auto _ : state) { | ||
prod(c, a, b); | ||
benchmark::DoNotOptimize(c); | ||
} | ||
} | ||
BENCHMARK(BM_dual6x2prod_simdarray); |
Oops, something went wrong.