Skip to content

Commit

Permalink
Merge pull request #12 from LoopModels/simd
Browse files Browse the repository at this point in the history
SIMD
  • Loading branch information
chriselrod authored Nov 27, 2023
2 parents 8e08ae3 + 1eef35b commit f332351
Show file tree
Hide file tree
Showing 68 changed files with 7,901 additions and 1,194 deletions.
17 changes: 16 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
all: clangnosan clangsan gccnosan gccsan
all: clangnosan clangsan gccnosan gccsan gccavx2 clangavx512
#TODO: re-enable GCC once multidimensional indexing in `requires` is fixed:
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111493

Expand All @@ -13,6 +13,13 @@ buildclang/nosan/:

buildclang/test/:
CXXFLAGS="" CXX=clang++ cmake -S test -B buildclang/test/ -DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZER='Address;Undefined'

buildgcc/avx2/:
CXXFLAGS="-march=haswell" CXX=g++ cmake -S test -B buildgcc/avx2/ -DCMAKE_BUILD_TYPE=Debug

buildclang/avx512/:
CXXFLAGS="-march=skylake-avx512 -mprefer-vector-width=512" CXX=clang++ cmake -S test -B buildclang/avx512/ -DCMAKE_BUILD_TYPE=Debug


gccnosan: buildgcc/nosan/
cmake --build buildgcc/nosan/
Expand All @@ -30,5 +37,13 @@ clangsan: buildclang/test/
cmake --build buildclang/test/
cmake --build buildclang/test/ --target test

gccavx2: buildgcc/avx2/
cmake --build buildgcc/avx2/
cmake --build buildgcc/avx2/ --target test

clangavx512: buildclang/avx512/
cmake --build buildclang/avx512/
cmake --build buildclang/avx512/ --target test

clean:
rm -r buildclang #buildgcc
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,7 @@ Copying, e.g. from an expression template to an array, is done via `destination
`operator=` is reserved for copying the actual objects, rather than the referenced memory, as this matches the behavior of C++ views.
Thus if `A` and `B` are views, `A = B` will make `A` the same view as `B`, while `A << B` will copy memory from `B` to `A`.

Vectors are interpreted as row vectors by default. `v.t()` or `transpose(v)` may be used to transpose.
`A[_,i]` is a column-vector, `A[i,_]` a row-vector.

This repository was created using the [ModernCppStarter](https://github.com/TheLartians/ModernCppStarter).
118 changes: 118 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Standalone benchmark project for the Math library; configured separately
# from the main build (e.g. `cmake -S benchmark -B buildbench`).
cmake_minimum_required(VERSION 3.23)

project(MathBenchmarks LANGUAGES C CXX)

# Codegen-related cache options. NOTE(review): these names are not
# project-prefixed, so they could collide with same-named options if this
# directory is ever added to a super-build — confirm that is never done.
option(ENABLE_NATIVE_COMPILATION "Compile with -march=native" ON)
option(ENABLE_WIDE_VECTORS "Compile with 512bit vectors if available" ON)
option(ENABLE_OPENMP "Use OpenMP for a multithreading benchmark" OFF)

# --- Import tools ----

include(../cmake/tools.cmake)

# ---- Dependencies ----

# CPM is a thin wrapper over FetchContent used below to obtain google/benchmark.
include(../cmake/CPM.cmake)

# ---- compile_commands.json ----
# Emit compile_commands.json for clangd/clang-tidy tooling.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Fetch google/benchmark. Its own tests, -Werror, and exceptions are disabled;
# libpfm support is enabled for hardware performance counters.
# NOTE(review): BENCHMARK_ENABLE_LIBPFM=On presumably requires libpfm to be
# installed on the build machine — confirm, or configuration of the
# dependency may fail. SYSTEM TRUE marks its headers as system headers so
# they do not trigger this project's warnings.
CPMAddPackage(
NAME benchmark
GITHUB_REPOSITORY google/benchmark
VERSION 1.8.3
OPTIONS "BENCHMARK_ENABLE_TESTING Off" "BENCHMARK_ENABLE_LIBPFM On" "BENCHMARK_ENABLE_WERROR Off"
"BENCHMARK_ENABLE_EXCEPTIONS Off" SYSTEM TRUE
)
if(benchmark_ADDED)
# Pin the benchmark library itself to C++11 to avoid compilation errors;
# this does not affect the C++23 standard used for the benchmarks below.
set_target_properties(benchmark PROPERTIES CXX_STANDARD 11)
endif()

# FetchContent_Declare( Math GIT_REPOSITORY [email protected]:LoopModels/Math.git GIT_TAG origin/main )
# FetchContent_MakeAvailable(Math)

# file(GLOB_RECURSE headers CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/include/*.hpp)
# Collect every *.cpp in this directory as a benchmark translation unit.
# CONFIGURE_DEPENDS re-runs the glob at build time so new files are noticed.
file(GLOB benchmarks CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)

# All benchmarks are linked into a single executable named after the project.
add_executable(${PROJECT_NAME} ${benchmarks})

# Configure-time diagnostics.
# NOTE(review): LLVM_INCLUDE_DIRS is not set anywhere in this file — it is
# presumably left over from another project; expect it to print empty.
message(STATUS "PROJECT_BINARY_DIR: ${PROJECT_BINARY_DIR}")
message(STATUS "PROJECT_SOURCE_DIR: ${PROJECT_SOURCE_DIR}")
message(STATUS "LLVM_INCLUDE_DIRS: ${LLVM_INCLUDE_DIRS}")

# The benchmarks include headers straight from the parent project's include/
# directory (this project lives in a subdirectory of the Math repo).
target_include_directories(
  ${PROJECT_NAME} PRIVATE ${PROJECT_SOURCE_DIR}/../include # ${PROJECT_SOURCE_DIR}/include
)
if(ENABLE_OPENMP)
  # REQUIRED: the OpenMP::OpenMP_CXX target is linked unconditionally below,
  # so a silent find_package failure would only surface later as a confusing
  # link error about an unknown target.
  find_package(OpenMP REQUIRED)
  # BUGFIX: the PRIVATE keyword is mandatory here. The benchmark_main link
  # below uses the keyword signature of target_link_libraries, and CMake
  # raises a fatal error when the plain and keyword signatures are mixed on
  # the same target — so with ENABLE_OPENMP=ON, configuration failed.
  target_link_libraries(${PROJECT_NAME} PRIVATE OpenMP::OpenMP_CXX)
endif()
target_link_libraries(${PROJECT_NAME} PRIVATE benchmark::benchmark_main)

# Per-compiler diagnostic ergonomics: bail out after 2 errors and force
# colored output (useful when the compiler writes to a pipe, e.g. ninja).
if((CMAKE_CXX_COMPILER_ID MATCHES "Clang") OR (CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM"))
target_compile_options(${PROJECT_NAME} PRIVATE -ferror-limit=2 -fcolor-diagnostics)
elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
# -fconcepts-diagnostics-depth raises the detail level of failed-concept
# errors; -fverbose-asm annotates the .s files produced by -save-temps.
target_compile_options(
${PROJECT_NAME} PRIVATE -fmax-errors=2 -fconcepts-diagnostics-depth=4
-fno-semantic-interposition -fdiagnostics-color=always -fverbose-asm
)
endif()

# Target the host CPU so the benchmarks exercise the widest SIMD the machine
# supports; optionally prefer full 512-bit vectors where available.
if(ENABLE_NATIVE_COMPILATION)
  if(CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")
    # icpx spelling of "target the build machine".
    target_compile_options(${PROJECT_NAME} PRIVATE -xhost)
    if(ENABLE_WIDE_VECTORS)
      # icpx avoids zmm registers by default; opt in to full 512-bit usage.
      target_compile_options(${PROJECT_NAME} PRIVATE -qopt-zmm-usage=high)
    endif()
  else()
    target_compile_options(${PROJECT_NAME} PRIVATE -march=native) # -fno-unroll-loops)
    if(ENABLE_WIDE_VECTORS)
      # BUGFIX: check_cxx_compiler_flag() is provided by the
      # CheckCXXCompilerFlag module, which was never included — the call
      # below failed at configure time unless some transitively-included
      # file (e.g. tools.cmake) happened to pull the module in. Including
      # it here is idempotent and makes this block self-sufficient.
      include(CheckCXXCompilerFlag)
      check_cxx_compiler_flag("-mprefer-vector-width=512" VEC512)
      if(VEC512)
        target_compile_options(${PROJECT_NAME} PRIVATE -mprefer-vector-width=512)
      endif()
    endif()
  endif()
endif()
# C++23, with hidden symbol visibility to mirror typical release builds.
set_target_properties(
${PROJECT_NAME}
PROPERTIES CXX_STANDARD 23
CXX_VISIBILITY_PRESET hidden
VISIBILITY_INLINES_HIDDEN ON
)
# NOTE(review): ENVIRONMENT is a *test* property (consumed by CTest), not an
# executable-target property — setting it here on the benchmark executable
# appears to be a harmless no-op. Confirm intent; if the goal was to control
# the run directory of the benchmarks, this does not achieve it.
set_target_properties(
${PROJECT_NAME} PROPERTIES ENVIRONMENT WORKING_DIRECTORY=${PROJECT_BINARY_DIR}
)

# Benchmark compilation flags:
# - no exceptions/RTTI, to match the library's expected deployment.
# - a hand-picked fast-math subset (associative/finite/unsafe, no signed
#   zeros, no trapping) WITHOUT -ffast-math itself, so subnormals keep
#   their default handling.
# - -save-temps keeps the generated .s files for inspecting vectorization.
# NOTE(review): -Werror is hardcoded rather than guarded behind an option;
# fine for an internal benchmark target, but it will break builds on newer
# compilers that add warnings.
target_compile_options(
${PROJECT_NAME}
PRIVATE -fno-exceptions
-fno-rtti
-fstrict-aliasing
-fno-plt
-fstrict-overflow
-fomit-frame-pointer
-fno-signed-zeros
-fassociative-math
-ffinite-math-only
-funsafe-math-optimizations
-fno-trapping-math
-Wall
-Wshadow
-Wextra
-save-temps
-Werror
)
# OpenMP compile flags: full OpenMP when requested (-fiopenmp is the
# IntelLLVM spelling), otherwise only `#pragma omp simd` support so the
# SIMD pragmas in the sources still apply without threading.
if(ENABLE_OPENMP)
if(CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")
target_compile_options(${PROJECT_NAME} PRIVATE -fiopenmp)
else()
target_compile_options(${PROJECT_NAME} PRIVATE -fopenmp)
endif()
else()
target_compile_options(${PROJECT_NAME} PRIVATE -fopenmp-simd)
endif()
# Intel assembly syntax for the .s files produced by -save-temps (x86 only).
if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
target_compile_options(${PROJECT_NAME} PRIVATE -masm=intel)
endif()
195 changes: 195 additions & 0 deletions benchmark/dual_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@

#include "include/randdual.hpp"
#include <Math/Array.hpp>
#include <Math/Dual.hpp>
#include <Math/LinearAlgebra.hpp>
#include <Math/Matrix.hpp>
#include <Math/StaticArrays.hpp>
#include <Utilities/Invariant.hpp>
#include <algorithm>
#include <array>
#include <benchmark/benchmark.h>
#include <concepts>
#include <cstdint>
#include <random>
#include <ranges>

using poly::math::Dual, poly::math::SquareMatrix, poly::math::SquareDims,
poly::math::I, poly::math::URand;

// Benchmark kernel: store the product of `lhs` and `rhs` into `dst`.
// Marked noinline so the multiply cannot be folded into (or hoisted out of)
// the calling benchmark loop; each call measures a real product.
[[gnu::noinline]] void prod(auto &dst, const auto &lhs, const auto &rhs) {
  dst = lhs * rhs;
}

// Benchmark the library's nested Dual product with 8 inner and 2 outer
// partials, on one pair of operands drawn from a fixed-seed RNG.
// DoNotOptimize keeps the result (and thus the multiply) alive.
static void BM_dual8x2prod(benchmark::State &state) {
std::mt19937_64 rng0;
using D = Dual<Dual<double, 8>, 2>;
D a = URand<D>{}(rng0), b = URand<D>{}(rng0), c;
for (auto _ : state) {
prod(c, a, b);
benchmark::DoNotOptimize(c);
}
}
BENCHMARK(BM_dual8x2prod);

// Hand-rolled dual-number type (value + vector of partials), used as a
// baseline to compare against poly::math::Dual in the benchmarks below.
//
// The third parameter chooses the partials storage for floating-point T:
//   false -> poly::simd::Vec   (raw SIMD vector)
//   true  -> poly::math::StaticArray (SIMD-backed array type)
// For non-floating-point T (the outer layer of a dual-of-dual) the primary
// template stores partials in an SVector regardless of the flag.
//
// FIX: the primary template's parameter was misspelled `SIMDArry`; renamed
// to `SIMDArray` to match its spelling everywhere else (e.g. setup_manual).
// A template parameter name is purely local, so this changes no interface.
template <typename T, ptrdiff_t N, bool SIMDArray = false> struct ManualDual {
  T value;
  poly::math::SVector<T, N> partials;
};

// Floating-point leaf with partials in a bare SIMD vector.
template <std::floating_point T, ptrdiff_t N> struct ManualDual<T, N, false> {
  T value;
  poly::simd::Vec<N, T> partials;
};
// Floating-point leaf with partials in a StaticArray.
template <std::floating_point T, ptrdiff_t N> struct ManualDual<T, N, true> {
  T value;
  poly::math::StaticArray<T, 1, N, false> partials;
};
// Arithmetic on ManualDual, implementing the usual dual-number rules:
//   (v, p) * (w, q) = (v*w, v*q + w*p)   (product rule)
//   (v, p) + (w, q) = (v+w, p+q)
// plus the mixed scalar overloads, where a scalar has zero partials.
// All are force-inlined so the benchmarks measure the arithmetic itself,
// not call overhead. Operand order inside each expression is preserved
// exactly, since T itself may be a dual type.
template <typename T, ptrdiff_t N, bool B>
[[gnu::always_inline]] constexpr auto
operator*(const ManualDual<T, N, B> &lhs, const ManualDual<T, N, B> &rhs)
  -> ManualDual<T, N, B> {
  return {lhs.value * rhs.value,
          lhs.value * rhs.partials + rhs.value * lhs.partials};
}
// dual * scalar
template <typename T, ptrdiff_t N, bool B>
[[gnu::always_inline]] constexpr auto operator*(const ManualDual<T, N, B> &lhs,
                                                const T &rhs)
  -> ManualDual<T, N, B> {
  return {lhs.value * rhs, rhs * lhs.partials};
}
// scalar * dual
template <typename T, ptrdiff_t N, bool B>
[[gnu::always_inline]] constexpr auto operator*(const T &lhs,
                                                const ManualDual<T, N, B> &rhs)
  -> ManualDual<T, N, B> {
  return {rhs.value * lhs, lhs * rhs.partials};
}
// dual + dual
template <typename T, ptrdiff_t N, bool B>
[[gnu::always_inline]] constexpr auto
operator+(const ManualDual<T, N, B> &lhs, const ManualDual<T, N, B> &rhs)
  -> ManualDual<T, N, B> {
  return {lhs.value + rhs.value, lhs.partials + rhs.partials};
}
// dual + scalar: partials pass through unchanged
template <typename T, ptrdiff_t N, bool B>
[[gnu::always_inline]] constexpr auto operator+(const ManualDual<T, N, B> &lhs,
                                                const T &rhs)
  -> ManualDual<T, N, B> {
  return {lhs.value + rhs, lhs.partials};
}
// scalar + dual
template <typename T, ptrdiff_t N, bool B>
[[gnu::always_inline]] constexpr auto operator+(const T &lhs,
                                                const ManualDual<T, N, B> &rhs)
  -> ManualDual<T, N, B> {
  return {rhs.value + lhs, rhs.partials};
}
// template <typename T, ptrdiff_t M, ptrdiff_t N>
// [[gnu::noinline]] void prod_manual(ManualDual<ManualDual<T, M>, N> &c,
// const ManualDual<ManualDual<T, M>, N> &a,
// const ManualDual<ManualDual<T, M>, N> &b)
// {
// // return {val * other.val, val * other.partials + other.val * partials};
// c.value= a.value* b.value;
// c.partials = a.value* b.partials+ b.value* a.partials;
// }

template <ptrdiff_t M, ptrdiff_t N, bool SIMDArray> auto setup_manual() {
using D = ManualDual<ManualDual<double, M, SIMDArray>, N>;
std::mt19937_64 rng0;
D a{}, b{}, c{};
a.value.value = URand<double>{}(rng0);
b.value.value = URand<double>{}(rng0);
for (ptrdiff_t j = 0; j < M; ++j) {
a.value.partials[j] = URand<double>{}(rng0);
b.value.partials[j] = URand<double>{}(rng0);
}
for (ptrdiff_t i = 0; i < N; ++i) {
a.partials[i].value = URand<double>{}(rng0);
b.partials[i].value = URand<double>{}(rng0);
for (ptrdiff_t j = 0; j < M; ++j) {
a.partials[i].partials[j] = URand<double>{}(rng0);
b.partials[i].partials[j] = URand<double>{}(rng0);
}
}
return std::array<D, 3>{a, b, c};
}

// 8x2 product using the hand-rolled ManualDual with raw SIMD-vector
// partials (SIMDArray = false); baseline for BM_dual8x2prod above.
static void BM_dual8x2prod_manual(benchmark::State &state) {
auto [a, b, c] = setup_manual<8, 2, false>();
for (auto _ : state) {
prod(c, a, b);
benchmark::DoNotOptimize(c);
}
}
BENCHMARK(BM_dual8x2prod_manual);

// Same as above, but with StaticArray-backed partials (SIMDArray = true).
static void BM_dual8x2prod_simdarray(benchmark::State &state) {
auto [a, b, c] = setup_manual<8, 2, true>();
for (auto _ : state) {
prod(c, a, b);
benchmark::DoNotOptimize(c);
}
}
BENCHMARK(BM_dual8x2prod_simdarray);

// Library Dual product with 7 inner partials — a non-power-of-two count,
// exercising the partially-filled-SIMD-register path. The static_asserts
// pin compile-time properties of this instantiation (element scalarization
// and compressibility) so a regression fails the build, not the benchmark.
static void BM_dual7x2prod(benchmark::State &state) {
std::mt19937_64 rng0;
using D = Dual<Dual<double, 7>, 2>;
static_assert(std::same_as<double, poly::math::scalarize_elt_cast_t<
Dual<Dual<double, 7, true>, 2, true>>>);
// static_assert(sizeof(D) == sizeof(Dual<Dual<double, 8>, 2>));
static_assert(poly::utils::Compressible<D>);
D a = URand<D>{}(rng0), b = URand<D>{}(rng0), c;
for (auto _ : state) {
prod(c, a, b);
benchmark::DoNotOptimize(c);
}
}
BENCHMARK(BM_dual7x2prod);

// 7x2 product with the hand-rolled ManualDual, SIMD-vector partials.
static void BM_dual7x2prod_manual(benchmark::State &state) {
auto [a, b, c] = setup_manual<7, 2, false>();
for (auto _ : state) {
prod(c, a, b);
benchmark::DoNotOptimize(c);
}
}
BENCHMARK(BM_dual7x2prod_manual);

// 7x2 product with the hand-rolled ManualDual, StaticArray partials.
static void BM_dual7x2prod_simdarray(benchmark::State &state) {
auto [a, b, c] = setup_manual<7, 2, true>();
for (auto _ : state) {
prod(c, a, b);
benchmark::DoNotOptimize(c);
}
}
BENCHMARK(BM_dual7x2prod_simdarray);

// Library Dual product with 6 inner partials (another non-power-of-two
// size). Compressibility is asserted at compile time as for the 7x2 case.
static void BM_dual6x2prod(benchmark::State &state) {
std::mt19937_64 rng0;
using D = Dual<Dual<double, 6>, 2>;
// static_assert(sizeof(D) == sizeof(Dual<Dual<double, 8>, 2>));
static_assert(poly::utils::Compressible<D>);
D a = URand<D>{}(rng0), b = URand<D>{}(rng0), c;
for (auto _ : state) {
prod(c, a, b);
benchmark::DoNotOptimize(c);
}
}
BENCHMARK(BM_dual6x2prod);

// 6x2 product with the hand-rolled ManualDual, SIMD-vector partials.
static void BM_dual6x2prod_manual(benchmark::State &state) {
auto [a, b, c] = setup_manual<6, 2, false>();
for (auto _ : state) {
prod(c, a, b);
benchmark::DoNotOptimize(c);
}
}
BENCHMARK(BM_dual6x2prod_manual);

// 6x2 product with the hand-rolled ManualDual, StaticArray partials.
static void BM_dual6x2prod_simdarray(benchmark::State &state) {
auto [a, b, c] = setup_manual<6, 2, true>();
for (auto _ : state) {
prod(c, a, b);
benchmark::DoNotOptimize(c);
}
}
BENCHMARK(BM_dual6x2prod_simdarray);
Loading

0 comments on commit f332351

Please sign in to comment.