Skip to content

Commit

Permalink
Add Arm SVE 8-wide (256b) implementation (#480)
Browse files Browse the repository at this point in the history
This PR adds support for Arm SVE via a fixed-width 256-bit implementation, as well as extending the fixed-width 128-bit implementation (which is mostly NEON) with a few targeted SVE operations such as native gathers.

Due to the style of 128-bit accumulator we use for floating-point invariance, it's not really possible to write a true vector-length-agnostic SVE implementation, so this implementation is a compile-time choice that will only work on 256b SVE implementations.

On a Neoverse V1 this code is ~30% faster than the equivalent NEON build.
  • Loading branch information
solidpixel authored Aug 5, 2024
1 parent 8bc51bc commit 213d6c2
Show file tree
Hide file tree
Showing 19 changed files with 1,326 additions and 181 deletions.
37 changes: 22 additions & 15 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ include(CTest)
option(ASTCENC_ISA_AVX2 "Enable astcenc builds for AVX2 SIMD")
option(ASTCENC_ISA_SSE41 "Enable astcenc builds for SSE4.1 SIMD")
option(ASTCENC_ISA_SSE2 "Enable astcenc builds for SSE2 SIMD")
option(ASTCENC_ISA_SVE_256 "Enable astcenc builds for 256-bit SVE SIMD")
option(ASTCENC_ISA_NEON "Enable astcenc builds for NEON SIMD")
option(ASTCENC_ISA_NONE "Enable astcenc builds for no SIMD")
option(ASTCENC_ISA_NATIVE "Enable astcenc builds for native SIMD")
Expand Down Expand Up @@ -86,7 +87,7 @@ endforeach()

# Count options which MUST be arm64
set(ASTCENC_ARM64_ISA_COUNT 0)
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NEON})
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_SVE_256})
foreach(ASTCENC_CONFIG ${ASTCENC_CONFIGS})
if(${ASTCENC_CONFIG})
math(EXPR ASTCENC_ARM64_ISA_COUNT "${ASTCENC_ARM64_ISA_COUNT} + 1")
Expand Down Expand Up @@ -117,22 +118,28 @@ if("${ASTCENC_BLOCK_MAX_TEXELS}")
message(STATUS " Max block texels - ${ASTCENC_BLOCK_MAX_TEXELS}")
endif()

printopt("AVX2 backend " ${ASTCENC_ISA_AVX2})
printopt("SSE4.1 backend " ${ASTCENC_ISA_SSE41})
printopt("SSE2 backend " ${ASTCENC_ISA_SSE2})
printopt("NEON backend " ${ASTCENC_ISA_NEON})
printopt("NONE backend " ${ASTCENC_ISA_NONE})
printopt("NATIVE backend " ${ASTCENC_ISA_NATIVE})
message(STATUS "Arm backend options")
printopt("SVE 256b backend " ${ASTCENC_ISA_SVE_256})
printopt("NEON backend " ${ASTCENC_ISA_NEON})
message(STATUS "x86-64 backend options")
printopt("AVX2 backend " ${ASTCENC_ISA_AVX2})
printopt("SSE4.1 backend " ${ASTCENC_ISA_SSE41})
printopt("SSE2 backend " ${ASTCENC_ISA_SSE2})
message(STATUS "Agnostic backend options")
printopt("NONE backend " ${ASTCENC_ISA_NONE})
printopt("NATIVE backend " ${ASTCENC_ISA_NATIVE})
message(STATUS "Build options")
if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
printopt("Universal bin " ${ASTCENC_UNIVERSAL_BUILD})
printopt("Universal bin " ${ASTCENC_UNIVERSAL_BUILD})
endif()
printopt("Invariance " ${ASTCENC_INVARIANCE})
printopt("Shared libs " ${ASTCENC_SHAREDLIB})
printopt("Decompressor " ${ASTCENC_DECOMPRESSOR})
printopt("Diagnostics " ${ASTCENC_DIAGNOSTICS})
printopt("ASAN " ${ASTCENC_ASAN})
printopt("UBSAN " ${ASTCENC_UBSAN})
printopt("Unit tests " ${ASTCENC_UNITTEST})
printopt("Invariance " ${ASTCENC_INVARIANCE})
printopt("Shared libs " ${ASTCENC_SHAREDLIB})
printopt("Decompressor " ${ASTCENC_DECOMPRESSOR})
message(STATUS "Developer options")
printopt("Diagnostics " ${ASTCENC_DIAGNOSTICS})
printopt("ASAN " ${ASTCENC_ASAN})
printopt("UBSAN " ${ASTCENC_UBSAN})
printopt("Unit tests " ${ASTCENC_UNITTEST})

# Subcomponents
add_subdirectory(Source)
Expand Down
8 changes: 5 additions & 3 deletions Source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ else()
set(ASTCENC_CODEC enc)
endif()

set(ASTCENC_ARTIFACTS native none neon avx2 sse4.1 sse2)
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
set(ASTCENC_ARTIFACTS native none sve_256 neon avx2 sse4.1 sse2)
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_SVE_256} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN)
math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1")

Expand All @@ -38,7 +38,9 @@ foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN})
if(${ASTCENC_CONFIG})
set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT})

if(${ASTCENC_ISA_SIMD} MATCHES "neon")
if(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
# Not supported on macOS
elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
set(CMAKE_OSX_ARCHITECTURES arm64)
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
set(CMAKE_OSX_ARCHITECTURES x86_64)
Expand Down
8 changes: 5 additions & 3 deletions Source/UnitTest/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
# under the License.
# ----------------------------------------------------------------------------

set(ASTCENC_ARTIFACTS native none neon avx2 sse4.1 sse2)
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
set(ASTCENC_ARTIFACTS native none sve_256 neon avx2 sse4.1 sse2)
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_SVE_256} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN)
math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1")

Expand All @@ -26,7 +26,9 @@ foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN})
if(${ASTCENC_CONFIG})
set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT})

if(${ASTCENC_ISA_SIMD} MATCHES "neon")
if(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
# Not supported on macOS
elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
set(CMAKE_OSX_ARCHITECTURES arm64)
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
set(CMAKE_OSX_ARCHITECTURES x86_64)
Expand Down
22 changes: 21 additions & 1 deletion Source/UnitTest/cmake_core.cmake
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# ----------------------------------------------------------------------------
# Copyright 2020-2023 Arm Limited
# Copyright 2020-2024 Arm Limited
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
Expand Down Expand Up @@ -72,6 +72,7 @@ if(${ASTCENC_ISA_SIMD} MATCHES "none")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SVE=0
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
Expand All @@ -81,15 +82,32 @@ elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=1
ASTCENC_SVE=0
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)

elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
# 256-bit SVE unit-test configuration. ASTCENC_SVE=8 selects the 8-wide
# SVE backend; ASTCENC_NEON stays at 1 and all x86 feature macros are
# forced to 0.
# NOTE(review): NEON presumably remains enabled because the 4-wide vector
# types are still NEON-based in an SVE build - confirm against the SIMD
# headers.
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=1
ASTCENC_SVE=8
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)

# Enable SVE
# -msve-vector-bits=256 fixes the SVE vector length at compile time, so the
# resulting binary targets 256-bit SVE implementations only.
target_compile_options(${ASTCENC_TEST}
PRIVATE
-march=armv8-a+sve -msve-vector-bits=256)

elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SVE=0
ASTCENC_SSE=20
ASTCENC_AVX=0
ASTCENC_POPCNT=0
Expand All @@ -103,6 +121,7 @@ elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SVE=0
ASTCENC_SSE=41
ASTCENC_AVX=0
ASTCENC_POPCNT=1
Expand All @@ -116,6 +135,7 @@ elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SVE=0
ASTCENC_SSE=41
ASTCENC_AVX=2
ASTCENC_POPCNT=1
Expand Down
104 changes: 26 additions & 78 deletions Source/UnitTest/test_simd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ TEST(vfloat, ChangeSign)
vfloat4 a(-1.0f, 1.0f, -3.12f, 3.12f);
vfloat4 b(-1.0f, -1.0f, 3.12f, 3.12f);
vfloat4 r = change_sign(a, b);

EXPECT_EQ(r.lane<0>(), 1.0f);
EXPECT_EQ(r.lane<1>(), -1.0f);
EXPECT_EQ(r.lane<2>(), -3.12f);
Expand All @@ -205,6 +206,7 @@ TEST(vfloat, Atan)
{
vfloat4 a(-0.15f, 0.0f, 0.9f, 2.1f);
vfloat4 r = atan(a);

EXPECT_NEAR(r.lane<0>(), -0.149061f, 0.005f);
EXPECT_NEAR(r.lane<1>(), 0.000000f, 0.005f);
EXPECT_NEAR(r.lane<2>(), 0.733616f, 0.005f);
Expand All @@ -217,6 +219,7 @@ TEST(vfloat, Atan2)
vfloat4 a(-0.15f, 0.0f, 0.9f, 2.1f);
vfloat4 b(1.15f, -3.0f, -0.9f, 1.1f);
vfloat4 r = atan2(a, b);

EXPECT_NEAR(r.lane<0>(), -0.129816f, 0.005f);
EXPECT_NEAR(r.lane<1>(), 3.141592f, 0.005f);
EXPECT_NEAR(r.lane<2>(), 2.360342f, 0.005f);
Expand Down Expand Up @@ -909,31 +912,6 @@ TEST(vfloat4, select)
EXPECT_EQ(r2.lane<3>(), 4.0f);
}

/** @brief Test vfloat4 select MSB only. */
TEST(vfloat4, select_msb)
{
// Build a condition whose lanes 0 and 2 have only the top bit set, and
// whose lanes 1 and 3 are all zero
int msb_set = static_cast<int>(0x80000000);
vint4 msb(msb_set, 0, msb_set, 0);
vmask4 cond(msb.m);

vfloat4 a(1.0f, 3.0f, 3.0f, 1.0f);
vfloat4 b(4.0f, 2.0f, 2.0f, 4.0f);

// Select in one direction
// Lanes with the MSB set (0 and 2) are expected to come from the second
// argument, the remaining lanes from the first
vfloat4 r1 = select_msb(a, b, cond);
EXPECT_EQ(r1.lane<0>(), 4.0f);
EXPECT_EQ(r1.lane<1>(), 3.0f);
EXPECT_EQ(r1.lane<2>(), 2.0f);
EXPECT_EQ(r1.lane<3>(), 1.0f);

// Select in the other
// Same condition with the operands swapped
vfloat4 r2 = select_msb(b, a, cond);
EXPECT_EQ(r2.lane<0>(), 1.0f);
EXPECT_EQ(r2.lane<1>(), 2.0f);
EXPECT_EQ(r2.lane<2>(), 3.0f);
EXPECT_EQ(r2.lane<3>(), 4.0f);
}

/** @brief Test vfloat4 gatherf. */
TEST(vfloat4, gatherf)
{
Expand Down Expand Up @@ -1839,12 +1817,17 @@ TEST(vint4, store_lanes_masked_unaligned)
EXPECT_TRUE(all(result3v == expect3v));
}

/** @brief Test vint4 pack_low_bytes. */
TEST(vint4, pack_low_bytes)
/** @brief Test vint4 pack_and_store_low_bytes. */
TEST(vint4, pack_and_store_low_bytes)
{
vint4 a(1, 2, 3, 4);
vint4 r = pack_low_bytes(a);
EXPECT_EQ(r.lane<0>(), (4 << 24) | (3 << 16) | (2 << 8) | (1 << 0));
uint8_t bytes[4] { 0 };
pack_and_store_low_bytes(a, bytes);

EXPECT_EQ(bytes[0], 1);
EXPECT_EQ(bytes[1], 2);
EXPECT_EQ(bytes[2], 3);
EXPECT_EQ(bytes[3], 4);
}

/** @brief Test vint4 select. */
Expand Down Expand Up @@ -2711,46 +2694,6 @@ TEST(vfloat8, select)
EXPECT_EQ(ra[7], 4.0f);
}

/** @brief Test vfloat8 select MSB only. */
// NOTE(review): despite the test name and brief, both calls below use
// select() rather than select_msb(), so this exercises the plain select path
// with an MSB-only condition - confirm whether that is intentional.
TEST(vfloat8, select_msb)
{
// Build a condition whose even lanes have only the top bit set, and whose
// odd lanes are all zero
int msb_set = static_cast<int>(0x80000000);
vint8 msb = vint8_lit(msb_set, 0, msb_set, 0, msb_set, 0, msb_set, 0);
vmask8 cond(msb.m);

vfloat8 a = vfloat8_lit(1.0f, 3.0f, 3.0f, 1.0f, 1.0f, 3.0f, 3.0f, 1.0f);
vfloat8 b = vfloat8_lit(4.0f, 2.0f, 2.0f, 4.0f, 4.0f, 2.0f, 2.0f, 4.0f);

// Select in one direction
// Even lanes (MSB set) are expected to come from the second argument, odd
// lanes from the first
vfloat8 r1 = select(a, b, cond);

// Spill the result to a 32-byte aligned array so each lane can be checked
alignas(32) float ra[8];
storea(r1, ra);

EXPECT_EQ(ra[0], 4.0f);
EXPECT_EQ(ra[1], 3.0f);
EXPECT_EQ(ra[2], 2.0f);
EXPECT_EQ(ra[3], 1.0f);
EXPECT_EQ(ra[4], 4.0f);
EXPECT_EQ(ra[5], 3.0f);
EXPECT_EQ(ra[6], 2.0f);
EXPECT_EQ(ra[7], 1.0f);

// Select in the other
// Same condition with the operands swapped
vfloat8 r2 = select(b, a, cond);

storea(r2, ra);

EXPECT_EQ(ra[0], 1.0f);
EXPECT_EQ(ra[1], 2.0f);
EXPECT_EQ(ra[2], 3.0f);
EXPECT_EQ(ra[3], 4.0f);
EXPECT_EQ(ra[4], 1.0f);
EXPECT_EQ(ra[5], 2.0f);
EXPECT_EQ(ra[6], 3.0f);
EXPECT_EQ(ra[7], 4.0f);
}

/** @brief Test vfloat8 gatherf. */
TEST(vfloat8, gatherf)
{
Expand Down Expand Up @@ -3583,17 +3526,22 @@ TEST(vint8, store_lanes_masked_unaligned)
EXPECT_TRUE(all(result3v == expect3v));
}

/** @brief Test vint8 pack_low_bytes. */
TEST(vint8, pack_low_bytes)
/** @brief Test vint8 pack_and_store_low_bytes. */
TEST(vint8, pack_and_store_low_bytes)
{
vint8 a = vint8_lit(1, 2, 3, 4, 2, 3, 4, 5);
vint8 r = pack_low_bytes(a);

alignas(32) int ra[8];
store(r, ra);

EXPECT_EQ(ra[0], (4 << 24) | (3 << 16) | (2 << 8) | (1 << 0));
EXPECT_EQ(ra[1], (5 << 24) | (4 << 16) | (3 << 8) | (2 << 0));
uint8_t bytes[8] { 0 };

pack_and_store_low_bytes(a, bytes);

EXPECT_EQ(bytes[0], 1);
EXPECT_EQ(bytes[1], 2);
EXPECT_EQ(bytes[2], 3);
EXPECT_EQ(bytes[3], 4);
EXPECT_EQ(bytes[4], 2);
EXPECT_EQ(bytes[5], 3);
EXPECT_EQ(bytes[6], 4);
EXPECT_EQ(bytes[7], 5);
}

/** @brief Test vint8 select. */
Expand Down
6 changes: 2 additions & 4 deletions Source/astcenc_ideal_endpoints_and_weights.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1050,8 +1050,7 @@ void compute_quantized_weights_for_decimation(

// Invert the weight-scaling that was done initially
storea(ixl * rscalev + low_boundv, weight_set_out + i);
vint scn = pack_low_bytes(weight);
store_nbytes(scn, quantized_weight_set + i);
pack_and_store_low_bytes(weight, quantized_weight_set + i);
}
}
else
Expand Down Expand Up @@ -1084,8 +1083,7 @@ void compute_quantized_weights_for_decimation(

// Invert the weight-scaling that was done initially
storea(ixl * rscalev + low_boundv, weight_set_out + i);
vint scn = pack_low_bytes(weight);
store_nbytes(scn, quantized_weight_set + i);
pack_and_store_low_bytes(weight, quantized_weight_set + i);
}
}
}
Expand Down
8 changes: 6 additions & 2 deletions Source/astcenc_mathlib.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,14 @@
#endif
#endif

// Default to SVE disabled when the build does not define ASTCENC_SVE, so the
// macro can always be tested with #if
#ifndef ASTCENC_SVE
#define ASTCENC_SVE 0
#endif

// Force vector-sized SIMD alignment
#if ASTCENC_AVX
#if ASTCENC_AVX || ASTCENC_SVE == 8
#define ASTCENC_VECALIGN 32
#elif ASTCENC_SSE || ASTCENC_NEON
#elif ASTCENC_SSE || ASTCENC_NEON || ASTCENC_SVE == 4
#define ASTCENC_VECALIGN 16
// Use default alignment for non-SIMD builds
#else
Expand Down
Loading

0 comments on commit 213d6c2

Please sign in to comment.