ARM-software · solidpixel · Oct 8, 2024 · Sep 21, 2024 · Sep 21, 2024 · Sep 21, 2024
diff --git a/Docs/ChangeLog-4x.md b/Docs/ChangeLog-4x.md
@@ -11,14 +11,21 @@ clocked at 4.2 GHz, running `astcenc` using AVX2 and 6 threads.
 
 **Status:** In development
 
-The 4.9.0 release is a minor maintenance release.
+The 4.9.0 release is a small release adding support for Arm Scalable Vector
+Extensions SIMD, as well as some minor bug fixes.
 
 * **General:**
   * **Bug fix:** Fixed incorrect return type in "None" vector library
     reference implementation.
   * **Bug fix:** Fixed sincos table index under/overflow.
-  * **Feature:** Added backend for Arm SVE fixed-width 256-bit builds.
-  * **Feature:** Added backend for Arm SVE fixed-width 128-bit builds.
+  * **Feature:** Changed `ASTCENC_ISA_NATIVE` builds to use `-march=native` and
+    `-mcpu=native`.
+  * **Feature:** Added backend for Arm SVE fixed-width 256-bit builds. These
+    can only run on hardware implementing 256-bit SVE.
+  * **Feature:** Added backend for Arm SVE 128-bit builds. These are portable
+    builds and can run on hardware implementing any SVE vector length, but the
+    explicit SVE use is augmented NEON and will only use the bottom 128 bits of
+    each SVE vector.
   * **Feature:** Optimized NEON mask `any()` and `all()` functions.
   * **Feature:** Migrated build and test to GitHub Actions pipelines.
 
@@ -36,8 +43,9 @@ The 4.8.0 release is a minor maintenance release.
     language behavior, to improve support for deployment using Emscripten.
   * **Feature:** Builds using Clang can now build with undefined behavior
     sanitizer by setting `-DASTCENC_UBSAN=ON` on the CMake configure line.
-  * **Feature:** Updated to Wuffs library 0.3.4, which ignores tRNS alpha chunks
-    for type 4 (LA) and 6 (RGBA) PNGs, to improve compatibility with libpng.
+  * **Feature:** Updated to Wuffs library 0.3.4, which ignores tRNS alpha
+    chunks for type 4 (LA) and 6 (RGBA) PNGs, to improve compatibility with
+    libpng.
 
 <!-- ---------------------------------------------------------------------- -->
 ## 4.7.0
@@ -49,8 +57,8 @@ the decompressor to match the Khronos specification. This fix includes the
 addition of explicit support for optimizing for `decode_unorm8` rounding.
 
 Reminder - the codec library API is not designed to be binary compatible across
-versions. We always recommend rebuilding your client-side code using the updated
-`astcenc.h` header.
+versions. We always recommend rebuilding your client-side code using the
+updated `astcenc.h` header.
 
 * **General:**
   * **Bug fix:** sRGB LDR decompression now uses the correct endpoint expansion

diff --git a/Source/UnitTest/cmake_core.cmake b/Source/UnitTest/cmake_core.cmake
@@ -117,7 +117,7 @@ elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_128")
     # Enable SVE
     target_compile_options(${ASTCENC_TEST}
         PRIVATE
-            -march=armv8-a+sve -msve-vector-bits=128)
+            -march=armv8-a+sve)
 
 elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
     target_compile_definitions(${ASTCENC_TEST}

diff --git a/Source/astcenc_mathlib.h b/Source/astcenc_mathlib.h
@@ -74,7 +74,18 @@
 #endif
 
 #ifndef ASTCENC_SVE
-  #define ASTCENC_SVE 0
+  #if defined(__ARM_FEATURE_SVE)
+    #if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
+      #define ASTCENC_SVE 8
+    // Auto-detected SVE can only assume vector width of 4 is available, but
+    // must also allow for hardware being longer and so all use of intrinsics
+    // must explicitly use predicate masks to limit to 4-wide.
+    #else
+      #define ASTCENC_SVE 4
+    #endif
+  #else
+    #define ASTCENC_SVE 0
+  #endif
 #endif
 
 // Force vector-sized SIMD alignment

diff --git a/Source/astcenccli_entry2.cpp b/Source/astcenccli_entry2.cpp
@@ -57,8 +57,10 @@ int astcenc_main_veneer(
 	int argc,
 	char **argv
 ) {
-#if ASTCENC_SVE != 0
-	// svcntw() return compile-time length if used with -msve-vector-bits
+	// We don't need this check for 128-bit SVE, because that is compiled as
+	// VLA code, using predicate masks in the augmented NEON.
+#if ASTCENC_SVE > 4
+	// svcntw() returns compile-time length if used with -msve-vector-bits
 	if (svcntw() != ASTCENC_SVE)
 	{
 		int bits = ASTCENC_SVE * 32;

diff --git a/Source/astcenccli_toplevel_help.cpp b/Source/astcenccli_toplevel_help.cpp
@@ -585,6 +585,14 @@ void astcenc_print_header()
 	unsigned int bits = static_cast<unsigned int>(sizeof(void*) * 8);
 	printf(astcenc_copyright_string,
 	       VERSION_STRING, bits, simdtype, pcnttype, f16ctype, YEAR_STRING);
+
+    // If possible, print hint that 8-wide SVE could be used
+#if ASTCENC_SVE == 4
+    if (svcntw() == 8)
+    {
+        printf("Note: This CPU can support 256-bit SVE builds.\n");
+    }
+#endif
 }
 
 /* See header for documentation. */

diff --git a/Source/cmake_core.cmake b/Source/cmake_core.cmake
@@ -336,10 +336,13 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE)
                 ASTCENC_F16C=0)
 
         # Enable SVE in the core library
+        # Note that for 128-bit SVE the generated code is actually
+        # vector-length agnostic, but any manual intrinsics used in the
+        # enhanced-NEON library use 128-bit data width predicates
         if (NOT ${ASTCENC_VENEER_TYPE})
             target_compile_options(${ASTCENC_TARGET_NAME}
                 PRIVATE
-                    -march=armv8-a+sve -msve-vector-bits=128)
+                    -march=armv8-a+sve)
 
         # Enable SVE without fixed vector length in the veneer
         elseif (${ASTCENC_VENEER_TYPE} EQUAL 2)
@@ -429,6 +432,21 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE)
                     $<${is_gnu_fe}:-mfma>)
         endif()
 
+    elseif(${ASTCENC_ISA_SIMD} MATCHES "native")
+        target_compile_definitions(${ASTCENC_TARGET_NAME}
+            PRIVATE)
+
+        if (${ASTCENC_VENEER_TYPE} GREATER 0)
+            target_compile_options(${ASTCENC_TARGET_NAME}
+                PRIVATE
+                    $<${is_gnu_fe}:-Wno-unused-command-line-argument>)
+        else()
+            target_compile_options(${ASTCENC_TARGET_NAME}
+                PRIVATE
+                    $<${is_clangcl}:-mcpu=native -march=native>
+                    $<${is_gnu_fe}:-mcpu=native -march=native>
+                    $<${is_gnu_fe}:-Wno-unused-command-line-argument>)
+        endif()
     endif()
 
 endmacro()