riscv-software-src · aswaterman · Jun 20, 2023 · May 31, 2023 · Jun 2, 2023 · Jun 19, 2023
diff --git a/riscv/arith.h b/riscv/arith.h
@@ -7,6 +7,7 @@
 #include <cstdint>
 #include <climits>
 #include <cstddef>
+#include <type_traits>
 
 inline uint64_t mulhu(uint64_t a, uint64_t b)
 {
@@ -221,4 +222,24 @@ static inline uint64_t xperm(uint64_t rs1, uint64_t rs2, size_t sz_log2, size_t
   return r;
 }
 
+// Rotates the unsigned integer x right by shiftamt bits; only the low bits of shiftamt (shiftamt & mask) are used.
+template <typename T>
+static inline T rotate_right(T x, std::size_t shiftamt) {
+  static_assert(std::is_unsigned<T>::value);
+  static constexpr T mask = (8 * sizeof(T)) - 1;  // 8 * sizeof(T) - 1: bit width minus one for 8-bit bytes
+  const std::size_t rshift = shiftamt & mask;  // masked, so never larger than mask
+  const std::size_t lshift = (-rshift) & mask;  // also masked; evaluates to 0 when rshift is 0
+  return (x << lshift) | (x >> rshift);
+}
+
+// Rotates the unsigned integer x left by shiftamt bits; only the low bits of shiftamt (shiftamt & mask) are used.
+template <typename T>
+static inline T rotate_left(T x, std::size_t shiftamt) {
+  static_assert(std::is_unsigned<T>::value);
+  static constexpr T mask = (8 * sizeof(T)) - 1;  // 8 * sizeof(T) - 1: bit width minus one for 8-bit bytes
+  const std::size_t lshift = shiftamt & mask;  // masked, so never larger than mask
+  const std::size_t rshift = (-lshift) & mask;  // also masked; evaluates to 0 when lshift is 0
+  return (x << lshift) | (x >> rshift);
+}
+
 #endif
diff --git a/riscv/decode.h b/riscv/decode.h
@@ -140,6 +140,7 @@ class insn_t
   uint64_t v_vta() { return x(26, 1); }
   uint64_t v_vma() { return x(27, 1); }
   uint64_t v_mew() { return x(28, 1); }
+  uint64_t v_zimm6() { return x(15, 5) + (x(26, 1) << 5); }  // presumably a 6-bit immediate: x(15, 5) as the low bits, x(26, 1) moved to bit 5 - confirm against x()
 
   uint64_t p_imm2() { return x(20, 2); }
   uint64_t p_imm3() { return x(20, 3); }

diff --git a/riscv/insns/sm4_common.h b/riscv/insns/sm4_common.h
@@ -24,4 +24,3 @@ static const uint8_t sm4_sbox[256] = {
 	0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E,
 	0xD7, 0xCB, 0x39, 0x48
 };
-
diff --git a/riscv/insns/vaesdf_vs.h b/riscv/insns/vaesdf_vs.h
@@ -0,0 +1,43 @@
+// vaesdf.vs vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},  // NOTE(review): empty first macro argument; its purpose is set by the macro definition (not visible here).
+  // PRELOOP argument: this statement is executed once, before the first
+  // iteration of the loop, and only if the loop is going to be entered.
+  // It cannot be wrapped in a block ( { ... } ) because the variable it
+  // declares must remain visible inside the loop block below.
+  // The "scalar" round key is element group 0 of vs2. It is captured by
+  // copy, even though the "no overlap" constraint means that vs2 should
+  // remain constant during the loop.
+  const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+  {
+    // For AES128, AES192, or AES256, state and key are 128b/16B values:
+    //  - vd contains the input state,
+    //  - element group 0 of vs2 contains the round key (scalar_key),
+    //  - vd receives the output state.
+    //
+    // While the spec calls for handling the vector as made of EGU32x4
+    // element groups (i.e., 4 uint32_t), it is convenient to treat
+    // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+    // we extract the operands here instead of using the existing LOOP
+    // macro that defines/extracts the operand variables as EGU32x4.
+    EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+
+    // InvShiftRows - Rotate the bytes of each row by 0, 1, 2, 3 positions.
+    VAES_INV_SHIFT_ROWS(aes_state);
+    // InvSubBytes - Apply the inverse S-box to every byte in the state.
+    VAES_INV_SUB_BYTES(aes_state);
+    // AddRoundKey (its own inverse, since it is an XOR with the round key).
+    EGU8x16_XOREQ(aes_state, scalar_key);
+    // InvMixColumns is not performed in the final round.
+
+    // Write the result back to the destination element group of vd.
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    EGU8x16_COPY(vd, aes_state);
+  }
+);
diff --git a/riscv/insns/vaesdf_vv.h b/riscv/insns/vaesdf_vv.h
@@ -0,0 +1,37 @@
+// vaesdf.vv vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vv_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},  // NOTE(review): empty first macro argument; its purpose is set by the macro definition (not visible here).
+  {},  // No PRELOOP.
+  {
+    // For AES128, AES192, or AES256, state and key are 128b/16B values:
+    //  - vd contains the input state,
+    //  - vs2 contains the round key (one per element group),
+    //  - vd receives the output state.
+    //
+    // While the spec calls for handling the vector as made of EGU32x4
+    // element groups (i.e., 4 uint32_t), it is convenient to treat
+    // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+    // we extract the operands here instead of using the existing LOOP
+    // macro that defines/extracts the operand variables as EGU32x4.
+    EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+    const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg);
+
+    // InvShiftRows - Rotate the bytes of each row by 0, 1, 2, 3 positions.
+    VAES_INV_SHIFT_ROWS(aes_state);
+    // InvSubBytes - Apply the inverse S-box to every byte in the state.
+    VAES_INV_SUB_BYTES(aes_state);
+    // AddRoundKey (its own inverse, since it is an XOR with the round key).
+    EGU8x16_XOREQ(aes_state, round_key);
+    // InvMixColumns is not performed in the final round.
+
+    // Write the result back to the destination element group of vd.
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    EGU8x16_COPY(vd, aes_state);
+  }
+);
diff --git a/riscv/insns/vaesdm_vs.h b/riscv/insns/vaesdm_vs.h
@@ -0,0 +1,44 @@
+// vaesdm.vs vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},  // NOTE(review): empty first macro argument; its purpose is set by the macro definition (not visible here).
+  // PRELOOP argument: this statement is executed once, before the first
+  // iteration of the loop, and only if the loop is going to be entered.
+  // It cannot be wrapped in a block ( { ... } ) because the variable it
+  // declares must remain visible inside the loop block below.
+  // The "scalar" round key is element group 0 of vs2. It is captured by
+  // copy, even though the "no overlap" constraint means that vs2 should
+  // remain constant during the loop.
+  const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+  {
+    // For AES128, AES192, or AES256, state and key are 128b/16B values:
+    //  - vd contains the input state,
+    //  - element group 0 of vs2 contains the round key (scalar_key),
+    //  - vd receives the output state.
+    //
+    // While the spec calls for handling the vector as made of EGU32x4
+    // element groups (i.e., 4 uint32_t), it is convenient to treat
+    // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+    // we extract the operands here instead of using the existing LOOP
+    // macro that defines/extracts the operand variables as EGU32x4.
+    EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+
+    // InvShiftRows - Rotate the bytes of each row by 0, 1, 2, 3 positions.
+    VAES_INV_SHIFT_ROWS(aes_state);
+    // InvSubBytes - Apply the inverse S-box to every byte in the state.
+    VAES_INV_SUB_BYTES(aes_state);
+    // AddRoundKey (its own inverse, since it is an XOR with the round key).
+    EGU8x16_XOREQ(aes_state, scalar_key);
+    // InvMixColumns - applied in middle rounds, after the round key is added.
+    VAES_INV_MIX_COLUMNS(aes_state);
+
+    // Write the result back to the destination element group of vd.
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    EGU8x16_COPY(vd, aes_state);
+  }
+);
diff --git a/riscv/insns/vaesdm_vv.h b/riscv/insns/vaesdm_vv.h
@@ -0,0 +1,38 @@
+// vaesdm.vv vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vv_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},  // NOTE(review): empty first macro argument; its purpose is set by the macro definition (not visible here).
+  {},  // No PRELOOP.
+  {
+    // For AES128, AES192, or AES256, state and key are 128b/16B values:
+    //  - vd contains the input state,
+    //  - vs2 contains the round key (one per element group),
+    //  - vd receives the output state.
+    //
+    // While the spec calls for handling the vector as made of EGU32x4
+    // element groups (i.e., 4 uint32_t), it is convenient to treat
+    // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+    // we extract the operands here instead of using the existing LOOP
+    // macro that defines/extracts the operand variables as EGU32x4.
+    EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+    const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg);
+
+    // InvShiftRows - Rotate the bytes of each row by 0, 1, 2, 3 positions.
+    VAES_INV_SHIFT_ROWS(aes_state);
+    // InvSubBytes - Apply the inverse S-box to every byte in the state.
+    VAES_INV_SUB_BYTES(aes_state);
+    // AddRoundKey (its own inverse, since it is an XOR with the round key).
+    EGU8x16_XOREQ(aes_state, round_key);
+    // InvMixColumns - applied in middle rounds, after the round key is added.
+    VAES_INV_MIX_COLUMNS(aes_state);
+
+    // Write the result back to the destination element group of vd.
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    EGU8x16_COPY(vd, aes_state);
+  }
+);
diff --git a/riscv/insns/vaesef_vs.h b/riscv/insns/vaesef_vs.h
@@ -0,0 +1,43 @@
+// vaesef.vs vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},  // NOTE(review): empty first macro argument; its purpose is set by the macro definition (not visible here).
+  // PRELOOP argument: this statement is executed once, before the first
+  // iteration of the loop, and only if the loop is going to be entered.
+  // It cannot be wrapped in a block ( { ... } ) because the variable it
+  // declares must remain visible inside the loop block below.
+  // The "scalar" round key is element group 0 of vs2. It is captured by
+  // copy, even though the "no overlap" constraint means that vs2 should
+  // remain constant during the loop.
+  const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+  {
+    // For AES128, AES192, or AES256, state and key are 128b/16B values:
+    //  - vd contains the input state,
+    //  - element group 0 of vs2 contains the round key (scalar_key),
+    //  - vd receives the output state.
+    //
+    // While the spec calls for handling the vector as made of EGU32x4
+    // element groups (i.e., 4 uint32_t), it is convenient to treat
+    // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+    // we extract the operands here instead of using the existing LOOP
+    // macro that defines/extracts the operand variables as EGU32x4.
+    EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+
+    // SubBytes - Apply the S-box to every byte in the state.
+    VAES_SUB_BYTES(aes_state);
+    // ShiftRows - Rotate the bytes of each row by 0, 1, 2, 3 positions.
+    VAES_SHIFT_ROWS(aes_state);
+    // MixColumns is not performed for the final round.
+    // AddRoundKey - XOR the state with the round key.
+    EGU8x16_XOREQ(aes_state, scalar_key);
+
+    // Write the result back to the destination element group of vd.
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    EGU8x16_COPY(vd, aes_state);
+  }
+);
diff --git a/riscv/insns/vaesef_vv.h b/riscv/insns/vaesef_vv.h
@@ -0,0 +1,37 @@
+// vaesef.vv vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vv_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},  // NOTE(review): empty first macro argument; its purpose is set by the macro definition (not visible here).
+  {},  // No PRELOOP.
+  {
+    // For AES128, AES192, or AES256, state and key are 128b/16B values:
+    //  - vd contains the input state,
+    //  - vs2 contains the round key (one per element group),
+    //  - vd receives the output state.
+    //
+    // While the spec calls for handling the vector as made of EGU32x4
+    // element groups (i.e., 4 uint32_t), it is convenient to treat
+    // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+    // we extract the operands here instead of using the existing LOOP
+    // macro that defines/extracts the operand variables as EGU32x4.
+    EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+    const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg);
+
+    // SubBytes - Apply the S-box to every byte in the state.
+    VAES_SUB_BYTES(aes_state);
+    // ShiftRows - Rotate the bytes of each row by 0, 1, 2, 3 positions.
+    VAES_SHIFT_ROWS(aes_state);
+    // MixColumns is not performed for the final round.
+    // AddRoundKey - XOR the state with the round key.
+    EGU8x16_XOREQ(aes_state, round_key);
+
+    // Write the result back to the destination element group of vd.
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    EGU8x16_COPY(vd, aes_state);
+  }
+);
diff --git a/riscv/insns/vaesem_vs.h b/riscv/insns/vaesem_vs.h
@@ -0,0 +1,44 @@
+// vaesem.vs vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},  // NOTE(review): empty first macro argument; its purpose is set by the macro definition (not visible here).
+  // PRELOOP argument: this statement is executed once, before the first
+  // iteration of the loop, and only if the loop is going to be entered.
+  // It cannot be wrapped in a block ( { ... } ) because the variable it
+  // declares must remain visible inside the loop block below.
+  // The "scalar" round key is element group 0 of vs2. It is captured by
+  // copy, even though the "no overlap" constraint means that vs2 should
+  // remain constant during the loop.
+  const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+  {
+    // For AES128, AES192, or AES256, state and key are 128b/16B values:
+    //  - vd contains the input state,
+    //  - element group 0 of vs2 contains the round key (scalar_key),
+    //  - vd receives the output state.
+    //
+    // While the spec calls for handling the vector as made of EGU32x4
+    // element groups (i.e., 4 uint32_t), it is convenient to treat
+    // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+    // we extract the operands here instead of using the existing LOOP
+    // macro that defines/extracts the operand variables as EGU32x4.
+    EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+
+    // SubBytes - Apply the S-box to every byte in the state.
+    VAES_SUB_BYTES(aes_state);
+    // ShiftRows - Rotate the bytes of each row by 0, 1, 2, 3 positions.
+    VAES_SHIFT_ROWS(aes_state);
+    // MixColumns - applied in middle rounds, before the round key is added.
+    VAES_MIX_COLUMNS(aes_state);
+    // AddRoundKey - XOR the state with the round key.
+    EGU8x16_XOREQ(aes_state, scalar_key);
+
+    // Write the result back to the destination element group of vd.
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    EGU8x16_COPY(vd, aes_state);
+  }
+);
diff --git a/riscv/insns/vaesem_vv.h b/riscv/insns/vaesem_vv.h
@@ -0,0 +1,38 @@
+// vaesem.vv vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vv_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},  // NOTE(review): empty first macro argument; its purpose is set by the macro definition (not visible here).
+  {},  // No PRELOOP.
+  {
+    // For AES128, AES192, or AES256, state and key are 128b/16B values:
+    //  - vd contains the input state,
+    //  - vs2 contains the round key (one per element group),
+    //  - vd receives the output state.
+    //
+    // While the spec calls for handling the vector as made of EGU32x4
+    // element groups (i.e., 4 uint32_t), it is convenient to treat
+    // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+    // we extract the operands here instead of using the existing LOOP
+    // macro that defines/extracts the operand variables as EGU32x4.
+    EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+    const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg);
+
+    // SubBytes - Apply the S-box to every byte in the state.
+    VAES_SUB_BYTES(aes_state);
+    // ShiftRows - Rotate the bytes of each row by 0, 1, 2, 3 positions.
+    VAES_SHIFT_ROWS(aes_state);
+    // MixColumns - applied in middle rounds, before the round key is added.
+    VAES_MIX_COLUMNS(aes_state);
+    // AddRoundKey - XOR the state with the round key.
+    EGU8x16_XOREQ(aes_state, round_key);
+
+    // Write the result back to the destination element group of vd.
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    EGU8x16_COPY(vd, aes_state);
+  }
+);