Skip to content

Commit

Permalink
Zvk: Implement Zvksed, vector SM4 Block Cipher
Browse files Browse the repository at this point in the history
Implement the Zvksed sub-extension, "ShangMi Suite: SM4 Block Cipher":
 - vsm4k.vi, vector SM4 key expansion,
 - vsm4r.{vs,vv}, vector SM4 rounds.

This also introduces a header for common vector SM4 logic.

Co-authored-by: Raghav Gupta <[email protected]>
Co-authored-by: Albert Jakieła <[email protected]>
Signed-off-by: Eric Gouriou <[email protected]>
  • Loading branch information
3 people committed May 2, 2023
1 parent 6d3a082 commit 6878fcd
Show file tree
Hide file tree
Showing 6 changed files with 258 additions and 1 deletion.
1 change: 0 additions & 1 deletion riscv/insns/sm4_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,3 @@ static const uint8_t sm4_sbox[256] = {
0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E,
0xD7, 0xCB, 0x39, 0x48
};

73 changes: 73 additions & 0 deletions riscv/insns/vsm4k_vi.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// vsm4k.vi vd, vs2, round#
//
// Vector SM4 key expansion. For each 4 x 32-bit element group, the four
// words of vs2 are taken as the four most recent round keys and the next
// four round keys are derived from them and stored in vd. The group of
// four CK constants used is selected by the low 3 bits of the immediate.

#include "zvksed_ext_macros.h"

// Uncomment to enable debug logging of invocations of this instruction.
//#define DLOG_INVOCATION

#if defined(DLOG_INVOCATION)
#define DLOG(...) ZVK_DBG_LOG(__VA_ARGS__)
// Print format/value for both state and round key element groups.
#define PRIxEG PRIxEGU32x4_LE
#define PRVEG(X) PRVEGU32x4_LE(X)
// Print format/value for "v<reg_num>(<Element Group in Hex, Little Endian>)"
// This instruction operates on 4 x 32-bit element groups, so the 4-word
// (EGU32x4) formatters are used, matching vsm4r_vs.h.
#define PRI_uR_xEG PRI_uREG_xEGU32x4
#define PRV_R_EG(reg_num, reg) PRV_REG_EGU32x4_LE(reg_num, reg)
#else
#define DLOG(...) (void)(0)
#endif

// SM4 Constant Key (CK) - section 7.3.2. of the IETF draft.
// 32 words, one per SM4 round; indexed below as 4 * round + {0, 1, 2, 3}.
static constexpr uint32_t zvksed_ck[32] = {
0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269,
0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9,
0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249,
0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9,
0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229,
0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299,
0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209,
0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
};

require_vsm4_constraints;

VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(
{
DLOG("-- vsm4k_vi " ZVK_PRI_REGNUMS_VD_VS2_ZIMM5,
ZVK_PRV_REGNUMS_VD_VS2_ZIMM5);
},
// The following statements will be executed before the first execution
// of the loop, and only if the loop is going to be entered.
// We cannot use a block ( { ... } ) since we want the 'round' variable
// declared and defined here to be visible in the loop block.
// Only consider the bottom 3 bits of the immediate, ensuring that
// 'round' is in the valid range [0, 7] and that every zvksed_ck index
// used below (at most 4 * 7 + 3 = 31) stays inside the table.
const reg_t round = zimm5 & 0x7;
if (round != zimm5) {
DLOG("vsm4k: zimm5 %" PRIuREG " => round %" PRIuREG, zimm5, round);
},
{
// {rk0, rk1, rk2, rk3} <- vs2
EXTRACT_EGU32x4_WORDS_LE(vs2, rk0, rk1, rk2, rk3);

// Each new round key is the key four positions back xor'ed with the
// transformed xor of the three keys in between and the round constant.
// The four steps are sequential: each one consumes the keys produced
// by the previous steps.
uint32_t B = rk1 ^ rk2 ^ rk3 ^ zvksed_ck[4 * round];
uint32_t S = ZVKSED_SUB_BYTES(B);
uint32_t rk4 = ZVKSED_ROUND_KEY(rk0, S);

B = rk2 ^ rk3 ^ rk4 ^ zvksed_ck[4 * round + 1];
S = ZVKSED_SUB_BYTES(B);
uint32_t rk5 = ZVKSED_ROUND_KEY(rk1, S);

B = rk3 ^ rk4 ^ rk5 ^ zvksed_ck[4 * round + 2];
S = ZVKSED_SUB_BYTES(B);
uint32_t rk6 = ZVKSED_ROUND_KEY(rk2, S);

B = rk4 ^ rk5 ^ rk6 ^ zvksed_ck[4 * round + 3];
S = ZVKSED_SUB_BYTES(B);
uint32_t rk7 = ZVKSED_ROUND_KEY(rk3, S);

// Update the destination register.
SET_EGU32x4_LE(vd, rk4, rk5, rk6, rk7);
DLOG("= vsm4k_vi %" PRIxEG, PRVEG(vd));
}
);
74 changes: 74 additions & 0 deletions riscv/insns/vsm4r_vs.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
// vsm4r.vs vd, vs2
//
// Vector SM4 rounds, vector-scalar form. Every 4 x 32-bit element group of
// vd is advanced by four SM4 rounds, all using the same four round keys
// taken from element group 0 of vs2.

#include "zvksed_ext_macros.h"

// Uncomment to enable debug logging of invocations of this instruction.
//#define DLOG_INVOCATION

#if defined(DLOG_INVOCATION)
#define DLOG(...) ZVK_DBG_LOG(__VA_ARGS__)
// Print format/value for both state and round key element groups.
#define PRIxEG PRIxEGU32x4_LE
#define PRVEG(X) PRVEGU32x4_LE(X)
// Print format/value for "v<reg_num>(<Element Group in Hex, Little Endian>)"
#define PRI_uR_xEG PRI_uREG_xEGU32x4
#define PRV_R_EG(reg_num, reg) PRV_REG_EGU32x4_LE(reg_num, reg)
#else
#define DLOG(...) (void)(0)
#endif

require_vsm4_constraints;
// No overlap of vd and vs2.
// NOTE(review): this compares only the two register numbers. Confirm
// whether a vd register group spanning several registers (LMUL > 1) that
// contains vs2 must also be rejected here.
require(insn.rd() != insn.rs2());

VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
{
DLOG("vsm4r_vs vd#(%" PRIuREG ") vs2#(%" PRIuREG ")"
" vstart_eg(%" PRIuREG ") vl_eg(%" PRIuREG ")",
vd_num, vs2_num, vstart_eg, vl_eg);
},
// This statement will be executed before the first execution
// of the loop, and only if the loop is going to be entered.
// We cannot use a block ( { ... } ) since we want the variables declared
// here to be visible in the loop block.
// We capture the "scalar", vs2's first element, by copy, even though
// the "no overlap" constraint means that vs2 should remain constant
// during the loop.
const EGU32x4_t scalar_key = P.VU.elt_group<EGU32x4_t>(vs2_num, 0);
const uint32_t rk0 = scalar_key[0];
const uint32_t rk1 = scalar_key[1];
const uint32_t rk2 = scalar_key[2];
const uint32_t rk3 = scalar_key[3];,
{
// Reference to the destination element group being processed; it is both
// the input state and the place where the result is written.
EGU32x4_t &state = P.VU.elt_group<EGU32x4_t>(vd_num, idx_eg, true);

DLOG("vsm4r_vs " PRI_uR_xEG " " PRI_uR_xEG,
PRV_R_EG(vd_num, state), PRV_R_EG(vs2_num, scalar_key));

// {x0, x1, x2, x3} <- vd
EXTRACT_EGU32x4_WORDS_LE(state, x0, x1, x2, x3);

uint32_t B;
uint32_t S;

// Four sequential rounds: each new word depends on the words produced by
// the previous rounds, so the order of these steps must be kept.
B = x1 ^ x2 ^ x3 ^ rk0;
S = ZVKSED_SUB_BYTES(B);
const uint32_t x4 = ZVKSED_ROUND(x0, S);

B = x2 ^ x3 ^ x4 ^ rk1;
S = ZVKSED_SUB_BYTES(B);
const uint32_t x5 = ZVKSED_ROUND(x1, S);

B = x3 ^ x4 ^ x5 ^ rk2;
S = ZVKSED_SUB_BYTES(B);
const uint32_t x6 = ZVKSED_ROUND(x2, S);

B = x4 ^ x5 ^ x6 ^ rk3;
S = ZVKSED_SUB_BYTES(B);
const uint32_t x7 = ZVKSED_ROUND(x3, S);

// Update the destination register.
SET_EGU32x4_LE(state, x4, x5, x6, x7);
DLOG("= vsm4r_vs v%" PRIuREG " <- (%" PRIxEG ")", vd_num, PRVEG(state));
}
);
60 changes: 60 additions & 0 deletions riscv/insns/vsm4r_vv.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// vsm4r.vv vd, vs2
//
// Vector SM4 rounds, vector-vector form. Every 4 x 32-bit element group of
// vd is advanced by four SM4 rounds, using the four round keys held in the
// corresponding element group of vs2.

#include "zvksed_ext_macros.h"

// Uncomment to enable debug logging of invocations of this instruction.
//#define DLOG_INVOCATION

#if defined(DLOG_INVOCATION)
#define DLOG(...) ZVK_DBG_LOG(__VA_ARGS__)
// Print format/value for both state and round key element groups.
#define PRIxEG PRIxEGU32x4_LE
#define PRVEG(X) PRVEGU32x4_LE(X)
// Print format/value for "v<reg_num>(<Element Group in Hex, Little Endian>)"
// vd and vs2 are 4 x 32-bit element groups here, so the 4-word (EGU32x4)
// formatters are required; the 8-word ones would format words that are not
// part of the group. This matches vsm4r_vs.h.
#define PRI_uR_xEG PRI_uREG_xEGU32x4
#define PRV_R_EG(reg_num, reg) PRV_REG_EGU32x4_LE(reg_num, reg)
#else
#define DLOG(...) (void)(0)
#endif

require_vsm4_constraints;

VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(
{
DLOG("vsm4r_vv vd#(%" PRIuREG ") vs2#(%" PRIuREG ")"
" vstart_eg(%" PRIuREG ") vl_eg(%" PRIuREG ")",
vd_num, vs2_num, vstart_eg, vl_eg);
},
{
DLOG("vsm4r_vv " PRI_uR_xEG " " PRI_uR_xEG,
PRV_R_EG(vd_num, vd), PRV_R_EG(vs2_num, vs2));

// {x0, x1, x2, x3} <- vd
EXTRACT_EGU32x4_WORDS_LE(vd, x0, x1, x2, x3);
// {rk0, rk1, rk2, rk3} <- vs2
EXTRACT_EGU32x4_WORDS_LE(vs2, rk0, rk1, rk2, rk3);

uint32_t B;
uint32_t S;

// Four sequential rounds: each new word depends on the words produced by
// the previous rounds, so the order of these steps must be kept.
B = x1 ^ x2 ^ x3 ^ rk0;
S = ZVKSED_SUB_BYTES(B);
const uint32_t x4 = ZVKSED_ROUND(x0, S);

B = x2 ^ x3 ^ x4 ^ rk1;
S = ZVKSED_SUB_BYTES(B);
const uint32_t x5 = ZVKSED_ROUND(x1, S);

B = x3 ^ x4 ^ x5 ^ rk2;
S = ZVKSED_SUB_BYTES(B);
const uint32_t x6 = ZVKSED_ROUND(x2, S);

B = x4 ^ x5 ^ x6 ^ rk3;
S = ZVKSED_SUB_BYTES(B);
const uint32_t x7 = ZVKSED_ROUND(x3, S);

// Update the destination register.
SET_EGU32x4_LE(vd, x4, x5, x6, x7);
DLOG("= vsm4r_vv v%" PRIuREG " <- (%" PRIxEG ")", vd_num, PRVEG(vd));
}
);
6 changes: 6 additions & 0 deletions riscv/riscv.mk.in
Original file line number Diff line number Diff line change
Expand Up @@ -1362,12 +1362,18 @@ riscv_insn_ext_zvknh = \
vsha2ch_vv \
vsha2ms_vv \

# Zvksed (vector SM4 block cipher): key expansion and round instructions.
riscv_insn_ext_zvksed = \
vsm4k_vi \
vsm4r_vs \
vsm4r_vv \

# Aggregate of all the Zvk sub-extension instruction lists defined above.
riscv_insn_ext_zvk = \
$(riscv_insn_ext_zvbb) \
$(riscv_insn_ext_zvbc) \
$(riscv_insn_ext_zvkg) \
$(riscv_insn_ext_zvkned) \
$(riscv_insn_ext_zvknh) \
$(riscv_insn_ext_zvksed) \

# Note that riscv_insn_ext_p and riscv_insn_ext_zvk contain instructions
# that have conflicting encodings. They cannot be both included concurrently.
Expand Down
45 changes: 45 additions & 0 deletions riscv/zvksed_ext_macros.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// Helper macros and functions to help implement instructions defined as part of
// the RISC-V Zvksed extension (vectorized SM4).

#include "insns/sm4_common.h"
#include "zvk_ext_macros.h"

#ifndef RISCV_ZVKSED_MACROS_H_
#define RISCV_ZVKSED_MACROS_H_

// Constraints common to all vsm4* instructions:
// - Zvksed is enabled
// - VSEW == 32
// - EGW (128) <= LMUL * VLEN
//
// The constraint that vstart and vl are both EGS (4) aligned
// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
//
// Wrapped in do { ... } while (false) so that the macro expands to a single
// statement and must be followed by a semicolon at the point of use.
#define require_vsm4_constraints \
do { \
require_zvksed; \
require(P.VU.vsew == 32); \
require_egw_fits(128); \
} while (false)

// Get byte BYTE of the SBox, i.e. look up sm4_sbox at index BYTE.
// sm4_sbox has 256 entries, so BYTE must be in the range [0, 255].
#define ZVKSED_SBOX(BYTE) (sm4_sbox[(BYTE)])

// Apply the nonlinear transformation tau to a 32 bit word B - section 6.2.1.
// of the IETF draft.
// Each of the four bytes of B (as numbered by EXTRACT_U8) is replaced by
// its SBox value, and the four results are reassembled in the same order by
// U32_FROM_U8_LE. Both helpers are defined outside this file.
// B is expanded four times, so it must not have side effects.
#define ZVKSED_SUB_BYTES(B) \
U32_FROM_U8_LE(ZVKSED_SBOX(EXTRACT_U8((B), 0)), \
ZVKSED_SBOX(EXTRACT_U8((B), 1)), \
ZVKSED_SBOX(EXTRACT_U8((B), 2)), \
ZVKSED_SBOX(EXTRACT_U8((B), 3)))

// Data-round linear step (transformation L, section 6.2.2. of the IETF
// draft): xor the 32 bit word X with S and with S rotated left by 2, 10, 18
// and 24 bits. S is expanded several times, so it must not have side effects.
#define ZVKSED_ROUND(X, S) \
((X) ^ (S) ^ ROL32((S), 2) ^ ROL32((S), 10) ^ \
ROL32((S), 18) ^ ROL32((S), 24))

// Key-schedule linear step (transformation L', section 6.2.2. of the IETF
// draft): xor the 32 bit word X with S and with S rotated left by 13 and 23
// bits. S is expanded several times, so it must not have side effects.
#define ZVKSED_ROUND_KEY(X, S) \
((X) ^ (S) ^ ROL32((S), 13) ^ ROL32((S), 23))

#endif // RISCV_ZVKSED_MACROS_H_

0 comments on commit 6878fcd

Please sign in to comment.