From 960755b19263cb2924dd2906482600b4ad99e21f Mon Sep 17 00:00:00 2001
From: Jiajie Chen <c@jia.je>
Date: Mon, 13 Mar 2023 12:09:08 +0800
Subject: [PATCH 001/110] Implement pending bits for plic

---
 riscv/devices.h |  1 +
 riscv/plic.cc   | 22 +++++++++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/riscv/devices.h b/riscv/devices.h
index 0b12f00f44..02d9e98068 100644
--- a/riscv/devices.h
+++ b/riscv/devices.h
@@ -114,6 +114,7 @@ class plic_t : public abstract_device_t, public abstract_interrupt_controller_t
   uint32_t context_claim(plic_context_t *c);
   bool priority_read(reg_t offset, uint32_t *val);
   bool priority_write(reg_t offset, uint32_t val);
+  bool pending_read(reg_t offset, uint32_t *val);
   bool context_enable_read(const plic_context_t *context,
                            reg_t offset, uint32_t *val);
   bool context_enable_write(plic_context_t *context,
diff --git a/riscv/plic.cc b/riscv/plic.cc
index aeec229a57..37a5f53ba0 100644
--- a/riscv/plic.cc
+++ b/riscv/plic.cc
@@ -48,6 +48,9 @@
 #define PRIORITY_BASE           0
 #define PRIORITY_PER_ID         4
 
+/* Each interrupt source has a pending bit associated with it. */
+#define PENDING_BASE            0x1000
+
 /*
  * Each hart context has a vector of interupt enable bits associated with it.
  * There's one bit for each interrupt source.
@@ -156,6 +159,21 @@ bool plic_t::priority_write(reg_t offset, uint32_t val)
   return true;
 }
 
+bool plic_t::pending_read(reg_t offset, uint32_t *val)
+{
+  uint32_t id_word = (offset >> 2);
+
+  if (id_word < num_ids_word) {
+    *val = 0;
+    for (auto context: contexts) {
+        *val |= context.pending[id_word];
+    }
+  } else
+    *val = 0;
+
+  return true;
+}
+
 bool plic_t::context_enable_read(const plic_context_t *c,
                                  reg_t offset, uint32_t *val)
 {
@@ -313,8 +331,10 @@ bool plic_t::load(reg_t addr, size_t len, uint8_t* bytes)
       return false;
   }
 
-  if (PRIORITY_BASE <= addr && addr < ENABLE_BASE) {
+  if (PRIORITY_BASE <= addr && addr < PENDING_BASE) {
     ret = priority_read(addr, &val);
+  } else if (PENDING_BASE <= addr && addr < ENABLE_BASE) {
+    ret = pending_read(addr - PENDING_BASE, &val);
   } else if (ENABLE_BASE <= addr && addr < CONTEXT_BASE) {
     uint32_t cntx = (addr - ENABLE_BASE) / ENABLE_PER_HART;
     addr -= cntx * ENABLE_PER_HART + ENABLE_BASE;

From 5afa62e541b67a5ebd03a8783858484afc7920e1 Mon Sep 17 00:00:00 2001
From: Yinan Xu <xuyinan@ict.ac.cn>
Date: Thu, 20 Apr 2023 16:47:04 +0800
Subject: [PATCH 002/110] Close file descriptors in destructor of syscall_t

---
 fesvr/syscall.cc | 13 ++++++++++---
 fesvr/syscall.h  |  2 ++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/fesvr/syscall.cc b/fesvr/syscall.cc
index 875ffb7321..e277be19bf 100644
--- a/fesvr/syscall.cc
+++ b/fesvr/syscall.cc
@@ -174,9 +174,16 @@ syscall_t::syscall_t(htif_t* htif)
   if (stdin_fd < 0 || stdout_fd0 < 0 || stdout_fd1 < 0)
     throw std::runtime_error("could not dup stdin/stdout");
 
-  fds.alloc(stdin_fd); // stdin -> stdin
-  fds.alloc(stdout_fd0); // stdout -> stdout
-  fds.alloc(stdout_fd1); // stderr -> stdout
+  fds_index.push_back(fds.alloc(stdin_fd)); // stdin -> stdin
+  fds_index.push_back(fds.alloc(stdout_fd0)); // stdout -> stdout
+  fds_index.push_back(fds.alloc(stdout_fd1)); // stderr -> stdout
+}
+
+syscall_t::~syscall_t() {
+  for (auto i: fds_index) {
+    close(fds.lookup(i));
+    fds.dealloc(i);
+  }
 }
 
 std::string syscall_t::do_chroot(const char* fn)
diff --git a/fesvr/syscall.h b/fesvr/syscall.h
index 4915efd689..c002e6c66e 100644
--- a/fesvr/syscall.h
+++ b/fesvr/syscall.h
@@ -28,6 +28,7 @@ class syscall_t : public device_t
 {
  public:
   syscall_t(htif_t*);
+  ~syscall_t();
 
   void set_chroot(const char* where);
   
@@ -38,6 +39,7 @@ class syscall_t : public device_t
   memif_t* memif;
   std::vector<syscall_func_t> table;
   fds_t fds;
+  std::vector<reg_t> fds_index;
 
   void handle_syscall(command_t cmd);
   void dispatch(addr_t mm);

From 6023896b0a4d8bf101b210b155721d74a2a06b0c Mon Sep 17 00:00:00 2001
From: Parshintsev Anatoly <anatoly.parshintsev@syntacore.com>
Date: Mon, 24 Apr 2023 21:36:36 +0300
Subject: [PATCH 003/110] fixup sb_write/sb_read to handle exceptions properly

System bus read/write operations can raise a variety of
memory-related exceptions. Before this patch, not every memory
exception was handled, which could crash the simulator: for
example, when a debugger (like OpenOCD) issues a non-aligned
memory read.

Signed-off-by: Parshintsev Anatoly <anatoly.parshintsev@syntacore.com>
---
 riscv/debug_module.cc | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/riscv/debug_module.cc b/riscv/debug_module.cc
index 27dbe66ecb..9018ccf544 100644
--- a/riscv/debug_module.cc
+++ b/riscv/debug_module.cc
@@ -314,7 +314,7 @@ void debug_module_t::sb_read()
     } else {
       sbcs.error = 3;
     }
-  } catch (trap_load_access_fault& t) {
+  } catch (const mem_trap_t& ) {
     sbcs.error = 2;
   }
 }
@@ -323,17 +323,21 @@ void debug_module_t::sb_write()
 {
   reg_t address = ((uint64_t) sbaddress[1] << 32) | sbaddress[0];
   D(fprintf(stderr, "sb_write() 0x%x @ 0x%lx\n", sbdata[0], address));
-  if (sbcs.sbaccess == 0 && config.max_sba_data_width >= 8) {
-    sim->debug_mmu->store<uint8_t>(address, sbdata[0]);
-  } else if (sbcs.sbaccess == 1 && config.max_sba_data_width >= 16) {
-    sim->debug_mmu->store<uint16_t>(address, sbdata[0]);
-  } else if (sbcs.sbaccess == 2 && config.max_sba_data_width >= 32) {
-    sim->debug_mmu->store<uint32_t>(address, sbdata[0]);
-  } else if (sbcs.sbaccess == 3 && config.max_sba_data_width >= 64) {
-    sim->debug_mmu->store<uint64_t>(address,
-        (((uint64_t) sbdata[1]) << 32) | sbdata[0]);
-  } else {
-    sbcs.error = 3;
+  try {
+    if (sbcs.sbaccess == 0 && config.max_sba_data_width >= 8) {
+      sim->debug_mmu->store<uint8_t>(address, sbdata[0]);
+    } else if (sbcs.sbaccess == 1 && config.max_sba_data_width >= 16) {
+      sim->debug_mmu->store<uint16_t>(address, sbdata[0]);
+    } else if (sbcs.sbaccess == 2 && config.max_sba_data_width >= 32) {
+      sim->debug_mmu->store<uint32_t>(address, sbdata[0]);
+    } else if (sbcs.sbaccess == 3 && config.max_sba_data_width >= 64) {
+      sim->debug_mmu->store<uint64_t>(address,
+          (((uint64_t) sbdata[1]) << 32) | sbdata[0]);
+    } else {
+      sbcs.error = 3;
+    }
+  } catch (const mem_trap_t& ) {
+    sbcs.error = 2;
   }
 }
 

From 83c19daaf202e101e2a79109de4118a02f9d90a7 Mon Sep 17 00:00:00 2001
From: YenHaoChen <howard25336284@gmail.com>
Date: Wed, 12 Apr 2023 15:35:43 +0800
Subject: [PATCH 004/110] triggers: native triggers (action=0) should prevent
 causing a breakpoint exception while already in a trap handler

This commit implements Debug Specification Section 5.4 Native Triggers.

The specification allows two solutions for solving the reentrancy
problem. This commit chooses the first solution because the second one
targets implementations without S-mode.
---
 riscv/triggers.cc | 21 ++++++++++++++++++---
 riscv/triggers.h  |  1 +
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/riscv/triggers.cc b/riscv/triggers.cc
index 07cd6bf765..86dcc81a11 100644
--- a/riscv/triggers.cc
+++ b/riscv/triggers.cc
@@ -106,6 +106,21 @@ bool trigger_t::textra_match(processor_t * const proc) const noexcept
   return true;
 }
 
+bool trigger_t::allow_action(const state_t * const state) const
+{
+  if (get_action() == ACTION_DEBUG_EXCEPTION) {
+    const bool mstatus_mie = state->mstatus->read() & MSTATUS_MIE;
+    const bool sstatus_sie = state->sstatus->read() & MSTATUS_SIE;
+    const bool vsstatus_sie = state->vsstatus->read() & MSTATUS_SIE;
+    const bool medeleg_breakpoint = (state->medeleg->read() >> CAUSE_BREAKPOINT) & 1;
+    const bool hedeleg_breakpoint = (state->hedeleg->read() >> CAUSE_BREAKPOINT) & 1;
+    return (state->prv != PRV_M || mstatus_mie) &&
+           (state->prv != PRV_S || state->v || !medeleg_breakpoint || sstatus_sie) &&
+           (state->prv != PRV_S || !state->v || !medeleg_breakpoint || !hedeleg_breakpoint || vsstatus_sie);
+  }
+  return true;
+}
+
 reg_t disabled_trigger_t::tdata1_read(const processor_t * const proc) const noexcept
 {
   auto xlen = proc->get_xlen();
@@ -212,7 +227,7 @@ std::optional<match_result_t> mcontrol_common_t::detect_memory_access_match(proc
     value &= 0xffffffff;
   }
 
-  if (simple_match(xlen, value)) {
+  if (simple_match(xlen, value) && allow_action(proc->get_state())) {
     /* This is OK because this function is only called if the trigger was not
      * inhibited by the previous trigger in the chain. */
     hit = true;
@@ -289,7 +304,7 @@ void mcontrol6_t::tdata1_write(processor_t * const proc, const reg_t val, const
 
 std::optional<match_result_t> icount_t::detect_icount_match(processor_t * const proc) noexcept
 {
-  if (!common_match(proc))
+  if (!common_match(proc) || !allow_action(proc->get_state()))
     return std::nullopt;
 
   std::optional<match_result_t> ret = std::nullopt;
@@ -389,7 +404,7 @@ std::optional<match_result_t> trap_common_t::detect_trap_match(processor_t * con
   bool interrupt = (t.cause() & ((reg_t)1 << (xlen - 1))) != 0;
   reg_t bit = t.cause() & ~((reg_t)1 << (xlen - 1));
   assert(bit < xlen);
-  if (simple_match(interrupt, bit)) {
+  if (simple_match(interrupt, bit) && allow_action(proc->get_state())) {
     hit = true;
     return match_result_t(TIMING_AFTER, action);
   }
diff --git a/riscv/triggers.h b/riscv/triggers.h
index 0c9dabc55b..6e3d74d8c1 100644
--- a/riscv/triggers.h
+++ b/riscv/triggers.h
@@ -90,6 +90,7 @@ class trigger_t {
 protected:
   static action_t legalize_action(reg_t val, reg_t action_mask, reg_t dmode_mask) noexcept;
   bool common_match(processor_t * const proc) const noexcept;
+  bool allow_action(const state_t * const state) const;
   reg_t tdata2;
 
   bool vs = false;

From e25fb13bcd2fb2b108c3fe1d94831229adeb7e37 Mon Sep 17 00:00:00 2001
From: Philipp Tomsich <philipp.tomsich@vrull.eu>
Date: Tue, 9 May 2023 17:32:05 +0200
Subject: [PATCH 005/110] Zfa: fix NX handling for the fround/froundnx family

The initial implementation (together with the SAIL code and the tests)
had gotten the NX variants backwards (as in 'an inexact result is ok'
vs. 'set NX if inexact').  Update all 8 instructions (4 fround, 4 froundnx).

Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
---
 riscv/insns/fround_d.h   | 2 +-
 riscv/insns/fround_h.h   | 2 +-
 riscv/insns/fround_q.h   | 2 +-
 riscv/insns/fround_s.h   | 2 +-
 riscv/insns/froundnx_d.h | 2 +-
 riscv/insns/froundnx_h.h | 2 +-
 riscv/insns/froundnx_q.h | 2 +-
 riscv/insns/froundnx_s.h | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/riscv/insns/fround_d.h b/riscv/insns/fround_d.h
index 1127135093..0e8a1ba6d2 100644
--- a/riscv/insns/fround_d.h
+++ b/riscv/insns/fround_d.h
@@ -1,5 +1,5 @@
 require_extension('D');
 require_extension(EXT_ZFA);
 require_fp;
-WRITE_FRD_D(f64_roundToInt(FRS1_D, RM, true));
+WRITE_FRD_D(f64_roundToInt(FRS1_D, RM, false));
 set_fp_exceptions;
diff --git a/riscv/insns/fround_h.h b/riscv/insns/fround_h.h
index 6417a39c6a..0c6cdae4c3 100644
--- a/riscv/insns/fround_h.h
+++ b/riscv/insns/fround_h.h
@@ -1,5 +1,5 @@
 require_extension(EXT_ZFH);
 require_extension(EXT_ZFA);
 require_fp;
-WRITE_FRD_H(f16_roundToInt(FRS1_H, RM, true));
+WRITE_FRD_H(f16_roundToInt(FRS1_H, RM, false));
 set_fp_exceptions;
diff --git a/riscv/insns/fround_q.h b/riscv/insns/fround_q.h
index 51ebce2378..91bab77e27 100644
--- a/riscv/insns/fround_q.h
+++ b/riscv/insns/fround_q.h
@@ -1,5 +1,5 @@
 require_extension('Q');
 require_extension(EXT_ZFA);
 require_fp;
-WRITE_FRD(f128_roundToInt(f128(FRS1), RM, true));
+WRITE_FRD(f128_roundToInt(f128(FRS1), RM, false));
 set_fp_exceptions;
diff --git a/riscv/insns/fround_s.h b/riscv/insns/fround_s.h
index 272897ed13..f6e75f5d3e 100644
--- a/riscv/insns/fround_s.h
+++ b/riscv/insns/fround_s.h
@@ -1,5 +1,5 @@
 require_extension('F');
 require_extension(EXT_ZFA);
 require_fp;
-WRITE_FRD_F(f32_roundToInt(FRS1_F, RM, true));
+WRITE_FRD_F(f32_roundToInt(FRS1_F, RM, false));
 set_fp_exceptions;
diff --git a/riscv/insns/froundnx_d.h b/riscv/insns/froundnx_d.h
index 0e8a1ba6d2..1127135093 100644
--- a/riscv/insns/froundnx_d.h
+++ b/riscv/insns/froundnx_d.h
@@ -1,5 +1,5 @@
 require_extension('D');
 require_extension(EXT_ZFA);
 require_fp;
-WRITE_FRD_D(f64_roundToInt(FRS1_D, RM, false));
+WRITE_FRD_D(f64_roundToInt(FRS1_D, RM, true));
 set_fp_exceptions;
diff --git a/riscv/insns/froundnx_h.h b/riscv/insns/froundnx_h.h
index 0c6cdae4c3..6417a39c6a 100644
--- a/riscv/insns/froundnx_h.h
+++ b/riscv/insns/froundnx_h.h
@@ -1,5 +1,5 @@
 require_extension(EXT_ZFH);
 require_extension(EXT_ZFA);
 require_fp;
-WRITE_FRD_H(f16_roundToInt(FRS1_H, RM, false));
+WRITE_FRD_H(f16_roundToInt(FRS1_H, RM, true));
 set_fp_exceptions;
diff --git a/riscv/insns/froundnx_q.h b/riscv/insns/froundnx_q.h
index 91bab77e27..51ebce2378 100644
--- a/riscv/insns/froundnx_q.h
+++ b/riscv/insns/froundnx_q.h
@@ -1,5 +1,5 @@
 require_extension('Q');
 require_extension(EXT_ZFA);
 require_fp;
-WRITE_FRD(f128_roundToInt(f128(FRS1), RM, false));
+WRITE_FRD(f128_roundToInt(f128(FRS1), RM, true));
 set_fp_exceptions;
diff --git a/riscv/insns/froundnx_s.h b/riscv/insns/froundnx_s.h
index f6e75f5d3e..272897ed13 100644
--- a/riscv/insns/froundnx_s.h
+++ b/riscv/insns/froundnx_s.h
@@ -1,5 +1,5 @@
 require_extension('F');
 require_extension(EXT_ZFA);
 require_fp;
-WRITE_FRD_F(f32_roundToInt(FRS1_F, RM, false));
+WRITE_FRD_F(f32_roundToInt(FRS1_F, RM, true));
 set_fp_exceptions;

From 9be9108b47fcf3df066b9cab1d4e986b83764fab Mon Sep 17 00:00:00 2001
From: Philipp Tomsich <philipp.tomsich@vrull.eu>
Date: Tue, 9 May 2023 21:50:31 +0200
Subject: [PATCH 006/110] Zfa: fli.q requires 'Q' not 'D'

Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
---
 riscv/insns/fli_q.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/riscv/insns/fli_q.h b/riscv/insns/fli_q.h
index 7ba569bf07..2ee9be23d0 100644
--- a/riscv/insns/fli_q.h
+++ b/riscv/insns/fli_q.h
@@ -1,4 +1,4 @@
-require_extension('D');
+require_extension('Q');
 require_extension(EXT_ZFA);
 require_fp;
 {

From 8ab77e8836d51851df9d227248bbec7c142730be Mon Sep 17 00:00:00 2001
From: Philipp Tomsich <philipp.tomsich@vrull.eu>
Date: Tue, 9 May 2023 21:51:56 +0200
Subject: [PATCH 007/110] Zfa: fix bitpatterns for fli.q (entries 7, 18-25)

Ref #1327

Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
---
 riscv/insns/fli_q.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/riscv/insns/fli_q.h b/riscv/insns/fli_q.h
index 2ee9be23d0..24cce41d0a 100644
--- a/riscv/insns/fli_q.h
+++ b/riscv/insns/fli_q.h
@@ -10,7 +10,7 @@ require_fp;
     [0b00100] = 0x3FF7000000000000ull,  /* 1.0 * 2^-8  */
     [0b00101] = 0x3FF8000000000000ull,  /* 1.0 * 2^-7  */
     [0b00110] = 0x3FFB000000000000ull,  /* 1.0 * 2^-4  */
-    [0b00111] = 0x3FF9000000000000ull,  /* 1.0 * 2^-3  */
+    [0b00111] = 0x3FFC000000000000ull,  /* 1.0 * 2^-3  */
     [0b01000] = 0x3FFD000000000000ull,  /* 0.25 */
     [0b01001] = 0x3FFD400000000000ull,  /* 0.3125 */
     [0b01010] = 0x3FFD800000000000ull,  /* 0.375 */
@@ -21,14 +21,14 @@ require_fp;
     [0b01111] = 0x3FFEC00000000000ull,  /* 0.875 */
     [0b10000] = 0x3FFF000000000000ull,  /* 1.0 */
     [0b10001] = 0x3FFF400000000000ull,  /* 1.25 */
-    [0b10010] = 0x3FFFC00000000000ull,  /* 1.5 */
-    [0b10011] = 0x4000000000000000ull,  /* 1.75 */
-    [0b10100] = 0x4000400000000000ull,  /* 2.0 */
-    [0b10101] = 0x4000800000000000ull,  /* 2.5 */
-    [0b10110] = 0x4001000000000000ull,  /* 3 */
-    [0b10111] = 0x4002000000000000ull,  /* 4 */
-    [0b11000] = 0x4003000000000000ull,  /* 8 */
-    [0b11001] = 0x400D000000000000ull,  /* 16 */
+    [0b10010] = 0x3FFF800000000000ull,  /* 1.5 */
+    [0b10011] = 0x3FFFC00000000000ull,  /* 1.75 */
+    [0b10100] = 0x4000000000000000ull,  /* 2.0 */
+    [0b10101] = 0x4000400000000000ull,  /* 2.5 */
+    [0b10110] = 0x4000800000000000ull,  /* 3 */
+    [0b10111] = 0x4001000000000000ull,  /* 4 */
+    [0b11000] = 0x4002000000000000ull,  /* 8 */
+    [0b11001] = 0x4003000000000000ull,  /* 16 */
     [0b11010] = 0x4006000000000000ull,  /* 2^7 */
     [0b11011] = 0x4007000000000000ull,  /* 2^8 */
     [0b11100] = 0x400E000000000000ull,  /* 2^15 */

From d6731d58d43cbfd239f249803dc928d98fd996ca Mon Sep 17 00:00:00 2001
From: Philipp Tomsich <philipp.tomsich@vrull.eu>
Date: Tue, 9 May 2023 23:33:04 +0200
Subject: [PATCH 008/110] Zfa: fix exception behaviour for fcvtmod.w.d

---
 riscv/insns/fcvtmod_w_d.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/riscv/insns/fcvtmod_w_d.h b/riscv/insns/fcvtmod_w_d.h
index 89b9b9522b..e39400d27e 100644
--- a/riscv/insns/fcvtmod_w_d.h
+++ b/riscv/insns/fcvtmod_w_d.h
@@ -38,12 +38,14 @@ if (exp == 0) {
   } else {
     /* The fraction is shifted out entirely.  */
     frac = 0;
+    inexact = true;
   }
 
-  /* Notice overflow or inexact exceptions.  */
+  /* Handle overflows */
   if (true_exp > 31 || frac > (sign ? 0x80000000ull : 0x7fffffff)) {
     /* Overflow, for which this operation raises invalid.  */
     invalid = true;
+    inexact = false;  /* invalid takes precedence */
   }
 
   /* Honor the sign.  */

From b07f893609df80d7bcaee470d041221bdea8c920 Mon Sep 17 00:00:00 2001
From: Philipp Tomsich <philipp.tomsich@vrull.eu>
Date: Tue, 9 May 2023 23:57:10 +0200
Subject: [PATCH 009/110] Zfa: fix missing set_fp_exceptions for fleq/fltq

---
 riscv/insns/fleq_d.h | 1 +
 riscv/insns/fleq_h.h | 1 +
 riscv/insns/fleq_q.h | 1 +
 riscv/insns/fleq_s.h | 1 +
 riscv/insns/fltq_d.h | 1 +
 riscv/insns/fltq_h.h | 1 +
 riscv/insns/fltq_q.h | 1 +
 riscv/insns/fltq_s.h | 1 +
 8 files changed, 8 insertions(+)

diff --git a/riscv/insns/fleq_d.h b/riscv/insns/fleq_d.h
index 762e147039..5ceb96780f 100644
--- a/riscv/insns/fleq_d.h
+++ b/riscv/insns/fleq_d.h
@@ -2,3 +2,4 @@ require_extension('D');
 require_extension(EXT_ZFA);
 require_fp;
 WRITE_RD(f64_le_quiet(FRS1_D, FRS2_D));
+set_fp_exceptions;
diff --git a/riscv/insns/fleq_h.h b/riscv/insns/fleq_h.h
index 7e6db59ae5..7e6fd26fc8 100644
--- a/riscv/insns/fleq_h.h
+++ b/riscv/insns/fleq_h.h
@@ -2,3 +2,4 @@ require_extension(EXT_ZFH);
 require_extension(EXT_ZFA);
 require_fp;
 WRITE_RD(f16_le_quiet(FRS1_H, FRS2_H));
+set_fp_exceptions;
diff --git a/riscv/insns/fleq_q.h b/riscv/insns/fleq_q.h
index 8533d11d16..f80a32bf59 100644
--- a/riscv/insns/fleq_q.h
+++ b/riscv/insns/fleq_q.h
@@ -2,3 +2,4 @@ require_extension('Q');
 require_extension(EXT_ZFA);
 require_fp;
 WRITE_RD(f128_le_quiet(f128(FRS1), f128(FRS2)));
+set_fp_exceptions;
diff --git a/riscv/insns/fleq_s.h b/riscv/insns/fleq_s.h
index 8c0a909446..e3dc03f189 100644
--- a/riscv/insns/fleq_s.h
+++ b/riscv/insns/fleq_s.h
@@ -2,3 +2,4 @@ require_extension('F');
 require_extension(EXT_ZFA);
 require_fp;
 WRITE_RD(f32_le_quiet(FRS1_F, FRS2_F));
+set_fp_exceptions;
diff --git a/riscv/insns/fltq_d.h b/riscv/insns/fltq_d.h
index c7ec9f112e..7d116d596d 100644
--- a/riscv/insns/fltq_d.h
+++ b/riscv/insns/fltq_d.h
@@ -2,3 +2,4 @@ require_extension('D');
 require_extension(EXT_ZFA);
 require_fp;
 WRITE_RD(f64_lt_quiet(FRS1_D, FRS2_D));
+set_fp_exceptions;
diff --git a/riscv/insns/fltq_h.h b/riscv/insns/fltq_h.h
index 84d880a63c..177e545921 100644
--- a/riscv/insns/fltq_h.h
+++ b/riscv/insns/fltq_h.h
@@ -2,3 +2,4 @@ require_extension(EXT_ZFH);
 require_extension(EXT_ZFA);
 require_fp;
 WRITE_RD(f16_lt_quiet(FRS1_H, FRS2_H));
+set_fp_exceptions;
diff --git a/riscv/insns/fltq_q.h b/riscv/insns/fltq_q.h
index a65ca769b7..208d24869a 100644
--- a/riscv/insns/fltq_q.h
+++ b/riscv/insns/fltq_q.h
@@ -2,3 +2,4 @@ require_extension('Q');
 require_extension(EXT_ZFA);
 require_fp;
 WRITE_RD(f128_lt_quiet(f128(FRS1), f128(FRS2)));
+set_fp_exceptions;
diff --git a/riscv/insns/fltq_s.h b/riscv/insns/fltq_s.h
index 1ee09837b2..b2e1df55bb 100644
--- a/riscv/insns/fltq_s.h
+++ b/riscv/insns/fltq_s.h
@@ -2,3 +2,4 @@ require_extension('F');
 require_extension(EXT_ZFA);
 require_fp;
 WRITE_RD(f32_lt_quiet(FRS1_F, FRS2_F));
+set_fp_exceptions;

From 3286d262eb42f414e7e43c734532f309dee9fe81 Mon Sep 17 00:00:00 2001
From: Ryan Buchner <ryan.buchner@arilinc.com>
Date: Mon, 17 Apr 2023 18:53:27 -0700
Subject: [PATCH 010/110] Rename RISCV_XLATE_VIRT to RISCV_XLATE_FORCED_VIRT

More readable/understandable.
---
 riscv/mmu.cc | 6 +++---
 riscv/mmu.h  | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index be24f40f0c..f6f0a0d2ac 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -66,7 +66,7 @@ reg_t mmu_t::translate(reg_t addr, reg_t len, access_type type, uint32_t xlate_f
       if (get_field(proc->state.mstatus->read(), MSTATUS_MPV) && mode != PRV_M)
         virt = true;
     }
-    if (xlate_flags & RISCV_XLATE_VIRT) {
+    if (xlate_flags & RISCV_XLATE_FORCED_VIRT) {
       virt = true;
       mode = get_field(proc->state.hstatus->read(), HSTATUS_SPVP);
     }
@@ -236,7 +236,7 @@ void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, uint32_t xlate
   if ((addr & (len - 1)) == 0) {
     load_slow_path_intrapage(addr, len, bytes, xlate_flags);
   } else {
-    bool gva = ((proc) ? proc->state.v : false) || (RISCV_XLATE_VIRT & xlate_flags);
+    bool gva = ((proc) ? proc->state.v : false) || (RISCV_XLATE_FORCED_VIRT & xlate_flags);
     if (!is_misaligned_enabled())
       throw trap_load_address_misaligned(gva, addr, 0, 0);
 
@@ -284,7 +284,7 @@ void mmu_t::store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes, uint32_
     check_triggers(triggers::OPERATION_STORE, addr, reg_from_bytes(len, bytes));
 
   if (addr & (len - 1)) {
-    bool gva = ((proc) ? proc->state.v : false) || (RISCV_XLATE_VIRT & xlate_flags);
+    bool gva = ((proc) ? proc->state.v : false) || (RISCV_XLATE_FORCED_VIRT & xlate_flags);
     if (!is_misaligned_enabled())
       throw trap_store_address_misaligned(gva, addr, 0, 0);
 
diff --git a/riscv/mmu.h b/riscv/mmu.h
index ef054cf59c..d63a43f4eb 100644
--- a/riscv/mmu.h
+++ b/riscv/mmu.h
@@ -51,7 +51,7 @@ class mmu_t
   mmu_t(simif_t* sim, endianness_t endianness, processor_t* proc);
   ~mmu_t();
 
-#define RISCV_XLATE_VIRT      (1U << 0)
+#define RISCV_XLATE_FORCED_VIRT (1U << 0)
 #define RISCV_XLATE_VIRT_HLVX (1U << 1)
 #define RISCV_XLATE_LR        (1U << 2)
 
@@ -81,12 +81,12 @@ class mmu_t
 
   template<typename T>
   T guest_load(reg_t addr) {
-    return load<T>(addr, RISCV_XLATE_VIRT);
+    return load<T>(addr, RISCV_XLATE_FORCED_VIRT);
   }
 
   template<typename T>
   T guest_load_x(reg_t addr) {
-    return load<T>(addr, RISCV_XLATE_VIRT|RISCV_XLATE_VIRT_HLVX);
+    return load<T>(addr, RISCV_XLATE_FORCED_VIRT|RISCV_XLATE_VIRT_HLVX);
   }
 
   template<typename T>
@@ -108,7 +108,7 @@ class mmu_t
 
   template<typename T>
   void guest_store(reg_t addr, T val) {
-    store(addr, val, RISCV_XLATE_VIRT);
+    store(addr, val, RISCV_XLATE_FORCED_VIRT);
   }
 
   // AMO/Zicbom faults should be reported as store faults

From d091f84af4ddc1e3c64c78d9cbac0277efd32554 Mon Sep 17 00:00:00 2001
From: rbuchner <ryan.buchner@arilinc.com>
Date: Fri, 21 Apr 2023 12:22:43 -0700
Subject: [PATCH 011/110] Add xlate_flags_t struct

Use xlate_flags_t rather than the RISCV_XLATE_* preprocessor macros
---
 riscv/mmu.cc | 34 +++++++++++++++----------------
 riscv/mmu.h  | 56 ++++++++++++++++++++++++++++++++--------------------
 2 files changed, 52 insertions(+), 38 deletions(-)

diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index f6f0a0d2ac..4c42610e1d 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -52,13 +52,13 @@ void throw_access_exception(bool virt, reg_t addr, access_type type)
   }
 }
 
-reg_t mmu_t::translate(reg_t addr, reg_t len, access_type type, uint32_t xlate_flags)
+reg_t mmu_t::translate(reg_t addr, reg_t len, access_type type, xlate_flags_t xlate_flags)
 {
   if (!proc)
     return addr;
 
   bool virt = proc->state.v;
-  bool hlvx = xlate_flags & RISCV_XLATE_VIRT_HLVX;
+  bool hlvx = xlate_flags.hlvx;
   reg_t mode = proc->state.prv;
   if (type != FETCH) {
     if (in_mprv()) {
@@ -66,7 +66,7 @@ reg_t mmu_t::translate(reg_t addr, reg_t len, access_type type, uint32_t xlate_f
       if (get_field(proc->state.mstatus->read(), MSTATUS_MPV) && mode != PRV_M)
         virt = true;
     }
-    if (xlate_flags & RISCV_XLATE_FORCED_VIRT) {
+    if (xlate_flags.forced_virt) {
       virt = true;
       mode = get_field(proc->state.hstatus->read(), HSTATUS_SPVP);
     }
@@ -85,7 +85,7 @@ tlb_entry_t mmu_t::fetch_slow_path(reg_t vaddr)
   tlb_entry_t result;
   reg_t vpn = vaddr >> PGSHIFT;
   if (unlikely(tlb_insn_tag[vpn % TLB_ENTRIES] != (vpn | TLB_CHECK_TRIGGERS))) {
-    reg_t paddr = translate(vaddr, sizeof(fetch_temp), FETCH, 0);
+    reg_t paddr = translate(vaddr, sizeof(fetch_temp), FETCH, {false, false, false});
     if (auto host_addr = sim->addr_to_mem(paddr)) {
       result = refill_tlb(vaddr, paddr, host_addr, FETCH);
     } else {
@@ -198,10 +198,10 @@ void mmu_t::check_triggers(triggers::operation_t operation, reg_t address, std::
     }
 }
 
-void mmu_t::load_slow_path_intrapage(reg_t addr, reg_t len, uint8_t* bytes, uint32_t xlate_flags)
+void mmu_t::load_slow_path_intrapage(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t xlate_flags)
 {
   reg_t vpn = addr >> PGSHIFT;
-  if (xlate_flags == 0 && vpn == (tlb_load_tag[vpn % TLB_ENTRIES] & ~TLB_CHECK_TRIGGERS)) {
+  if (!(xlate_flags.hlvx || xlate_flags.forced_virt || xlate_flags.lr) && vpn == (tlb_load_tag[vpn % TLB_ENTRIES] & ~TLB_CHECK_TRIGGERS)) {
     auto host_addr = tlb_data[vpn % TLB_ENTRIES].host_offset + addr;
     memcpy(bytes, host_addr, len);
     return;
@@ -209,7 +209,7 @@ void mmu_t::load_slow_path_intrapage(reg_t addr, reg_t len, uint8_t* bytes, uint
 
   reg_t paddr = translate(addr, len, LOAD, xlate_flags);
 
-  if ((xlate_flags & RISCV_XLATE_LR) && !sim->reservable(paddr)) {
+  if (xlate_flags.lr && !sim->reservable(paddr)) {
     throw trap_load_access_fault((proc) ? proc->state.v : false, addr, 0, 0);
   }
 
@@ -217,30 +217,30 @@ void mmu_t::load_slow_path_intrapage(reg_t addr, reg_t len, uint8_t* bytes, uint
     memcpy(bytes, host_addr, len);
     if (tracer.interested_in_range(paddr, paddr + PGSIZE, LOAD))
       tracer.trace(paddr, len, LOAD);
-    else if (xlate_flags == 0)
+    else if (!(xlate_flags.hlvx || xlate_flags.forced_virt || xlate_flags.lr))
       refill_tlb(addr, paddr, host_addr, LOAD);
 
   } else if (!mmio_load(paddr, len, bytes)) {
     throw trap_load_access_fault((proc) ? proc->state.v : false, addr, 0, 0);
   }
 
-  if (xlate_flags & RISCV_XLATE_LR) {
+  if (xlate_flags.lr) {
     load_reservation_address = paddr;
   }
 }
 
-void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, uint32_t xlate_flags)
+void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t xlate_flags)
 {
   check_triggers(triggers::OPERATION_LOAD, addr);
 
   if ((addr & (len - 1)) == 0) {
     load_slow_path_intrapage(addr, len, bytes, xlate_flags);
   } else {
-    bool gva = ((proc) ? proc->state.v : false) || (RISCV_XLATE_FORCED_VIRT & xlate_flags);
+    bool gva = ((proc) ? proc->state.v : false) || xlate_flags.forced_virt;
     if (!is_misaligned_enabled())
       throw trap_load_address_misaligned(gva, addr, 0, 0);
 
-    if (xlate_flags & RISCV_XLATE_LR)
+    if (xlate_flags.lr)
       throw trap_load_access_fault(gva, addr, 0, 0);
 
     reg_t len_page0 = std::min(len, PGSIZE - addr % PGSIZE);
@@ -252,10 +252,10 @@ void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, uint32_t xlate
   check_triggers(triggers::OPERATION_LOAD, addr, reg_from_bytes(len, bytes));
 }
 
-void mmu_t::store_slow_path_intrapage(reg_t addr, reg_t len, const uint8_t* bytes, uint32_t xlate_flags, bool actually_store)
+void mmu_t::store_slow_path_intrapage(reg_t addr, reg_t len, const uint8_t* bytes, xlate_flags_t xlate_flags, bool actually_store)
 {
   reg_t vpn = addr >> PGSHIFT;
-  if (xlate_flags == 0 && vpn == (tlb_store_tag[vpn % TLB_ENTRIES] & ~TLB_CHECK_TRIGGERS)) {
+  if (!(xlate_flags.hlvx || xlate_flags.forced_virt || xlate_flags.lr) && vpn == (tlb_store_tag[vpn % TLB_ENTRIES] & ~TLB_CHECK_TRIGGERS)) {
     if (actually_store) {
       auto host_addr = tlb_data[vpn % TLB_ENTRIES].host_offset + addr;
       memcpy(host_addr, bytes, len);
@@ -270,7 +270,7 @@ void mmu_t::store_slow_path_intrapage(reg_t addr, reg_t len, const uint8_t* byte
       memcpy(host_addr, bytes, len);
       if (tracer.interested_in_range(paddr, paddr + PGSIZE, STORE))
         tracer.trace(paddr, len, STORE);
-      else if (xlate_flags == 0)
+      else if (!(xlate_flags.hlvx || xlate_flags.forced_virt || xlate_flags.lr))
         refill_tlb(addr, paddr, host_addr, STORE);
     } else if (!mmio_store(paddr, len, bytes)) {
       throw trap_store_access_fault((proc) ? proc->state.v : false, addr, 0, 0);
@@ -278,13 +278,13 @@ void mmu_t::store_slow_path_intrapage(reg_t addr, reg_t len, const uint8_t* byte
   }
 }
 
-void mmu_t::store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes, uint32_t xlate_flags, bool actually_store, bool UNUSED require_alignment)
+void mmu_t::store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes, xlate_flags_t xlate_flags, bool actually_store, bool UNUSED require_alignment)
 {
   if (actually_store)
     check_triggers(triggers::OPERATION_STORE, addr, reg_from_bytes(len, bytes));
 
   if (addr & (len - 1)) {
-    bool gva = ((proc) ? proc->state.v : false) || (RISCV_XLATE_FORCED_VIRT & xlate_flags);
+    bool gva = ((proc) ? proc->state.v : false) || xlate_flags.forced_virt;
     if (!is_misaligned_enabled())
       throw trap_store_address_misaligned(gva, addr, 0, 0);
 
diff --git a/riscv/mmu.h b/riscv/mmu.h
index d63a43f4eb..3c0467212c 100644
--- a/riscv/mmu.h
+++ b/riscv/mmu.h
@@ -38,6 +38,12 @@ struct tlb_entry_t {
   reg_t target_offset;
 };
 
+struct xlate_flags_t {
+  const bool forced_virt : 1;
+  const bool hlvx : 1;
+  const bool lr : 1;
+};
+
 void throw_access_exception(bool virt, reg_t addr, access_type type);
 
 // this class implements a processor's port into the virtual memory system.
@@ -51,18 +57,14 @@ class mmu_t
   mmu_t(simif_t* sim, endianness_t endianness, processor_t* proc);
   ~mmu_t();
 
-#define RISCV_XLATE_FORCED_VIRT (1U << 0)
-#define RISCV_XLATE_VIRT_HLVX (1U << 1)
-#define RISCV_XLATE_LR        (1U << 2)
-
   template<typename T>
-  T ALWAYS_INLINE load(reg_t addr, uint32_t xlate_flags = 0) {
+  T ALWAYS_INLINE load(reg_t addr, xlate_flags_t xlate_flags = {false, false, false}) {
     target_endian<T> res;
     reg_t vpn = addr >> PGSHIFT;
     bool aligned = (addr & (sizeof(T) - 1)) == 0;
     bool tlb_hit = tlb_load_tag[vpn % TLB_ENTRIES] == vpn;
 
-    if (likely(xlate_flags == 0 && aligned && tlb_hit)) {
+    if (likely(!(xlate_flags.hlvx || xlate_flags.forced_virt || xlate_flags.lr) && aligned && tlb_hit)) {
       res = *(target_endian<T>*)(tlb_data[vpn % TLB_ENTRIES].host_offset + addr);
     } else {
       load_slow_path(addr, sizeof(T), (uint8_t*)&res, xlate_flags);
@@ -76,26 +78,35 @@ class mmu_t
 
   template<typename T>
   T load_reserved(reg_t addr) {
-    return load<T>(addr, RISCV_XLATE_LR);
+    bool forced_virt = false;
+    bool hlvx = false;
+    bool lr = true;
+    return load<T>(addr, {forced_virt, hlvx, lr});
   }
 
   template<typename T>
   T guest_load(reg_t addr) {
-    return load<T>(addr, RISCV_XLATE_FORCED_VIRT);
+    bool forced_virt = true;
+    bool hlvx = false;
+    bool lr = false;
+    return load<T>(addr, {forced_virt, hlvx, lr});
   }
 
   template<typename T>
   T guest_load_x(reg_t addr) {
-    return load<T>(addr, RISCV_XLATE_FORCED_VIRT|RISCV_XLATE_VIRT_HLVX);
+    bool forced_virt = true;
+    bool hlvx = true;
+    bool lr = false;
+    return load<T>(addr, {forced_virt, hlvx, lr});
   }
 
   template<typename T>
-  void ALWAYS_INLINE store(reg_t addr, T val, uint32_t xlate_flags = 0) {
+  void ALWAYS_INLINE store(reg_t addr, T val, xlate_flags_t xlate_flags = {false, false, false}) {
     reg_t vpn = addr >> PGSHIFT;
     bool aligned = (addr & (sizeof(T) - 1)) == 0;
     bool tlb_hit = tlb_store_tag[vpn % TLB_ENTRIES] == vpn;
 
-    if (xlate_flags == 0 && likely(aligned && tlb_hit)) {
+    if (!(xlate_flags.hlvx || xlate_flags.forced_virt || xlate_flags.lr) && likely(aligned && tlb_hit)) {
       *(target_endian<T>*)(tlb_data[vpn % TLB_ENTRIES].host_offset + addr) = to_target(val);
     } else {
       target_endian<T> target_val = to_target(val);
@@ -108,7 +119,10 @@ class mmu_t
 
   template<typename T>
   void guest_store(reg_t addr, T val) {
-    store(addr, val, RISCV_XLATE_FORCED_VIRT);
+    bool forced_virt = true;
+    bool hlvx = false;
+    bool lr = false;
+    store(addr, val, {forced_virt, hlvx, lr});
   }
 
   // AMO/Zicbom faults should be reported as store faults
@@ -130,7 +144,7 @@ class mmu_t
   template<typename T, typename op>
   T amo(reg_t addr, op f) {
     convert_load_traps_to_store_traps({
-      store_slow_path(addr, sizeof(T), nullptr, 0, false, true);
+      store_slow_path(addr, sizeof(T), nullptr, {false, false, false}, false, true);
       auto lhs = load<T>(addr);
       store<T>(addr, f(lhs));
       return lhs;
@@ -164,7 +178,7 @@ class mmu_t
 
   void clean_inval(reg_t addr, bool clean, bool inval) {
     convert_load_traps_to_store_traps({
-      const reg_t paddr = translate(addr, blocksz, LOAD, 0) & ~(blocksz - 1);
+      const reg_t paddr = translate(addr, blocksz, LOAD, {false, false, false}) & ~(blocksz - 1);
       if (sim->reservable(paddr)) {
         if (tracer.interested_in_range(paddr, paddr + PGSIZE, LOAD))
           tracer.clean_invalidate(paddr, blocksz, clean, inval);
@@ -183,10 +197,10 @@ class mmu_t
   {
     if (vaddr & (size-1)) {
       // Raise either access fault or misaligned exception
-      store_slow_path(vaddr, size, nullptr, 0, false, true);
+      store_slow_path(vaddr, size, nullptr, {false, false, false}, false, true);
     }
 
-    reg_t paddr = translate(vaddr, 1, STORE, 0);
+    reg_t paddr = translate(vaddr, 1, STORE, {false, false, false});
     if (sim->reservable(paddr))
       return load_reservation_address == paddr;
     else
@@ -332,17 +346,17 @@ class mmu_t
 
   // handle uncommon cases: TLB misses, page faults, MMIO
   tlb_entry_t fetch_slow_path(reg_t addr);
-  void load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, uint32_t xlate_flags);
-  void load_slow_path_intrapage(reg_t addr, reg_t len, uint8_t* bytes, uint32_t xlate_flags);
-  void store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes, uint32_t xlate_flags, bool actually_store, bool require_alignment);
-  void store_slow_path_intrapage(reg_t addr, reg_t len, const uint8_t* bytes, uint32_t xlate_flags, bool actually_store);
+  void load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t xlate_flags);
+  void load_slow_path_intrapage(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t xlate_flags);
+  void store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes, xlate_flags_t xlate_flags, bool actually_store, bool require_alignment);
+  void store_slow_path_intrapage(reg_t addr, reg_t len, const uint8_t* bytes, xlate_flags_t xlate_flags, bool actually_store);
   bool mmio_fetch(reg_t paddr, size_t len, uint8_t* bytes);
   bool mmio_load(reg_t paddr, size_t len, uint8_t* bytes);
   bool mmio_store(reg_t paddr, size_t len, const uint8_t* bytes);
   bool mmio(reg_t paddr, size_t len, uint8_t* bytes, access_type type);
   bool mmio_ok(reg_t paddr, access_type type);
   void check_triggers(triggers::operation_t operation, reg_t address, std::optional<reg_t> data = std::nullopt);
-  reg_t translate(reg_t addr, reg_t len, access_type type, uint32_t xlate_flags);
+  reg_t translate(reg_t addr, reg_t len, access_type type, xlate_flags_t xlate_flags);
 
   reg_t pte_load(reg_t pte_paddr, reg_t addr, bool virt, access_type trap_type, size_t ptesize) {
     if (ptesize == 4)

From a0c5bf31ba22119bf365c5fcff262736d1b4ac49 Mon Sep 17 00:00:00 2001
From: rbuchner <ryan.buchner@arilinc.com>
Date: Mon, 24 Apr 2023 09:45:26 -0700
Subject: [PATCH 012/110] Add is_special_access() to xlate_flags_t

---
 riscv/mmu.cc | 8 ++++----
 riscv/mmu.h  | 8 ++++++--
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index 4c42610e1d..6a5fdeeb00 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -201,7 +201,7 @@ void mmu_t::check_triggers(triggers::operation_t operation, reg_t address, std::
 void mmu_t::load_slow_path_intrapage(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t xlate_flags)
 {
   reg_t vpn = addr >> PGSHIFT;
-  if (!(xlate_flags.hlvx || xlate_flags.forced_virt || xlate_flags.lr) && vpn == (tlb_load_tag[vpn % TLB_ENTRIES] & ~TLB_CHECK_TRIGGERS)) {
+  if (!xlate_flags.is_special_access() && vpn == (tlb_load_tag[vpn % TLB_ENTRIES] & ~TLB_CHECK_TRIGGERS)) {
     auto host_addr = tlb_data[vpn % TLB_ENTRIES].host_offset + addr;
     memcpy(bytes, host_addr, len);
     return;
@@ -217,7 +217,7 @@ void mmu_t::load_slow_path_intrapage(reg_t addr, reg_t len, uint8_t* bytes, xlat
     memcpy(bytes, host_addr, len);
     if (tracer.interested_in_range(paddr, paddr + PGSIZE, LOAD))
       tracer.trace(paddr, len, LOAD);
-    else if (!(xlate_flags.hlvx || xlate_flags.forced_virt || xlate_flags.lr))
+    else if (!xlate_flags.is_special_access())
       refill_tlb(addr, paddr, host_addr, LOAD);
 
   } else if (!mmio_load(paddr, len, bytes)) {
@@ -255,7 +255,7 @@ void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t
 void mmu_t::store_slow_path_intrapage(reg_t addr, reg_t len, const uint8_t* bytes, xlate_flags_t xlate_flags, bool actually_store)
 {
   reg_t vpn = addr >> PGSHIFT;
-  if (!(xlate_flags.hlvx || xlate_flags.forced_virt || xlate_flags.lr) && vpn == (tlb_store_tag[vpn % TLB_ENTRIES] & ~TLB_CHECK_TRIGGERS)) {
+  if (!xlate_flags.is_special_access() && vpn == (tlb_store_tag[vpn % TLB_ENTRIES] & ~TLB_CHECK_TRIGGERS)) {
     if (actually_store) {
       auto host_addr = tlb_data[vpn % TLB_ENTRIES].host_offset + addr;
       memcpy(host_addr, bytes, len);
@@ -270,7 +270,7 @@ void mmu_t::store_slow_path_intrapage(reg_t addr, reg_t len, const uint8_t* byte
       memcpy(host_addr, bytes, len);
       if (tracer.interested_in_range(paddr, paddr + PGSIZE, STORE))
         tracer.trace(paddr, len, STORE);
-      else if (!(xlate_flags.hlvx || xlate_flags.forced_virt || xlate_flags.lr))
+      else if (!xlate_flags.is_special_access())
         refill_tlb(addr, paddr, host_addr, STORE);
     } else if (!mmio_store(paddr, len, bytes)) {
       throw trap_store_access_fault((proc) ? proc->state.v : false, addr, 0, 0);
diff --git a/riscv/mmu.h b/riscv/mmu.h
index 3c0467212c..6e79539c11 100644
--- a/riscv/mmu.h
+++ b/riscv/mmu.h
@@ -42,6 +42,10 @@ struct xlate_flags_t {
   const bool forced_virt : 1;
   const bool hlvx : 1;
   const bool lr : 1;
+
+  bool is_special_access() const {
+    return forced_virt || hlvx || lr;
+  }
 };
 
 void throw_access_exception(bool virt, reg_t addr, access_type type);
@@ -64,7 +68,7 @@ class mmu_t
     bool aligned = (addr & (sizeof(T) - 1)) == 0;
     bool tlb_hit = tlb_load_tag[vpn % TLB_ENTRIES] == vpn;
 
-    if (likely(!(xlate_flags.hlvx || xlate_flags.forced_virt || xlate_flags.lr) && aligned && tlb_hit)) {
+    if (likely(!xlate_flags.is_special_access() && aligned && tlb_hit)) {
       res = *(target_endian<T>*)(tlb_data[vpn % TLB_ENTRIES].host_offset + addr);
     } else {
       load_slow_path(addr, sizeof(T), (uint8_t*)&res, xlate_flags);
@@ -106,7 +110,7 @@ class mmu_t
     bool aligned = (addr & (sizeof(T) - 1)) == 0;
     bool tlb_hit = tlb_store_tag[vpn % TLB_ENTRIES] == vpn;
 
-    if (!(xlate_flags.hlvx || xlate_flags.forced_virt || xlate_flags.lr) && likely(aligned && tlb_hit)) {
+    if (!xlate_flags.is_special_access() && likely(aligned && tlb_hit)) {
       *(target_endian<T>*)(tlb_data[vpn % TLB_ENTRIES].host_offset + addr) = to_target(val);
     } else {
       target_endian<T> target_val = to_target(val);

From 8a34e1a5b3bc68b915127af15ea9254f7d812727 Mon Sep 17 00:00:00 2001
From: Ryan Buchner <ryan.buchner@arilinc.com>
Date: Tue, 11 Apr 2023 10:42:58 -0700
Subject: [PATCH 013/110] Add structure (mem_access_info_t) for holding memory
 access information

Add complementary function for generating access information.

Update mmu_t::translate() to accept a mem_access_info_t.
---
 riscv/mmu.cc | 28 ++++++++++------------------
 riscv/mmu.h  | 34 +++++++++++++++++++++++++++++++---
 2 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index 6a5fdeeb00..e2341fb365 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -52,25 +52,16 @@ void throw_access_exception(bool virt, reg_t addr, access_type type)
   }
 }
 
-reg_t mmu_t::translate(reg_t addr, reg_t len, access_type type, xlate_flags_t xlate_flags)
+reg_t mmu_t::translate(mem_access_info_t access_info, reg_t len)
 {
+  reg_t addr = access_info.vaddr;
+  access_type type = access_info.type;
   if (!proc)
     return addr;
 
-  bool virt = proc->state.v;
-  bool hlvx = xlate_flags.hlvx;
-  reg_t mode = proc->state.prv;
-  if (type != FETCH) {
-    if (in_mprv()) {
-      mode = get_field(proc->state.mstatus->read(), MSTATUS_MPP);
-      if (get_field(proc->state.mstatus->read(), MSTATUS_MPV) && mode != PRV_M)
-        virt = true;
-    }
-    if (xlate_flags.forced_virt) {
-      virt = true;
-      mode = get_field(proc->state.hstatus->read(), HSTATUS_SPVP);
-    }
-  }
+  bool virt = access_info.effective_virt;
+  bool hlvx = access_info.flags.hlvx;
+  reg_t mode = (reg_t) access_info.effective_priv;
 
   reg_t paddr = walk(addr, type, mode, virt, hlvx) | (addr & (PGSIZE-1));
   if (!pmp_ok(paddr, len, type, mode))
@@ -80,12 +71,13 @@ reg_t mmu_t::translate(reg_t addr, reg_t len, access_type type, xlate_flags_t xl
 
 tlb_entry_t mmu_t::fetch_slow_path(reg_t vaddr)
 {
+  auto access_info = generate_access_info(vaddr, FETCH, {false, false, false});
   check_triggers(triggers::OPERATION_EXECUTE, vaddr);
 
   tlb_entry_t result;
   reg_t vpn = vaddr >> PGSHIFT;
   if (unlikely(tlb_insn_tag[vpn % TLB_ENTRIES] != (vpn | TLB_CHECK_TRIGGERS))) {
-    reg_t paddr = translate(vaddr, sizeof(fetch_temp), FETCH, {false, false, false});
+    reg_t paddr = translate(access_info, sizeof(fetch_temp));
     if (auto host_addr = sim->addr_to_mem(paddr)) {
       result = refill_tlb(vaddr, paddr, host_addr, FETCH);
     } else {
@@ -207,7 +199,7 @@ void mmu_t::load_slow_path_intrapage(reg_t addr, reg_t len, uint8_t* bytes, xlat
     return;
   }
 
-  reg_t paddr = translate(addr, len, LOAD, xlate_flags);
+  reg_t paddr = translate(generate_access_info(addr, LOAD, xlate_flags), len);
 
   if (xlate_flags.lr && !sim->reservable(paddr)) {
     throw trap_load_access_fault((proc) ? proc->state.v : false, addr, 0, 0);
@@ -263,7 +255,7 @@ void mmu_t::store_slow_path_intrapage(reg_t addr, reg_t len, const uint8_t* byte
     return;
   }
 
-  reg_t paddr = translate(addr, len, STORE, xlate_flags);
+  reg_t paddr = translate(generate_access_info(addr, STORE, xlate_flags), len);
 
   if (actually_store) {
     if (auto host_addr = sim->addr_to_mem(paddr)) {
diff --git a/riscv/mmu.h b/riscv/mmu.h
index 6e79539c11..1039de1b99 100644
--- a/riscv/mmu.h
+++ b/riscv/mmu.h
@@ -48,6 +48,14 @@ struct xlate_flags_t {
   }
 };
 
+struct mem_access_info_t {
+  const reg_t vaddr;
+  const reg_t effective_priv;
+  const bool effective_virt;
+  const xlate_flags_t flags;
+  const access_type type;
+};
+
 void throw_access_exception(bool virt, reg_t addr, access_type type);
 
 // this class implements a processor's port into the virtual memory system.
@@ -57,6 +65,26 @@ class mmu_t
 private:
   std::map<reg_t, reg_t> alloc_cache;
   std::vector<std::pair<reg_t, reg_t >> addr_tbl;
+
+  mem_access_info_t generate_access_info(reg_t addr, access_type type, xlate_flags_t xlate_flags) {
+    if (!proc)
+      return {addr, 0, false, {false, false, false}, type};
+    bool virt = proc->state.v;
+    reg_t mode = proc->state.prv;
+    if (type != FETCH) {
+      if (in_mprv()) {
+        mode = get_field(proc->state.mstatus->read(), MSTATUS_MPP);
+        if (get_field(proc->state.mstatus->read(), MSTATUS_MPV) && mode != PRV_M)
+          virt = true;
+      }
+      if (xlate_flags.forced_virt) {
+        virt = true;
+        mode = get_field(proc->state.hstatus->read(), HSTATUS_SPVP);
+      }
+    }
+    return {addr, mode, virt, xlate_flags, type};
+  }
+
 public:
   mmu_t(simif_t* sim, endianness_t endianness, processor_t* proc);
   ~mmu_t();
@@ -182,7 +210,7 @@ class mmu_t
 
   void clean_inval(reg_t addr, bool clean, bool inval) {
     convert_load_traps_to_store_traps({
-      const reg_t paddr = translate(addr, blocksz, LOAD, {false, false, false}) & ~(blocksz - 1);
+        const reg_t paddr = translate(generate_access_info(addr, LOAD, {false, false, false}), blocksz) & ~(blocksz - 1);
       if (sim->reservable(paddr)) {
         if (tracer.interested_in_range(paddr, paddr + PGSIZE, LOAD))
           tracer.clean_invalidate(paddr, blocksz, clean, inval);
@@ -204,7 +232,7 @@ class mmu_t
       store_slow_path(vaddr, size, nullptr, {false, false, false}, false, true);
     }
 
-    reg_t paddr = translate(vaddr, 1, STORE, {false, false, false});
+    reg_t paddr = translate(generate_access_info(vaddr, STORE, {false, false, false}), 1);
     if (sim->reservable(paddr))
       return load_reservation_address == paddr;
     else
@@ -360,7 +388,7 @@ class mmu_t
   bool mmio(reg_t paddr, size_t len, uint8_t* bytes, access_type type);
   bool mmio_ok(reg_t paddr, access_type type);
   void check_triggers(triggers::operation_t operation, reg_t address, std::optional<reg_t> data = std::nullopt);
-  reg_t translate(reg_t addr, reg_t len, access_type type, xlate_flags_t xlate_flags);
+  reg_t translate(mem_access_info_t access_info, reg_t len);
 
   reg_t pte_load(reg_t pte_paddr, reg_t addr, bool virt, access_type trap_type, size_t ptesize) {
     if (ptesize == 4)

From 87690a5ed4b55a3938efac098ce68fbf4d7fb037 Mon Sep 17 00:00:00 2001
From: Ryan Buchner <ryan.buchner@arilinc.com>
Date: Mon, 17 Apr 2023 19:19:01 -0700
Subject: [PATCH 014/110] Adjust load_slow_path_intrapage to receive a
 mem_access_info_t as input

---
 riscv/mmu.cc | 20 +++++++++++---------
 riscv/mmu.h  |  2 +-
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index e2341fb365..34cd1703a0 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -190,18 +190,19 @@ void mmu_t::check_triggers(triggers::operation_t operation, reg_t address, std::
     }
 }
 
-void mmu_t::load_slow_path_intrapage(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t xlate_flags)
+void mmu_t::load_slow_path_intrapage(reg_t len, uint8_t* bytes, mem_access_info_t access_info)
 {
+  reg_t addr = access_info.vaddr;
   reg_t vpn = addr >> PGSHIFT;
-  if (!xlate_flags.is_special_access() && vpn == (tlb_load_tag[vpn % TLB_ENTRIES] & ~TLB_CHECK_TRIGGERS)) {
+  if (!access_info.flags.is_special_access() && vpn == (tlb_load_tag[vpn % TLB_ENTRIES] & ~TLB_CHECK_TRIGGERS)) {
     auto host_addr = tlb_data[vpn % TLB_ENTRIES].host_offset + addr;
     memcpy(bytes, host_addr, len);
     return;
   }
 
-  reg_t paddr = translate(generate_access_info(addr, LOAD, xlate_flags), len);
+  reg_t paddr = translate(access_info, len);
 
-  if (xlate_flags.lr && !sim->reservable(paddr)) {
+  if (access_info.flags.lr && !sim->reservable(paddr)) {
     throw trap_load_access_fault((proc) ? proc->state.v : false, addr, 0, 0);
   }
 
@@ -209,24 +210,25 @@ void mmu_t::load_slow_path_intrapage(reg_t addr, reg_t len, uint8_t* bytes, xlat
     memcpy(bytes, host_addr, len);
     if (tracer.interested_in_range(paddr, paddr + PGSIZE, LOAD))
       tracer.trace(paddr, len, LOAD);
-    else if (!xlate_flags.is_special_access())
+    else if (!access_info.flags.is_special_access())
       refill_tlb(addr, paddr, host_addr, LOAD);
 
   } else if (!mmio_load(paddr, len, bytes)) {
     throw trap_load_access_fault((proc) ? proc->state.v : false, addr, 0, 0);
   }
 
-  if (xlate_flags.lr) {
+  if (access_info.flags.lr) {
     load_reservation_address = paddr;
   }
 }
 
 void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t xlate_flags)
 {
+  auto access_info = generate_access_info(addr, LOAD, xlate_flags);
   check_triggers(triggers::OPERATION_LOAD, addr);
 
   if ((addr & (len - 1)) == 0) {
-    load_slow_path_intrapage(addr, len, bytes, xlate_flags);
+    load_slow_path_intrapage(len, bytes, access_info);
   } else {
     bool gva = ((proc) ? proc->state.v : false) || xlate_flags.forced_virt;
     if (!is_misaligned_enabled())
@@ -236,9 +238,9 @@ void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t
       throw trap_load_access_fault(gva, addr, 0, 0);
 
     reg_t len_page0 = std::min(len, PGSIZE - addr % PGSIZE);
-    load_slow_path_intrapage(addr, len_page0, bytes, xlate_flags);
+    load_slow_path_intrapage(len_page0, bytes, access_info);
     if (len_page0 != len)
-      load_slow_path_intrapage(addr + len_page0, len - len_page0, bytes + len_page0, xlate_flags);
+      load_slow_path_intrapage(len - len_page0, bytes + len_page0, generate_access_info(addr + len_page0, LOAD, xlate_flags));
   }
 
   check_triggers(triggers::OPERATION_LOAD, addr, reg_from_bytes(len, bytes));
diff --git a/riscv/mmu.h b/riscv/mmu.h
index 1039de1b99..1d38849fa4 100644
--- a/riscv/mmu.h
+++ b/riscv/mmu.h
@@ -379,7 +379,7 @@ class mmu_t
   // handle uncommon cases: TLB misses, page faults, MMIO
   tlb_entry_t fetch_slow_path(reg_t addr);
   void load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t xlate_flags);
-  void load_slow_path_intrapage(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t xlate_flags);
+  void load_slow_path_intrapage(reg_t len, uint8_t* bytes, mem_access_info_t access_info);
   void store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes, xlate_flags_t xlate_flags, bool actually_store, bool require_alignment);
   void store_slow_path_intrapage(reg_t addr, reg_t len, const uint8_t* bytes, xlate_flags_t xlate_flags, bool actually_store);
   bool mmio_fetch(reg_t paddr, size_t len, uint8_t* bytes);

From 9312137ae2a218632ec293ecc12da7c72fa828b2 Mon Sep 17 00:00:00 2001
From: rbuchner <ryan.buchner@arilinc.com>
Date: Mon, 24 Apr 2023 16:47:28 -0700
Subject: [PATCH 015/110] Use access_info.effective_virt when access_fault due
 to non-reservable lr

Fixes case 4 from https://github.com/riscv-software-src/riscv-isa-sim/issues/872
---
 riscv/mmu.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index 34cd1703a0..acbf652873 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -203,7 +203,7 @@ void mmu_t::load_slow_path_intrapage(reg_t len, uint8_t* bytes, mem_access_info_
   reg_t paddr = translate(access_info, len);
 
   if (access_info.flags.lr && !sim->reservable(paddr)) {
-    throw trap_load_access_fault((proc) ? proc->state.v : false, addr, 0, 0);
+    throw trap_load_access_fault(access_info.effective_virt, addr, 0, 0);
   }
 
   if (auto host_addr = sim->addr_to_mem(paddr)) {

From bd675766091549e4fc1607f6106b0dce7dc03d21 Mon Sep 17 00:00:00 2001
From: rbuchner <ryan.buchner@arilinc.com>
Date: Mon, 24 Apr 2023 16:59:34 -0700
Subject: [PATCH 016/110] Use access_info.effective_virt when failed mmio_load
 (i.e. device detects access fault)

Fixes case 3 from https://github.com/riscv-software-src/riscv-isa-sim/issues/872
---
 riscv/mmu.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index acbf652873..db6c31ea93 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -214,7 +214,7 @@ void mmu_t::load_slow_path_intrapage(reg_t len, uint8_t* bytes, mem_access_info_
       refill_tlb(addr, paddr, host_addr, LOAD);
 
   } else if (!mmio_load(paddr, len, bytes)) {
-    throw trap_load_access_fault((proc) ? proc->state.v : false, addr, 0, 0);
+    throw trap_load_access_fault(access_info.effective_virt, addr, 0, 0);
   }
 
   if (access_info.flags.lr) {

From 2745d3139cefd1fc2b97bb9382188c59f15eced9 Mon Sep 17 00:00:00 2001
From: Ryan Buchner <ryan.buchner@arilinc.com>
Date: Mon, 17 Apr 2023 20:22:59 -0700
Subject: [PATCH 017/110] Use access_info within load_slow_path rather than
 xlate_flags

Fixes case 2 from https://github.com/riscv-software-src/riscv-isa-sim/issues/872
---
 riscv/mmu.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index db6c31ea93..be986feae8 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -230,11 +230,11 @@ void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t
   if ((addr & (len - 1)) == 0) {
     load_slow_path_intrapage(len, bytes, access_info);
   } else {
-    bool gva = ((proc) ? proc->state.v : false) || xlate_flags.forced_virt;
+    bool gva = access_info.effective_virt;
     if (!is_misaligned_enabled())
       throw trap_load_address_misaligned(gva, addr, 0, 0);
 
-    if (xlate_flags.lr)
+    if (access_info.flags.lr)
       throw trap_load_access_fault(gva, addr, 0, 0);
 
     reg_t len_page0 = std::min(len, PGSIZE - addr % PGSIZE);

From 125c4d6a6400eef6365d8379efef1330c429f64e Mon Sep 17 00:00:00 2001
From: Ryan Buchner <ryan.buchner@arilinc.com>
Date: Mon, 17 Apr 2023 20:33:46 -0700
Subject: [PATCH 018/110] Adjust store_slow_path_intrapage to receive a
 mem_access_info_t as input

---
 riscv/mmu.cc | 16 +++++++++-------
 riscv/mmu.h  |  2 +-
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index be986feae8..cf77325d66 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -246,10 +246,11 @@ void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t
   check_triggers(triggers::OPERATION_LOAD, addr, reg_from_bytes(len, bytes));
 }
 
-void mmu_t::store_slow_path_intrapage(reg_t addr, reg_t len, const uint8_t* bytes, xlate_flags_t xlate_flags, bool actually_store)
+void mmu_t::store_slow_path_intrapage(reg_t len, const uint8_t* bytes, mem_access_info_t access_info, bool actually_store)
 {
+  reg_t addr = access_info.vaddr;
   reg_t vpn = addr >> PGSHIFT;
-  if (!xlate_flags.is_special_access() && vpn == (tlb_store_tag[vpn % TLB_ENTRIES] & ~TLB_CHECK_TRIGGERS)) {
+  if (!access_info.flags.is_special_access() && vpn == (tlb_store_tag[vpn % TLB_ENTRIES] & ~TLB_CHECK_TRIGGERS)) {
     if (actually_store) {
       auto host_addr = tlb_data[vpn % TLB_ENTRIES].host_offset + addr;
       memcpy(host_addr, bytes, len);
@@ -257,14 +258,14 @@ void mmu_t::store_slow_path_intrapage(reg_t addr, reg_t len, const uint8_t* byte
     return;
   }
 
-  reg_t paddr = translate(generate_access_info(addr, STORE, xlate_flags), len);
+  reg_t paddr = translate(access_info, len);
 
   if (actually_store) {
     if (auto host_addr = sim->addr_to_mem(paddr)) {
       memcpy(host_addr, bytes, len);
       if (tracer.interested_in_range(paddr, paddr + PGSIZE, STORE))
         tracer.trace(paddr, len, STORE);
-      else if (!xlate_flags.is_special_access())
+      else if (!access_info.flags.is_special_access())
         refill_tlb(addr, paddr, host_addr, STORE);
     } else if (!mmio_store(paddr, len, bytes)) {
       throw trap_store_access_fault((proc) ? proc->state.v : false, addr, 0, 0);
@@ -274,6 +275,7 @@ void mmu_t::store_slow_path_intrapage(reg_t addr, reg_t len, const uint8_t* byte
 
 void mmu_t::store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes, xlate_flags_t xlate_flags, bool actually_store, bool UNUSED require_alignment)
 {
+  auto access_info = generate_access_info(addr, STORE, xlate_flags);
   if (actually_store)
     check_triggers(triggers::OPERATION_STORE, addr, reg_from_bytes(len, bytes));
 
@@ -286,11 +288,11 @@ void mmu_t::store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes, xlate_f
       throw trap_store_access_fault(gva, addr, 0, 0);
 
     reg_t len_page0 = std::min(len, PGSIZE - addr % PGSIZE);
-    store_slow_path_intrapage(addr, len_page0, bytes, xlate_flags, actually_store);
+    store_slow_path_intrapage(len_page0, bytes, access_info, actually_store);
     if (len_page0 != len)
-      store_slow_path_intrapage(addr + len_page0, len - len_page0, bytes + len_page0, xlate_flags, actually_store);
+      store_slow_path_intrapage(len - len_page0, bytes + len_page0, generate_access_info(addr + len_page0, STORE, xlate_flags), actually_store);
   } else {
-    store_slow_path_intrapage(addr, len, bytes, xlate_flags, actually_store);
+    store_slow_path_intrapage(len, bytes, access_info, actually_store);
   }
 }
 
diff --git a/riscv/mmu.h b/riscv/mmu.h
index 1d38849fa4..41f6751a64 100644
--- a/riscv/mmu.h
+++ b/riscv/mmu.h
@@ -381,7 +381,7 @@ class mmu_t
   void load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t xlate_flags);
   void load_slow_path_intrapage(reg_t len, uint8_t* bytes, mem_access_info_t access_info);
   void store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes, xlate_flags_t xlate_flags, bool actually_store, bool require_alignment);
-  void store_slow_path_intrapage(reg_t addr, reg_t len, const uint8_t* bytes, xlate_flags_t xlate_flags, bool actually_store);
+  void store_slow_path_intrapage(reg_t len, const uint8_t* bytes, mem_access_info_t access_info, bool actually_store);
   bool mmio_fetch(reg_t paddr, size_t len, uint8_t* bytes);
   bool mmio_load(reg_t paddr, size_t len, uint8_t* bytes);
   bool mmio_store(reg_t paddr, size_t len, const uint8_t* bytes);

From 850600792ec04756f7720a9b376cfb2d8ad6c917 Mon Sep 17 00:00:00 2001
From: rbuchner <ryan.buchner@arilinc.com>
Date: Mon, 1 May 2023 12:42:28 -0700
Subject: [PATCH 019/110] Use access_info.effective_virt when failed mmio_store
 (i.e. device detects access fault)

Fixes case 3 from https://github.com/riscv-software-src/riscv-isa-sim/issues/872
---
 riscv/mmu.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index cf77325d66..a52403522a 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -268,7 +268,7 @@ void mmu_t::store_slow_path_intrapage(reg_t len, const uint8_t* bytes, mem_acces
       else if (!access_info.flags.is_special_access())
         refill_tlb(addr, paddr, host_addr, STORE);
     } else if (!mmio_store(paddr, len, bytes)) {
-      throw trap_store_access_fault((proc) ? proc->state.v : false, addr, 0, 0);
+      throw trap_store_access_fault(access_info.effective_virt, addr, 0, 0);
     }
   }
 }

From f7900e4730e1c13fa42789bc01d8f0366756130e Mon Sep 17 00:00:00 2001
From: Ryan Buchner <ryan.buchner@arilinc.com>
Date: Mon, 17 Apr 2023 20:34:29 -0700
Subject: [PATCH 020/110] Use access_info within store_slow_path rather than
 xlate_flags

---
 riscv/mmu.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index a52403522a..7264ea8b92 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -280,7 +280,7 @@ void mmu_t::store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes, xlate_f
     check_triggers(triggers::OPERATION_STORE, addr, reg_from_bytes(len, bytes));
 
   if (addr & (len - 1)) {
-    bool gva = ((proc) ? proc->state.v : false) || xlate_flags.forced_virt;
+    bool gva = access_info.effective_virt;
     if (!is_misaligned_enabled())
       throw trap_store_address_misaligned(gva, addr, 0, 0);
 

From 4b9996bad9a3327b13056f21b7b2e03fdc41f65a Mon Sep 17 00:00:00 2001
From: rbuchner <ryan.buchner@arilinc.com>
Date: Tue, 18 Apr 2023 14:06:12 -0700
Subject: [PATCH 021/110] Pass mem_access_info_t into walk()

---
 riscv/mmu.cc | 10 +++++++---
 riscv/mmu.h  |  2 +-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index 7264ea8b92..f40ce30727 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -60,10 +60,9 @@ reg_t mmu_t::translate(mem_access_info_t access_info, reg_t len)
     return addr;
 
   bool virt = access_info.effective_virt;
-  bool hlvx = access_info.flags.hlvx;
   reg_t mode = (reg_t) access_info.effective_priv;
 
-  reg_t paddr = walk(addr, type, mode, virt, hlvx) | (addr & (PGSIZE-1));
+  reg_t paddr = walk(access_info) | (addr & (PGSIZE-1));
   if (!pmp_ok(paddr, len, type, mode))
     throw_access_exception(virt, addr, type);
   return paddr;
@@ -461,8 +460,13 @@ reg_t mmu_t::s2xlate(reg_t gva, reg_t gpa, access_type type, access_type trap_ty
   }
 }
 
-reg_t mmu_t::walk(reg_t addr, access_type type, reg_t mode, bool virt, bool hlvx)
+reg_t mmu_t::walk(mem_access_info_t access_info)
 {
+  access_type type = access_info.type;
+  reg_t addr = access_info.vaddr;
+  bool virt = access_info.effective_virt;
+  bool hlvx = access_info.flags.hlvx;
+  reg_t mode = access_info.effective_priv;
   reg_t page_mask = (reg_t(1) << PGSHIFT) - 1;
   reg_t satp = proc->get_state()->satp->readvirt(virt);
   vm_info vm = decode_vm_info(proc->get_const_xlen(), false, mode, satp);
diff --git a/riscv/mmu.h b/riscv/mmu.h
index 41f6751a64..7d14ad528f 100644
--- a/riscv/mmu.h
+++ b/riscv/mmu.h
@@ -374,7 +374,7 @@ class mmu_t
   reg_t s2xlate(reg_t gva, reg_t gpa, access_type type, access_type trap_type, bool virt, bool hlvx);
 
   // perform a page table walk for a given VA; set referenced/dirty bits
-  reg_t walk(reg_t addr, access_type type, reg_t prv, bool virt, bool hlvx);
+  reg_t walk(mem_access_info_t access_info);
 
   // handle uncommon cases: TLB misses, page faults, MMIO
   tlb_entry_t fetch_slow_path(reg_t addr);

From 36b8c12e9f4d92f3cb97daf4ea0613436724438f Mon Sep 17 00:00:00 2001
From: rbuchner <ryan.buchner@arilinc.com>
Date: Wed, 19 Apr 2023 16:20:56 -0700
Subject: [PATCH 022/110] Add split_misaligned_access() to mem_access_info_t

---
 riscv/mmu.cc | 4 ++--
 riscv/mmu.h  | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index f40ce30727..734e8cd991 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -239,7 +239,7 @@ void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t
     reg_t len_page0 = std::min(len, PGSIZE - addr % PGSIZE);
     load_slow_path_intrapage(len_page0, bytes, access_info);
     if (len_page0 != len)
-      load_slow_path_intrapage(len - len_page0, bytes + len_page0, generate_access_info(addr + len_page0, LOAD, xlate_flags));
+      load_slow_path_intrapage(len - len_page0, bytes + len_page0, access_info.split_misaligned_access(len_page0));
   }
 
   check_triggers(triggers::OPERATION_LOAD, addr, reg_from_bytes(len, bytes));
@@ -289,7 +289,7 @@ void mmu_t::store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes, xlate_f
     reg_t len_page0 = std::min(len, PGSIZE - addr % PGSIZE);
     store_slow_path_intrapage(len_page0, bytes, access_info, actually_store);
     if (len_page0 != len)
-      store_slow_path_intrapage(len - len_page0, bytes + len_page0, generate_access_info(addr + len_page0, STORE, xlate_flags), actually_store);
+      store_slow_path_intrapage(len - len_page0, bytes + len_page0, access_info.split_misaligned_access(len_page0), actually_store);
   } else {
     store_slow_path_intrapage(len, bytes, access_info, actually_store);
   }
diff --git a/riscv/mmu.h b/riscv/mmu.h
index 7d14ad528f..2f93863261 100644
--- a/riscv/mmu.h
+++ b/riscv/mmu.h
@@ -54,6 +54,10 @@ struct mem_access_info_t {
   const bool effective_virt;
   const xlate_flags_t flags;
   const access_type type;
+
+  mem_access_info_t split_misaligned_access(reg_t offset) const {
+    return {vaddr + offset, effective_priv, effective_virt, flags, type};
+  }
 };
 
 void throw_access_exception(bool virt, reg_t addr, access_type type);

From 33fbc2df39df914d3462bede4112db7966d49a3c Mon Sep 17 00:00:00 2001
From: rbuchner <ryan.buchner@arilinc.com>
Date: Fri, 28 Apr 2023 16:28:16 -0700
Subject: [PATCH 023/110] Plumb in effective virtual bit to
 take_trigger_action()

---
 riscv/execute.cc   |  6 +++---
 riscv/mmu.cc       | 16 ++++++++--------
 riscv/mmu.h        |  2 +-
 riscv/processor.cc |  2 +-
 riscv/processor.h  |  2 +-
 riscv/triggers.h   |  5 +++--
 6 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/riscv/execute.cc b/riscv/execute.cc
index acf0e908c6..295879d4db 100644
--- a/riscv/execute.cc
+++ b/riscv/execute.cc
@@ -267,7 +267,7 @@ void processor_t::step(size_t n)
             auto match = TM.detect_icount_match();
             if (match.has_value()) {
               assert(match->timing == triggers::TIMING_BEFORE);
-              throw triggers::matched_t((triggers::operation_t)0, 0, match->action);
+              throw triggers::matched_t((triggers::operation_t)0, 0, match->action, state.v);
             }
           }
 
@@ -310,7 +310,7 @@ void processor_t::step(size_t n)
       // Trigger action takes priority over single step
       auto match = TM.detect_trap_match(t);
       if (match.has_value())
-        take_trigger_action(match->action, 0, state.pc);
+        take_trigger_action(match->action, 0, state.pc, 0);
       else if (unlikely(state.single_step == state.STEP_STEPPED)) {
         state.single_step = state.STEP_NONE;
         enter_debug_mode(DCSR_CAUSE_STEP);
@@ -322,7 +322,7 @@ void processor_t::step(size_t n)
         delete mmu->matched_trigger;
         mmu->matched_trigger = NULL;
       }
-      take_trigger_action(t.action, t.address, pc);
+      take_trigger_action(t.action, t.address, pc, t.gva);
     }
     catch(trap_debug_mode&)
     {
diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index 734e8cd991..358ccd3e42 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -71,7 +71,7 @@ reg_t mmu_t::translate(mem_access_info_t access_info, reg_t len)
 tlb_entry_t mmu_t::fetch_slow_path(reg_t vaddr)
 {
   auto access_info = generate_access_info(vaddr, FETCH, {false, false, false});
-  check_triggers(triggers::OPERATION_EXECUTE, vaddr);
+  check_triggers(triggers::OPERATION_EXECUTE, vaddr, access_info.effective_virt);
 
   tlb_entry_t result;
   reg_t vpn = vaddr >> PGSHIFT;
@@ -88,7 +88,7 @@ tlb_entry_t mmu_t::fetch_slow_path(reg_t vaddr)
     result = tlb_data[vpn % TLB_ENTRIES];
   }
 
-  check_triggers(triggers::OPERATION_EXECUTE, vaddr, from_le(*(const uint16_t*)(result.host_offset + vaddr)));
+  check_triggers(triggers::OPERATION_EXECUTE, vaddr, access_info.effective_virt, from_le(*(const uint16_t*)(result.host_offset + vaddr)));
 
   return result;
 }
@@ -169,7 +169,7 @@ bool mmu_t::mmio(reg_t paddr, size_t len, uint8_t* bytes, access_type type)
   return true;
 }
 
-void mmu_t::check_triggers(triggers::operation_t operation, reg_t address, std::optional<reg_t> data)
+void mmu_t::check_triggers(triggers::operation_t operation, reg_t address, bool virt, std::optional<reg_t> data)
 {
   if (matched_trigger || !proc)
     return;
@@ -179,13 +179,13 @@ void mmu_t::check_triggers(triggers::operation_t operation, reg_t address, std::
   if (match.has_value())
     switch (match->timing) {
       case triggers::TIMING_BEFORE:
-        throw triggers::matched_t(operation, address, match->action);
+        throw triggers::matched_t(operation, address, match->action, virt);
 
       case triggers::TIMING_AFTER:
         // We want to take this exception on the next instruction.  We check
         // whether to do so in the I$ refill path, so flush the I$.
         flush_icache();
-        matched_trigger = new triggers::matched_t(operation, address, match->action);
+        matched_trigger = new triggers::matched_t(operation, address, match->action, virt);
     }
 }
 
@@ -224,7 +224,7 @@ void mmu_t::load_slow_path_intrapage(reg_t len, uint8_t* bytes, mem_access_info_
 void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t xlate_flags)
 {
   auto access_info = generate_access_info(addr, LOAD, xlate_flags);
-  check_triggers(triggers::OPERATION_LOAD, addr);
+  check_triggers(triggers::OPERATION_LOAD, addr, access_info.effective_virt);
 
   if ((addr & (len - 1)) == 0) {
     load_slow_path_intrapage(len, bytes, access_info);
@@ -242,7 +242,7 @@ void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t
       load_slow_path_intrapage(len - len_page0, bytes + len_page0, access_info.split_misaligned_access(len_page0));
   }
 
-  check_triggers(triggers::OPERATION_LOAD, addr, reg_from_bytes(len, bytes));
+  check_triggers(triggers::OPERATION_LOAD, addr, access_info.effective_virt, reg_from_bytes(len, bytes));
 }
 
 void mmu_t::store_slow_path_intrapage(reg_t len, const uint8_t* bytes, mem_access_info_t access_info, bool actually_store)
@@ -276,7 +276,7 @@ void mmu_t::store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes, xlate_f
 {
   auto access_info = generate_access_info(addr, STORE, xlate_flags);
   if (actually_store)
-    check_triggers(triggers::OPERATION_STORE, addr, reg_from_bytes(len, bytes));
+    check_triggers(triggers::OPERATION_STORE, addr, access_info.effective_virt, reg_from_bytes(len, bytes));
 
   if (addr & (len - 1)) {
     bool gva = access_info.effective_virt;
diff --git a/riscv/mmu.h b/riscv/mmu.h
index 2f93863261..5a4835c3ce 100644
--- a/riscv/mmu.h
+++ b/riscv/mmu.h
@@ -391,7 +391,7 @@ class mmu_t
   bool mmio_store(reg_t paddr, size_t len, const uint8_t* bytes);
   bool mmio(reg_t paddr, size_t len, uint8_t* bytes, access_type type);
   bool mmio_ok(reg_t paddr, access_type type);
-  void check_triggers(triggers::operation_t operation, reg_t address, std::optional<reg_t> data = std::nullopt);
+  void check_triggers(triggers::operation_t operation, reg_t address, bool virt, std::optional<reg_t> data = std::nullopt);
   reg_t translate(mem_access_info_t access_info, reg_t len);
 
   reg_t pte_load(reg_t pte_paddr, reg_t addr, bool virt, access_type trap_type, size_t ptesize) {
diff --git a/riscv/processor.cc b/riscv/processor.cc
index 330bd30c7b..0ccb651700 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -885,7 +885,7 @@ void processor_t::take_trap(trap_t& t, reg_t epc)
   }
 }
 
-void processor_t::take_trigger_action(triggers::action_t action, reg_t breakpoint_tval, reg_t epc)
+void processor_t::take_trigger_action(triggers::action_t action, reg_t breakpoint_tval, reg_t epc, bool virt)
 {
   if (debug) {
     std::stringstream s; // first put everything in a string, later send it to output
diff --git a/riscv/processor.h b/riscv/processor.h
index 8117568c68..1b74cc2707 100644
--- a/riscv/processor.h
+++ b/riscv/processor.h
@@ -331,7 +331,7 @@ class processor_t : public abstract_device_t
   void take_pending_interrupt() { take_interrupt(state.mip->read() & state.mie->read()); }
   void take_interrupt(reg_t mask); // take first enabled interrupt in mask
   void take_trap(trap_t& t, reg_t epc); // take an exception
-  void take_trigger_action(triggers::action_t action, reg_t breakpoint_tval, reg_t epc);
+  void take_trigger_action(triggers::action_t action, reg_t breakpoint_tval, reg_t epc, bool virt);
   void disasm(insn_t insn); // disassemble and print an instruction
   int paddr_bits();
 
diff --git a/riscv/triggers.h b/riscv/triggers.h
index 6e3d74d8c1..aeda4d581a 100644
--- a/riscv/triggers.h
+++ b/riscv/triggers.h
@@ -54,12 +54,13 @@ struct match_result_t {
 class matched_t
 {
   public:
-    matched_t(triggers::operation_t operation, reg_t address, action_t action) :
-      operation(operation), address(address), action(action) {}
+    matched_t(triggers::operation_t operation, reg_t address, action_t action, bool gva) :
+      operation(operation), address(address), action(action), gva(gva) {}
 
     triggers::operation_t operation;
     reg_t address;
     action_t action;
+    bool gva;
 };
 
 class trigger_t {

From a30a0d63677151cc688fa4e0a05ac664e63d94f4 Mon Sep 17 00:00:00 2001
From: rbuchner <ryan.buchner@arilinc.com>
Date: Mon, 1 May 2023 09:13:47 -0700
Subject: [PATCH 024/110] Use passed in virtual bit for creating traps in
 take_trigger_action() rather than state.v

Fixes case 1 from https://github.com/riscv-software-src/riscv-isa-sim/issues/872
---
 riscv/processor.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/riscv/processor.cc b/riscv/processor.cc
index 0ccb651700..74a0b8fea3 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -899,7 +899,7 @@ void processor_t::take_trigger_action(triggers::action_t action, reg_t breakpoin
       enter_debug_mode(DCSR_CAUSE_HWBP);
       break;
     case triggers::ACTION_DEBUG_EXCEPTION: {
-      trap_breakpoint trap(state.v, breakpoint_tval);
+      trap_breakpoint trap(virt, breakpoint_tval);
       take_trap(trap, epc);
       break;
     }

From 7dbd0949c957704a402e37af3798038f2b41a360 Mon Sep 17 00:00:00 2001
From: Wojciech Bartczak <wbartczak@marvell.com>
Date: Tue, 2 May 2023 06:34:51 -0700
Subject: [PATCH 025/110] Add Spike's meta files for pkg-config

This commit adds the *.pc files for Spike's simulation library,
enabling dynamic and static linking without the need to directly
reference Spike sources. Using Spike as a stand-alone library
provides an interesting option for developing tools
and applications based on Spike.
---
 Makefile.in         |  2 +-
 configure           |  3 +++
 configure.ac        |  1 +
 disasm/disasm.mk.in |  2 ++
 fesvr/fesvr.mk.in   |  2 ++
 riscv-riscv.pc.in   | 11 +++++++++++
 riscv/riscv.mk.in   |  2 ++
 7 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 riscv-riscv.pc.in

diff --git a/Makefile.in b/Makefile.in
index c3e1822d3b..c922e849bd 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -337,7 +337,7 @@ install_hdrs += $$(addprefix $(src_dir)/$(1)/, $$($(2)_install_hdrs))
 install_libs += $$(if $$($(2)_install_lib),lib$(1).a,)
 install_libs += $$(if $$($(2)_install_shared_lib),lib$(1).so,)
 install_exes += $$($(2)_install_prog_exes)
-install_pcs  += $$(if $$($(2)_install_lib),riscv-$(1).pc,)
+install_pcs  += $$(if $$($(2)_install_pcs),riscv-$(1).pc,)
 
 endef
 
diff --git a/configure b/configure
index 419132f75a..93a4302e90 100755
--- a/configure
+++ b/configure
@@ -6362,6 +6362,8 @@ ac_config_files="$ac_config_files riscv-fesvr.pc"
 
 ac_config_files="$ac_config_files riscv-disasm.pc"
 
+ac_config_files="$ac_config_files riscv-riscv.pc"
+
 cat >confcache <<\_ACEOF
 # This file is a shell script that caches the results of configure
 # tests run on this system so they can be shared between configure
@@ -7065,6 +7067,7 @@ do
     "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;;
     "riscv-fesvr.pc") CONFIG_FILES="$CONFIG_FILES riscv-fesvr.pc" ;;
     "riscv-disasm.pc") CONFIG_FILES="$CONFIG_FILES riscv-disasm.pc" ;;
+    "riscv-riscv.pc") CONFIG_FILES="$CONFIG_FILES riscv-riscv.pc" ;;
 
   *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;
   esac
diff --git a/configure.ac b/configure.ac
index 13797a0b2e..1b46578b00 100644
--- a/configure.ac
+++ b/configure.ac
@@ -123,4 +123,5 @@ AC_CONFIG_HEADERS([config.h])
 AC_CONFIG_FILES([Makefile])
 AC_CONFIG_FILES([riscv-fesvr.pc])
 AC_CONFIG_FILES([riscv-disasm.pc])
+AC_CONFIG_FILES([riscv-riscv.pc])
 AC_OUTPUT
diff --git a/disasm/disasm.mk.in b/disasm/disasm.mk.in
index 9eafb12f9b..445c430f55 100644
--- a/disasm/disasm.mk.in
+++ b/disasm/disasm.mk.in
@@ -3,3 +3,5 @@ disasm_srcs = \
   regnames.cc \
 
 disasm_install_lib = yes
+
+disasm_install_pcs = yes
diff --git a/fesvr/fesvr.mk.in b/fesvr/fesvr.mk.in
index e0d143f312..f4b154d8db 100644
--- a/fesvr/fesvr.mk.in
+++ b/fesvr/fesvr.mk.in
@@ -19,6 +19,8 @@ fesvr_install_config_hdr = yes
 
 fesvr_install_lib = yes
 
+fesvr_install_pcs = yes
+
 fesvr_srcs = \
   elfloader.cc \
   htif.cc \
diff --git a/riscv-riscv.pc.in b/riscv-riscv.pc.in
new file mode 100644
index 0000000000..852ac830ec
--- /dev/null
+++ b/riscv-riscv.pc.in
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: riscv-riscv
+Description: RISC-V
+Version: git
+Libs: -Wl,-rpath,${libdir} -L${libdir} -lriscv
+Cflags: -I${includedir}
+URL: http://riscv.org/download.html#tab_spike
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index 561e197112..55fadc0258 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -11,6 +11,8 @@ riscv_CFLAGS = -fPIC
 
 riscv_install_shared_lib = yes
 
+riscv_install_pcs = yes
+
 riscv_install_prog_srcs = \
 
 riscv_install_hdrs = \

From 4dfdf00f13333ae9f59f2c98a1e56bfc33572701 Mon Sep 17 00:00:00 2001
From: YenHaoChen <howard25336284@gmail.com>
Date: Mon, 22 May 2023 15:02:56 +0800
Subject: [PATCH 026/110] Let mstatus.MPP initially be M-mode if U-mode is
 unsupported

This commit makes the initial value of mstatus.MPP valid when U-mode is
not supported. Without this commit, an mret executed without first
setting MPP to M-mode may result in a corrupted state (when U-mode is
not supported).
---
 riscv/csrs.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/riscv/csrs.cc b/riscv/csrs.cc
index 2e01983b64..396f42fedc 100644
--- a/riscv/csrs.cc
+++ b/riscv/csrs.cc
@@ -510,6 +510,7 @@ reg_t mstatus_csr_t::compute_mstatus_initial_value() const noexcept {
                               | (proc->extension_enabled_const('S') ? MSTATUS_SBE : 0)
                               | MSTATUS_MBE;
   return 0
+         | set_field((reg_t)0, MSTATUS_MPP, proc->extension_enabled_const('U') ? PRV_U : PRV_M)
          | (proc->extension_enabled_const('U') && (proc->get_const_xlen() != 32) ? set_field((reg_t)0, MSTATUS_UXL, xlen_to_uxl(proc->get_const_xlen())) : 0)
          | (proc->extension_enabled_const('S') && (proc->get_const_xlen() != 32) ? set_field((reg_t)0, MSTATUS_SXL, xlen_to_uxl(proc->get_const_xlen())) : 0)
          | (proc->get_mmu()->is_target_big_endian() ? big_endian_bits : 0)

From fb57d7ce040fbc07197ee76e2afc46cb42cce5e3 Mon Sep 17 00:00:00 2001
From: Atul Khare <atulkhare@rivosinc.com>
Date: Mon, 22 May 2023 14:10:47 -0700
Subject: [PATCH 027/110] Add prev_v to processor state

This adds the prev_v field to track the previous virtual mode state. We
also assign it unconditionally to handle cases for trigger matching
like the following (pointed out by Scott Johnson):

    1) SRET from HS to VU: prev_v is set to 0
    2) Trap from VU to VS: state.v/prev_v won't be assigned because of
       unchanged v, and remain 0.
    3) An etrigger that's set to break on a VU-mode trap won't match
       properly because prev_v is incorrect

This will be used in a forthcoming patch for trigger matching.
---
 riscv/processor.cc | 23 +++++++++++------------
 riscv/processor.h  |  1 +
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/riscv/processor.cc b/riscv/processor.cc
index 74a0b8fea3..fce2c5c107 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -197,7 +197,7 @@ void state_t::reset(processor_t* const proc, reg_t max_isa)
   auto xlen = proc->get_isa().get_max_xlen();
 
   prv = PRV_M;
-  v = false;
+  v = prev_v = false;
   csrmap[CSR_MISA] = misa = std::make_shared<misa_csr_t>(proc, CSR_MISA, max_isa);
   mstatus = std::make_shared<mstatus_csr_t>(proc, CSR_MSTATUS);
 
@@ -747,17 +747,16 @@ void processor_t::set_virt(bool virt)
   if (state.prv == PRV_M)
     return;
 
-  if (state.v != virt) {
-    /*
-     * Ideally, we should flush TLB here but we don't need it because
-     * set_virt() is always used in conjucter with set_privilege() and
-     * set_privilege() will flush TLB unconditionally.
-     *
-     * The virtualized sstatus register also relies on this TLB flush,
-     * since changing V might change sstatus.MXR and sstatus.SUM.
-     */
-    state.v = virt;
-  }
+  /*
+    * Ideally, we should flush TLB here but we don't need it because
+    * set_virt() is always used in conjucter with set_privilege() and
+    * set_privilege() will flush TLB unconditionally.
+    *
+    * The virtualized sstatus register also relies on this TLB flush,
+    * since changing V might change sstatus.MXR and sstatus.SUM.
+    */
+  state.prev_v = state.v;
+  state.v = virt;
 }
 
 void processor_t::enter_debug_mode(uint8_t cause)
diff --git a/riscv/processor.h b/riscv/processor.h
index 1b74cc2707..93e10f35c1 100644
--- a/riscv/processor.h
+++ b/riscv/processor.h
@@ -84,6 +84,7 @@ struct state_t
   std::unordered_map<reg_t, csr_t_p> csrmap;
   reg_t prv;    // TODO: Can this be an enum instead?
   bool v;
+  bool prev_v;
   misa_csr_t_p misa;
   mstatus_csr_t_p mstatus;
   csr_t_p mstatush;

From ddae0f25a89f2d04d76a222505d00c39dc511505 Mon Sep 17 00:00:00 2001
From: Atul Khare <atulkhare@rivosinc.com>
Date: Mon, 22 May 2023 14:16:03 -0700
Subject: [PATCH 028/110] Add prev_prv to processor state

This adds the prev_prv field to track the previous privilege. It will
be used in a forthcoming patch for trigger matching.
---
 riscv/processor.cc | 3 ++-
 riscv/processor.h  | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/riscv/processor.cc b/riscv/processor.cc
index fce2c5c107..23284b8743 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -196,7 +196,7 @@ void state_t::reset(processor_t* const proc, reg_t max_isa)
   // mstatus_csr_t::unlogged_write()):
   auto xlen = proc->get_isa().get_max_xlen();
 
-  prv = PRV_M;
+  prv = prev_prv = PRV_M;
   v = prev_v = false;
   csrmap[CSR_MISA] = misa = std::make_shared<misa_csr_t>(proc, CSR_MISA, max_isa);
   mstatus = std::make_shared<mstatus_csr_t>(proc, CSR_MSTATUS);
@@ -717,6 +717,7 @@ reg_t processor_t::legalize_privilege(reg_t prv)
 void processor_t::set_privilege(reg_t prv)
 {
   mmu->flush_tlb();
+  state.prev_prv = state.prv;
   state.prv = legalize_privilege(prv);
 }
 
diff --git a/riscv/processor.h b/riscv/processor.h
index 93e10f35c1..34354c22d7 100644
--- a/riscv/processor.h
+++ b/riscv/processor.h
@@ -83,6 +83,7 @@ struct state_t
   // control and status registers
   std::unordered_map<reg_t, csr_t_p> csrmap;
   reg_t prv;    // TODO: Can this be an enum instead?
+  reg_t prev_prv;
   bool v;
   bool prev_v;
   misa_csr_t_p misa;

From 31f5ede662303183d93f80e869379e49b7a01608 Mon Sep 17 00:00:00 2001
From: Atul Khare <atulkhare@rivosinc.com>
Date: Wed, 17 May 2023 12:40:01 -0700
Subject: [PATCH 029/110] Enhance mode_match() functionality

The current version of mode_match() is based on the current privilege
level. This adds explicit privilege and virtual mode parameters in
anticipation of an upcoming patch for matching trap triggers.
---
 riscv/triggers.cc | 11 ++++++-----
 riscv/triggers.h  |  2 +-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/riscv/triggers.cc b/riscv/triggers.cc
index 86dcc81a11..51dcf18824 100644
--- a/riscv/triggers.cc
+++ b/riscv/triggers.cc
@@ -56,15 +56,16 @@ void trigger_t::tdata3_write(processor_t * const proc, const reg_t val) noexcept
 }
 
 bool trigger_t::common_match(processor_t * const proc) const noexcept {
-  return mode_match(proc->get_state()) && textra_match(proc);
+  auto state = proc->get_state();
+  return mode_match(state->prv, state->v) && textra_match(proc);
 }
 
-bool trigger_t::mode_match(state_t * const state) const noexcept
+bool trigger_t::mode_match(reg_t prv, bool v) const noexcept
 {
-  switch (state->prv) {
+  switch (prv) {
     case PRV_M: return m;
-    case PRV_S: return state->v ? vs : s;
-    case PRV_U: return state->v ? vu : u;
+    case PRV_S: return v ? vs : s;
+    case PRV_U: return v ? vu : u;
     default: assert(false);
   }
 }
diff --git a/riscv/triggers.h b/riscv/triggers.h
index aeda4d581a..94e7e5cee2 100644
--- a/riscv/triggers.h
+++ b/riscv/triggers.h
@@ -102,7 +102,7 @@ class trigger_t {
 
 private:
   unsigned legalize_mhselect(bool h_enabled) const noexcept;
-  bool mode_match(state_t * const state) const noexcept;
+  bool mode_match(reg_t prv, bool v) const noexcept;
   bool textra_match(processor_t * const proc) const noexcept;
 
   struct mhselect_interpretation {

From d9e30bb6970c3e387d719b3fd0808937889ca3a6 Mon Sep 17 00:00:00 2001
From: Atul Khare <atulkhare@rivosinc.com>
Date: Tue, 16 May 2023 15:11:44 -0700
Subject: [PATCH 030/110] triggers: Fix etrigger match on exceptions

The etrigger match on exceptions doesn't work properly in cases like
the following:

1) M-mode delegates ECALLs to S-mode
2) A CPU hardware breakpoint mechanism is used to place a breakpoint on
   the Umode instruction that executes the ECALL from Umode to Smode. In
   effect, this creates a breakpoint etrigger based on Umode.

In the above, the expectation is that #2 will first cause an exit to
the Smode handler (stvec), and the hardware breakpoint exception will
be triggered following an entry into the handler.

However, since etrigger currently checks the current privilege mode, we
will never get a match on conditions like #2.

The patch attempts to address the issue by using the stashed version of
the previous privilege mode for the etrigger match.

cc: YenHaoChen <howard25336284@gmail.com>
Signed-off-by: Atul Khare <atulkhare@rivosinc.com>
---
 riscv/triggers.cc | 9 ++++++---
 riscv/triggers.h  | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/riscv/triggers.cc b/riscv/triggers.cc
index 51dcf18824..65ba4c9b0b 100644
--- a/riscv/triggers.cc
+++ b/riscv/triggers.cc
@@ -55,9 +55,11 @@ void trigger_t::tdata3_write(processor_t * const proc, const reg_t val) noexcept
   sselect = (sselect_t)((proc->extension_enabled_const('S') && get_field(val, CSR_TEXTRA_SSELECT(xlen)) <= SSELECT_MAXVAL) ? get_field(val, CSR_TEXTRA_SSELECT(xlen)) : SSELECT_IGNORE);
 }
 
-bool trigger_t::common_match(processor_t * const proc) const noexcept {
+bool trigger_t::common_match(processor_t * const proc, bool use_prev_prv) const noexcept {
   auto state = proc->get_state();
-  return mode_match(state->prv, state->v) && textra_match(proc);
+  auto prv = use_prev_prv ? state->prev_prv : state->prv;
+  auto v = use_prev_prv ? state->prev_v : state->v;
+  return mode_match(prv, v) && textra_match(proc);
 }
 
 bool trigger_t::mode_match(reg_t prv, bool v) const noexcept
@@ -398,7 +400,8 @@ void itrigger_t::tdata1_write(processor_t * const proc, const reg_t val, const b
 
 std::optional<match_result_t> trap_common_t::detect_trap_match(processor_t * const proc, const trap_t& t) noexcept
 {
-  if (!common_match(proc))
+  // Use the previous privilege for matching
+  if (!common_match(proc, true))
     return std::nullopt;
 
   auto xlen = proc->get_xlen();
diff --git a/riscv/triggers.h b/riscv/triggers.h
index 94e7e5cee2..0bf6097a99 100644
--- a/riscv/triggers.h
+++ b/riscv/triggers.h
@@ -90,7 +90,7 @@ class trigger_t {
 
 protected:
   static action_t legalize_action(reg_t val, reg_t action_mask, reg_t dmode_mask) noexcept;
-  bool common_match(processor_t * const proc) const noexcept;
+  bool common_match(processor_t * const proc, bool use_prev_prv = false) const noexcept;
   bool allow_action(const state_t * const state) const;
   reg_t tdata2;
 

From e910707051c4e10889f58229443bf9d41652ed7b Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Thu, 25 May 2023 14:33:52 -0700
Subject: [PATCH 031/110] Use more descriptive variable name in
 dcsr_csr_t::read; make it reg_t

---
 riscv/csrs.cc | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/riscv/csrs.cc b/riscv/csrs.cc
index 396f42fedc..9a165eba3e 100644
--- a/riscv/csrs.cc
+++ b/riscv/csrs.cc
@@ -1244,18 +1244,18 @@ void dcsr_csr_t::verify_permissions(insn_t insn, bool write) const {
 }
 
 reg_t dcsr_csr_t::read() const noexcept {
-  uint32_t v = 0;
-  v = set_field(v, DCSR_XDEBUGVER, 1);
-  v = set_field(v, DCSR_EBREAKM, ebreakm);
-  v = set_field(v, DCSR_EBREAKH, ebreakh);
-  v = set_field(v, DCSR_EBREAKS, ebreaks);
-  v = set_field(v, DCSR_EBREAKU, ebreaku);
-  v = set_field(v, DCSR_STOPCYCLE, 0);
-  v = set_field(v, DCSR_STOPTIME, 0);
-  v = set_field(v, DCSR_CAUSE, cause);
-  v = set_field(v, DCSR_STEP, step);
-  v = set_field(v, DCSR_PRV, prv);
-  return v;
+  reg_t result = 0;
+  result = set_field(result, DCSR_XDEBUGVER, 1);
+  result = set_field(result, DCSR_EBREAKM, ebreakm);
+  result = set_field(result, DCSR_EBREAKH, ebreakh);
+  result = set_field(result, DCSR_EBREAKS, ebreaks);
+  result = set_field(result, DCSR_EBREAKU, ebreaku);
+  result = set_field(result, DCSR_STOPCYCLE, 0);
+  result = set_field(result, DCSR_STOPTIME, 0);
+  result = set_field(result, DCSR_CAUSE, cause);
+  result = set_field(result, DCSR_STEP, step);
+  result = set_field(result, DCSR_PRV, prv);
+  return result;
 }
 
 bool dcsr_csr_t::unlogged_write(const reg_t val) noexcept {

From d99efb545cca366a159dc1dedcbbd08fa2b3b8cf Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Wed, 24 May 2023 13:51:34 -0700
Subject: [PATCH 032/110] Implement dcsr.v and make DRET use it

Resolves #1365
---
 riscv/csrs.cc      | 8 +++++++-
 riscv/csrs.h       | 3 ++-
 riscv/insns/dret.h | 1 +
 riscv/processor.cc | 2 +-
 4 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/riscv/csrs.cc b/riscv/csrs.cc
index 9a165eba3e..95b5e22348 100644
--- a/riscv/csrs.cc
+++ b/riscv/csrs.cc
@@ -13,6 +13,8 @@
 #include "trap.h"
 // For require():
 #include "insn_macros.h"
+// For CSR_DCSR_V:
+#include "debug_defines.h"
 
 // STATE macro used by require_privilege() macro:
 #undef STATE
@@ -1234,6 +1236,7 @@ dcsr_csr_t::dcsr_csr_t(processor_t* const proc, const reg_t addr):
   ebreaks(false),
   ebreaku(false),
   halt(false),
+  v(false),
   cause(0) {
 }
 
@@ -1255,6 +1258,7 @@ reg_t dcsr_csr_t::read() const noexcept {
   result = set_field(result, DCSR_CAUSE, cause);
   result = set_field(result, DCSR_STEP, step);
   result = set_field(result, DCSR_PRV, prv);
+  result = set_field(result, CSR_DCSR_V, v);
   return result;
 }
 
@@ -1267,12 +1271,14 @@ bool dcsr_csr_t::unlogged_write(const reg_t val) noexcept {
   ebreaks = get_field(val, DCSR_EBREAKS);
   ebreaku = get_field(val, DCSR_EBREAKU);
   halt = get_field(val, DCSR_HALT);
+  v = proc->extension_enabled('H') ? get_field(val, CSR_DCSR_V) : false;
   return true;
 }
 
-void dcsr_csr_t::write_cause_and_prv(uint8_t cause, reg_t prv) noexcept {
+void dcsr_csr_t::write_cause_and_prv(uint8_t cause, reg_t prv, bool v) noexcept {
   this->cause = cause;
   this->prv = prv;
+  this->v = v;
   log_write();
 }
 
diff --git a/riscv/csrs.h b/riscv/csrs.h
index 65be799320..19aefca139 100644
--- a/riscv/csrs.h
+++ b/riscv/csrs.h
@@ -656,7 +656,7 @@ class dcsr_csr_t: public csr_t {
   dcsr_csr_t(processor_t* const proc, const reg_t addr);
   virtual void verify_permissions(insn_t insn, bool write) const override;
   virtual reg_t read() const noexcept override;
-  void write_cause_and_prv(uint8_t cause, reg_t prv) noexcept;
+  void write_cause_and_prv(uint8_t cause, reg_t prv, bool v) noexcept;
  protected:
   virtual bool unlogged_write(const reg_t val) noexcept override;
  public:
@@ -667,6 +667,7 @@ class dcsr_csr_t: public csr_t {
   bool ebreaks;
   bool ebreaku;
   bool halt;
+  bool v;
   uint8_t cause;
 };
 
diff --git a/riscv/insns/dret.h b/riscv/insns/dret.h
index 56ce25bce6..0540c519a2 100644
--- a/riscv/insns/dret.h
+++ b/riscv/insns/dret.h
@@ -1,6 +1,7 @@
 require(STATE.debug_mode);
 set_pc_and_serialize(STATE.dpc->read());
 p->set_privilege(STATE.dcsr->prv);
+p->set_virt(STATE.dcsr->v);
 if (STATE.prv < PRV_M)
   STATE.mstatus->write(STATE.mstatus->read() & ~MSTATUS_MPRV);
 
diff --git a/riscv/processor.cc b/riscv/processor.cc
index 23284b8743..3d13f4e44a 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -763,7 +763,7 @@ void processor_t::set_virt(bool virt)
 void processor_t::enter_debug_mode(uint8_t cause)
 {
   state.debug_mode = true;
-  state.dcsr->write_cause_and_prv(cause, state.prv);
+  state.dcsr->write_cause_and_prv(cause, state.prv, state.v);
   set_privilege(PRV_M);
   state.dpc->write(state.pc);
   state.pc = DEBUG_ROM_ENTRY;

From e7f677b57fed9f753c16b48c255a7067424d6e69 Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Wed, 24 May 2023 15:25:09 -0700
Subject: [PATCH 033/110] Prevent possibility of V=1 and PRV=M when entering
 debug mode

---
 riscv/processor.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/riscv/processor.cc b/riscv/processor.cc
index 3d13f4e44a..36452aac17 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -764,6 +764,7 @@ void processor_t::enter_debug_mode(uint8_t cause)
 {
   state.debug_mode = true;
   state.dcsr->write_cause_and_prv(cause, state.prv, state.v);
+  set_virt(false);
   set_privilege(PRV_M);
   state.dpc->write(state.pc);
   state.pc = DEBUG_ROM_ENTRY;

From 4e509bfbc60e28eefef07f1c65aeca3b8d89615f Mon Sep 17 00:00:00 2001
From: Scott Johnson <scott.johnson@arilinc.com>
Date: Wed, 24 May 2023 14:57:21 -0700
Subject: [PATCH 034/110] Redo sret to put set_virt/set_privilege together

---
 riscv/insns/sret.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/riscv/insns/sret.h b/riscv/insns/sret.h
index 5102c15cc3..82c238bab2 100644
--- a/riscv/insns/sret.h
+++ b/riscv/insns/sret.h
@@ -14,14 +14,15 @@ s = set_field(s, MSTATUS_SIE, get_field(s, MSTATUS_SPIE));
 s = set_field(s, MSTATUS_SPIE, 1);
 s = set_field(s, MSTATUS_SPP, PRV_U);
 STATE.sstatus->write(s);
-p->set_privilege(prev_prv);
+bool prev_virt = STATE.v;
 if (!STATE.v) {
   if (p->extension_enabled('H')) {
-    reg_t prev_virt = get_field(prev_hstatus, HSTATUS_SPV);
-    p->set_virt(prev_virt);
+    prev_virt = get_field(prev_hstatus, HSTATUS_SPV);
     reg_t new_hstatus = set_field(prev_hstatus, HSTATUS_SPV, 0);
     STATE.hstatus->write(new_hstatus);
   }
 
   STATE.mstatus->write(set_field(STATE.mstatus->read(), MSTATUS_MPRV, 0));
 }
+p->set_privilege(prev_prv);
+p->set_virt(prev_virt);

From 5ab7691a49623ea419c3dd946720ffc92f43c686 Mon Sep 17 00:00:00 2001
From: Scott Johnson <scott.johnson@arilinc.com>
Date: Wed, 24 May 2023 15:04:13 -0700
Subject: [PATCH 035/110] Force V=1 when going to VS-mode trap handler

Should already be 1.
---
 riscv/processor.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/riscv/processor.cc b/riscv/processor.cc
index 36452aac17..7139069bce 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -832,6 +832,7 @@ void processor_t::take_trap(trap_t& t, reg_t epc)
     s = set_field(s, MSTATUS_SPP, state.prv);
     s = set_field(s, MSTATUS_SIE, 0);
     state.sstatus->write(s);
+    set_virt(true);
     set_privilege(PRV_S);
   } else if (state.prv <= PRV_S && bit < max_xlen && ((hsdeleg >> bit) & 1)) {
     // Handle the trap in HS-mode

From 505ddebefffed9f0e5b6dbf644719bc33adaa396 Mon Sep 17 00:00:00 2001
From: Scott Johnson <scott.johnson@arilinc.com>
Date: Wed, 24 May 2023 15:28:11 -0700
Subject: [PATCH 036/110] Explicitly use the nonvirtual S-mode CSRs when going
 to HS-mode

Since we're going to move the change to state->v next.
---
 riscv/processor.cc | 24 ++++++++++++------------
 riscv/processor.h  |  7 +++++++
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/riscv/processor.cc b/riscv/processor.cc
index 7139069bce..f0e8ebfebc 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -327,10 +327,10 @@ void state_t::reset(processor_t* const proc, reg_t max_isa)
   mcounteren = std::make_shared<masked_csr_t>(proc, CSR_MCOUNTEREN, counteren_mask, 0);
   if (proc->extension_enabled_const('U')) csrmap[CSR_MCOUNTEREN] = mcounteren;
   csrmap[CSR_SCOUNTEREN] = scounteren = std::make_shared<masked_csr_t>(proc, CSR_SCOUNTEREN, counteren_mask, 0);
-  auto nonvirtual_sepc = std::make_shared<epc_csr_t>(proc, CSR_SEPC);
+  nonvirtual_sepc = std::make_shared<epc_csr_t>(proc, CSR_SEPC);
   csrmap[CSR_VSEPC] = vsepc = std::make_shared<epc_csr_t>(proc, CSR_VSEPC);
   csrmap[CSR_SEPC] = sepc = std::make_shared<virtualized_csr_t>(proc, nonvirtual_sepc, vsepc);
-  auto nonvirtual_stval = std::make_shared<basic_csr_t>(proc, CSR_STVAL, 0);
+  nonvirtual_stval = std::make_shared<basic_csr_t>(proc, CSR_STVAL, 0);
   csrmap[CSR_VSTVAL] = vstval = std::make_shared<basic_csr_t>(proc, CSR_VSTVAL, 0);
   csrmap[CSR_STVAL] = stval = std::make_shared<virtualized_csr_t>(proc, nonvirtual_stval, vstval);
   auto sscratch = std::make_shared<basic_csr_t>(proc, CSR_SSCRATCH, 0);
@@ -338,13 +338,13 @@ void state_t::reset(processor_t* const proc, reg_t max_isa)
   // Note: if max_isa does not include H, we don't really need this virtualized_csr_t at all (though it doesn't hurt):
   csrmap[CSR_SSCRATCH] = std::make_shared<virtualized_csr_t>(proc, sscratch, vsscratch);
   csrmap[CSR_VSSCRATCH] = vsscratch;
-  auto nonvirtual_stvec = std::make_shared<tvec_csr_t>(proc, CSR_STVEC);
+  nonvirtual_stvec = std::make_shared<tvec_csr_t>(proc, CSR_STVEC);
   csrmap[CSR_VSTVEC] = vstvec = std::make_shared<tvec_csr_t>(proc, CSR_VSTVEC);
   csrmap[CSR_STVEC] = stvec = std::make_shared<virtualized_csr_t>(proc, nonvirtual_stvec, vstvec);
   auto nonvirtual_satp = std::make_shared<satp_csr_t>(proc, CSR_SATP);
   csrmap[CSR_VSATP] = vsatp = std::make_shared<base_atp_csr_t>(proc, CSR_VSATP);
   csrmap[CSR_SATP] = satp = std::make_shared<virtualized_satp_csr_t>(proc, nonvirtual_satp, vsatp);
-  auto nonvirtual_scause = std::make_shared<cause_csr_t>(proc, CSR_SCAUSE);
+  nonvirtual_scause = std::make_shared<cause_csr_t>(proc, CSR_SCAUSE);
   csrmap[CSR_VSCAUSE] = vscause = std::make_shared<cause_csr_t>(proc, CSR_VSCAUSE);
   csrmap[CSR_SCAUSE] = scause = std::make_shared<virtualized_csr_t>(proc, nonvirtual_scause, vscause);
   csrmap[CSR_MTVAL2] = mtval2 = std::make_shared<hypervisor_csr_t>(proc, CSR_MTVAL2);
@@ -382,7 +382,7 @@ void state_t::reset(processor_t* const proc, reg_t max_isa)
   csrmap[CSR_HTVAL] = htval = std::make_shared<basic_csr_t>(proc, CSR_HTVAL, 0);
   csrmap[CSR_HTINST] = htinst = std::make_shared<basic_csr_t>(proc, CSR_HTINST, 0);
   csrmap[CSR_HGATP] = hgatp = std::make_shared<hgatp_csr_t>(proc, CSR_HGATP);
-  auto nonvirtual_sstatus = std::make_shared<sstatus_proxy_csr_t>(proc, CSR_SSTATUS, mstatus);
+  nonvirtual_sstatus = std::make_shared<sstatus_proxy_csr_t>(proc, CSR_SSTATUS, mstatus);
   csrmap[CSR_VSSTATUS] = vsstatus = std::make_shared<vsstatus_csr_t>(proc, CSR_VSSTATUS);
   csrmap[CSR_SSTATUS] = sstatus = std::make_shared<sstatus_csr_t>(proc, nonvirtual_sstatus, vsstatus);
 
@@ -837,19 +837,19 @@ void processor_t::take_trap(trap_t& t, reg_t epc)
   } else if (state.prv <= PRV_S && bit < max_xlen && ((hsdeleg >> bit) & 1)) {
     // Handle the trap in HS-mode
     set_virt(false);
-    reg_t vector = (state.stvec->read() & 1) && interrupt ? 4 * bit : 0;
-    state.pc = (state.stvec->read() & ~(reg_t)1) + vector;
-    state.scause->write(t.cause());
-    state.sepc->write(epc);
-    state.stval->write(t.get_tval());
+    reg_t vector = (state.nonvirtual_stvec->read() & 1) && interrupt ? 4 * bit : 0;
+    state.pc = (state.nonvirtual_stvec->read() & ~(reg_t)1) + vector;
+    state.nonvirtual_scause->write(t.cause());
+    state.nonvirtual_sepc->write(epc);
+    state.nonvirtual_stval->write(t.get_tval());
     state.htval->write(t.get_tval2());
     state.htinst->write(t.get_tinst());
 
-    reg_t s = state.sstatus->read();
+    reg_t s = state.nonvirtual_sstatus->read();
     s = set_field(s, MSTATUS_SPIE, get_field(s, MSTATUS_SIE));
     s = set_field(s, MSTATUS_SPP, state.prv);
     s = set_field(s, MSTATUS_SIE, 0);
-    state.sstatus->write(s);
+    state.nonvirtual_sstatus->write(s);
     if (extension_enabled('H')) {
       s = state.hstatus->read();
       if (curr_virt)
diff --git a/riscv/processor.h b/riscv/processor.h
index 34354c22d7..2623014583 100644
--- a/riscv/processor.h
+++ b/riscv/processor.h
@@ -110,6 +110,13 @@ struct state_t
   virtualized_csr_t_p satp;
   csr_t_p scause;
 
+  // When taking a trap into HS-mode, we must access the nonvirtualized HS-mode CSRs directly:
+  csr_t_p nonvirtual_stvec;
+  csr_t_p nonvirtual_scause;
+  csr_t_p nonvirtual_sepc;
+  csr_t_p nonvirtual_stval;
+  sstatus_proxy_csr_t_p nonvirtual_sstatus;
+
   csr_t_p mtval2;
   csr_t_p mtinst;
   csr_t_p hstatus;

From 87bf9900832003ab2001158ca5d34e226e492df2 Mon Sep 17 00:00:00 2001
From: Scott Johnson <scott.johnson@arilinc.com>
Date: Wed, 24 May 2023 15:05:08 -0700
Subject: [PATCH 037/110] Move setting of V=0 for HS-mode trap

So it's right next to set_privilege() which it will be combined with
next.
---
 riscv/processor.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/riscv/processor.cc b/riscv/processor.cc
index f0e8ebfebc..6fffc31f45 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -836,7 +836,6 @@ void processor_t::take_trap(trap_t& t, reg_t epc)
     set_privilege(PRV_S);
   } else if (state.prv <= PRV_S && bit < max_xlen && ((hsdeleg >> bit) & 1)) {
     // Handle the trap in HS-mode
-    set_virt(false);
     reg_t vector = (state.nonvirtual_stvec->read() & 1) && interrupt ? 4 * bit : 0;
     state.pc = (state.nonvirtual_stvec->read() & ~(reg_t)1) + vector;
     state.nonvirtual_scause->write(t.cause());
@@ -858,6 +857,7 @@ void processor_t::take_trap(trap_t& t, reg_t epc)
       s = set_field(s, HSTATUS_GVA, t.has_gva());
       state.hstatus->write(s);
     }
+    set_virt(false);
     set_privilege(PRV_S);
   } else {
     // Handle the trap in M-mode

From 0abf98f6f6d5b23b3d23f0c1af0b77b95a199bfe Mon Sep 17 00:00:00 2001
From: Scott Johnson <scott.johnson@arilinc.com>
Date: Wed, 24 May 2023 15:05:38 -0700
Subject: [PATCH 038/110] Move setting of V=0 for M-mode trap

So it's right next to set_privilege() which it will be combined with next.
---
 riscv/processor.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/riscv/processor.cc b/riscv/processor.cc
index 6fffc31f45..4bcf41f423 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -861,7 +861,6 @@ void processor_t::take_trap(trap_t& t, reg_t epc)
     set_privilege(PRV_S);
   } else {
     // Handle the trap in M-mode
-    set_virt(false);
     const reg_t vector = (state.mtvec->read() & 1) && interrupt ? 4 * bit : 0;
     const reg_t trap_handler_address = (state.mtvec->read() & ~(reg_t)1) + vector;
     // RNMI exception vector is implementation-defined.  Since we don't model
@@ -883,6 +882,7 @@ void processor_t::take_trap(trap_t& t, reg_t epc)
     s = set_field(s, MSTATUS_GVA, t.has_gva());
     state.mstatus->write(s);
     if (state.mstatush) state.mstatush->write(s >> 32);  // log mstatush change
+    set_virt(false);
     set_privilege(PRV_M);
   }
 }

From 2c6b94e853cc1ee0c51a46859cf92451c0c19491 Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Wed, 24 May 2023 14:07:14 -0700
Subject: [PATCH 039/110] Refactor set_privilege to subsume set_virt

This cleans up the code and avoids bugs like #1365.
---
 riscv/insns/dret.h  |  3 +--
 riscv/insns/mnret.h |  3 +--
 riscv/insns/mret.h  |  3 +--
 riscv/insns/sret.h  |  3 +--
 riscv/processor.cc  | 35 +++++++----------------------------
 riscv/processor.h   |  3 +--
 6 files changed, 12 insertions(+), 38 deletions(-)

diff --git a/riscv/insns/dret.h b/riscv/insns/dret.h
index 0540c519a2..2abcc7d9ac 100644
--- a/riscv/insns/dret.h
+++ b/riscv/insns/dret.h
@@ -1,7 +1,6 @@
 require(STATE.debug_mode);
 set_pc_and_serialize(STATE.dpc->read());
-p->set_privilege(STATE.dcsr->prv);
-p->set_virt(STATE.dcsr->v);
+p->set_privilege(STATE.dcsr->prv, STATE.dcsr->v);
 if (STATE.prv < PRV_M)
   STATE.mstatus->write(STATE.mstatus->read() & ~MSTATUS_MPRV);
 
diff --git a/riscv/insns/mnret.h b/riscv/insns/mnret.h
index bc69510620..30f108188b 100644
--- a/riscv/insns/mnret.h
+++ b/riscv/insns/mnret.h
@@ -11,5 +11,4 @@ if (prev_prv != PRV_M) {
 }
 s = set_field(s, MNSTATUS_NMIE, 1);
 STATE.mnstatus->write(s);
-p->set_privilege(prev_prv);
-p->set_virt(prev_virt);
+p->set_privilege(prev_prv, prev_virt);
diff --git a/riscv/insns/mret.h b/riscv/insns/mret.h
index 5198b8fc64..f5f86a2b6b 100644
--- a/riscv/insns/mret.h
+++ b/riscv/insns/mret.h
@@ -10,5 +10,4 @@ s = set_field(s, MSTATUS_MPIE, 1);
 s = set_field(s, MSTATUS_MPP, p->extension_enabled('U') ? PRV_U : PRV_M);
 s = set_field(s, MSTATUS_MPV, 0);
 p->put_csr(CSR_MSTATUS, s);
-p->set_privilege(prev_prv);
-p->set_virt(prev_virt);
+p->set_privilege(prev_prv, prev_virt);
diff --git a/riscv/insns/sret.h b/riscv/insns/sret.h
index 82c238bab2..4c7305d098 100644
--- a/riscv/insns/sret.h
+++ b/riscv/insns/sret.h
@@ -24,5 +24,4 @@ if (!STATE.v) {
 
   STATE.mstatus->write(set_field(STATE.mstatus->read(), MSTATUS_MPRV, 0));
 }
-p->set_privilege(prev_prv);
-p->set_virt(prev_virt);
+p->set_privilege(prev_prv, prev_virt);
diff --git a/riscv/processor.cc b/riscv/processor.cc
index 4bcf41f423..ef985a18e4 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -714,11 +714,13 @@ reg_t processor_t::legalize_privilege(reg_t prv)
   return prv;
 }
 
-void processor_t::set_privilege(reg_t prv)
+void processor_t::set_privilege(reg_t prv, bool virt)
 {
   mmu->flush_tlb();
   state.prev_prv = state.prv;
+  state.prev_v = state.v;
   state.prv = legalize_privilege(prv);
+  state.v = virt && state.prv != PRV_M;
 }
 
 const char* processor_t::get_privilege_string()
@@ -741,31 +743,11 @@ const char* processor_t::get_privilege_string()
   abort();
 }
 
-void processor_t::set_virt(bool virt)
-{
-  reg_t tmp, mask;
-
-  if (state.prv == PRV_M)
-    return;
-
-  /*
-    * Ideally, we should flush TLB here but we don't need it because
-    * set_virt() is always used in conjucter with set_privilege() and
-    * set_privilege() will flush TLB unconditionally.
-    *
-    * The virtualized sstatus register also relies on this TLB flush,
-    * since changing V might change sstatus.MXR and sstatus.SUM.
-    */
-  state.prev_v = state.v;
-  state.v = virt;
-}
-
 void processor_t::enter_debug_mode(uint8_t cause)
 {
   state.debug_mode = true;
   state.dcsr->write_cause_and_prv(cause, state.prv, state.v);
-  set_virt(false);
-  set_privilege(PRV_M);
+  set_privilege(PRV_M, false);
   state.dpc->write(state.pc);
   state.pc = DEBUG_ROM_ENTRY;
   in_wfi = false;
@@ -832,8 +814,7 @@ void processor_t::take_trap(trap_t& t, reg_t epc)
     s = set_field(s, MSTATUS_SPP, state.prv);
     s = set_field(s, MSTATUS_SIE, 0);
     state.sstatus->write(s);
-    set_virt(true);
-    set_privilege(PRV_S);
+    set_privilege(PRV_S, true);
   } else if (state.prv <= PRV_S && bit < max_xlen && ((hsdeleg >> bit) & 1)) {
     // Handle the trap in HS-mode
     reg_t vector = (state.nonvirtual_stvec->read() & 1) && interrupt ? 4 * bit : 0;
@@ -857,8 +838,7 @@ void processor_t::take_trap(trap_t& t, reg_t epc)
       s = set_field(s, HSTATUS_GVA, t.has_gva());
       state.hstatus->write(s);
     }
-    set_virt(false);
-    set_privilege(PRV_S);
+    set_privilege(PRV_S, false);
   } else {
     // Handle the trap in M-mode
     const reg_t vector = (state.mtvec->read() & 1) && interrupt ? 4 * bit : 0;
@@ -882,8 +862,7 @@ void processor_t::take_trap(trap_t& t, reg_t epc)
     s = set_field(s, MSTATUS_GVA, t.has_gva());
     state.mstatus->write(s);
     if (state.mstatush) state.mstatush->write(s >> 32);  // log mstatush change
-    set_virt(false);
-    set_privilege(PRV_M);
+    set_privilege(PRV_M, false);
   }
 }
 
diff --git a/riscv/processor.h b/riscv/processor.h
index 2623014583..1b00808977 100644
--- a/riscv/processor.h
+++ b/riscv/processor.h
@@ -269,8 +269,7 @@ class processor_t : public abstract_device_t
       throw trap_instruction_address_misaligned(state.v, pc, 0, 0);
   }
   reg_t legalize_privilege(reg_t);
-  void set_privilege(reg_t);
-  void set_virt(bool);
+  void set_privilege(reg_t, bool);
   const char* get_privilege_string();
   void update_histogram(reg_t pc);
   const disassembler_t* get_disassembler() { return disassembler; }

From 4d3847f0ff293cbf82049bf4bd87bbc431b74347 Mon Sep 17 00:00:00 2001
From: Gianluca Guida <gianluca@rivosinc.com>
Date: Thu, 25 May 2023 14:27:00 +0100
Subject: [PATCH 040/110] decode_macros: move 'is_aligned' from
 'v_ext_macros.h'

---
 riscv/decode_macros.h | 4 ++++
 riscv/v_ext_macros.h  | 5 -----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/riscv/decode_macros.h b/riscv/decode_macros.h
index fee8ae7fae..439123594a 100644
--- a/riscv/decode_macros.h
+++ b/riscv/decode_macros.h
@@ -120,6 +120,10 @@ do { \
               if (rm > 4) throw trap_illegal_instruction(insn.bits()); \
               rm; })
 
+static inline bool is_aligned(const unsigned val, const unsigned pos)
+{
+  return pos ? (val & (pos - 1)) == 0 : true;
+}
 
 #define require_privilege(p) require(STATE.prv >= (p))
 #define require_novirt() (unlikely(STATE.v) ? throw trap_virtual_instruction(insn.bits()) : (void) 0)
diff --git a/riscv/v_ext_macros.h b/riscv/v_ext_macros.h
index e00b0c0a3e..8b0d0fde11 100644
--- a/riscv/v_ext_macros.h
+++ b/riscv/v_ext_macros.h
@@ -64,11 +64,6 @@ static inline bool is_overlapped_widen(const int astart, int asize,
   }
 }
 
-static inline bool is_aligned(const unsigned val, const unsigned pos)
-{
-  return pos ? (val & (pos - 1)) == 0 : true;
-}
-
 #define VI_NARROW_CHECK_COMMON \
   require_vector(true); \
   require(P.VU.vflmul <= 4); \

From b3a3063b57644bde79703febf4d87b52519ab663 Mon Sep 17 00:00:00 2001
From: Gianluca Guida <gianluca@rivosinc.com>
Date: Thu, 25 May 2023 13:09:30 +0100
Subject: [PATCH 041/110] config.h: Add HAVE_INT128

Modify configure.ac so that the generated config.h header exposes whether
128-bit integers (__int128_t) are supported.
---
 config.h.in  | 3 +++
 configure    | 3 +++
 configure.ac | 4 +++-
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/config.h.in b/config.h.in
index f6755a1dd8..95514fddbc 100644
--- a/config.h.in
+++ b/config.h.in
@@ -39,6 +39,9 @@
 /* Dynamic library loading is supported */
 #undef HAVE_DLOPEN
 
+/* __int128_t is supported */
+#undef HAVE_INT128
+
 /* Define to 1 if you have the <inttypes.h> header file. */
 #undef HAVE_INTTYPES_H
 
diff --git a/configure b/configure
index 93a4302e90..73e5203089 100755
--- a/configure
+++ b/configure
@@ -4742,6 +4742,9 @@ ac_fn_cxx_check_type "$LINENO" "__int128_t" "ac_cv_type___int128_t" "$ac_include
 if test "x$ac_cv_type___int128_t" = xyes; then :
   HAVE_INT128=yes
 
+
+$as_echo "#define HAVE_INT128 1" >>confdefs.h
+
 fi
 
 
diff --git a/configure.ac b/configure.ac
index 1b46578b00..701bd99481 100644
--- a/configure.ac
+++ b/configure.ac
@@ -85,7 +85,9 @@ AC_HEADER_STDC
 # Checks for type
 #-------------------------------------------------------------------------
 
-AC_CHECK_TYPE([__int128_t], AC_SUBST([HAVE_INT128],[yes]))
+AC_CHECK_TYPE([__int128_t],
+              [AC_SUBST([HAVE_INT128],[yes])
+              AC_DEFINE([HAVE_INT128], [1], [__int128_t is supported])])
 
 #-------------------------------------------------------------------------
 # Default compiler flags

From 3010cb4175c5b3f485e6ce7e749afa0667f71dbe Mon Sep 17 00:00:00 2001
From: Gianluca Guida <gianluca@rivosinc.com>
Date: Thu, 25 May 2023 14:39:01 +0100
Subject: [PATCH 042/110] Use HAVE_INT128 instead of __SIZEOF_INT128__

Make sure that the configure decision on 128-bit is consistent during
compilation.

Also move the int128_t/uint128_t typedefs from decode.h to decode_macros.h.
---
 riscv/decode.h        | 5 -----
 riscv/decode_macros.h | 5 +++++
 riscv/processor.cc    | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/riscv/decode.h b/riscv/decode.h
index a55b06946b..dad32a1e31 100644
--- a/riscv/decode.h
+++ b/riscv/decode.h
@@ -19,11 +19,6 @@ typedef int64_t sreg_t;
 typedef uint64_t reg_t;
 typedef float128_t freg_t;
 
-#ifdef __SIZEOF_INT128__
-typedef __int128 int128_t;
-typedef unsigned __int128 uint128_t;
-#endif
-
 const int NXPR = 32;
 const int NFPR = 32;
 const int NVPR = 32;
diff --git a/riscv/decode_macros.h b/riscv/decode_macros.h
index fee8ae7fae..6dcf4fa46e 100644
--- a/riscv/decode_macros.h
+++ b/riscv/decode_macros.h
@@ -10,6 +10,11 @@
 #include "softfloat_types.h"
 #include "specialize.h"
 
+#ifdef HAVE_INT128
+typedef __int128 int128_t;
+typedef unsigned __int128 uint128_t;
+#endif
+
 // helpful macros, etc
 #define MMU (*p->get_mmu())
 #define STATE (*p->get_state())
diff --git a/riscv/processor.cc b/riscv/processor.cc
index ef985a18e4..e5ae60b4fd 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -42,7 +42,7 @@ processor_t::processor_t(const isa_parser_t *isa, const cfg_t *cfg,
   VU.p = this;
   TM.proc = this;
 
-#ifndef __SIZEOF_INT128__
+#ifndef HAVE_INT128
   if (extension_enabled('V')) {
     fprintf(stderr, "V extension is not supported on platforms without __int128 type\n");
     abort();

From c231e0c9f6dcf3e9fe0637724d8672560d62c58e Mon Sep 17 00:00:00 2001
From: Gianluca Guida <gianluca@rivosinc.com>
Date: Thu, 25 May 2023 14:43:57 +0100
Subject: [PATCH 043/110] Fix check for extension

Calling 'extension_enabled' this early during the constructor of
'processor_t' causes SIGSEGV.
---
 riscv/processor.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/riscv/processor.cc b/riscv/processor.cc
index e5ae60b4fd..a75b0ff6f1 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -43,7 +43,7 @@ processor_t::processor_t(const isa_parser_t *isa, const cfg_t *cfg,
   TM.proc = this;
 
 #ifndef HAVE_INT128
-  if (extension_enabled('V')) {
+  if (isa->extension_enabled('V')) {
     fprintf(stderr, "V extension is not supported on platforms without __int128 type\n");
     abort();
   }

From afe3987685f11058b28988ac9d7e484368246937 Mon Sep 17 00:00:00 2001
From: Weiwei Li <liweiwei@iscas.ac.cn>
Date: Fri, 14 Apr 2023 22:35:12 +0800
Subject: [PATCH 044/110] Add conversion functions between bfloat16 and
 float32 in softfloat

---
 softfloat/bf16_to_f32.c       |  80 ++++++++++++++++++++++++
 softfloat/f32_to_bf16.c       |  92 +++++++++++++++++++++++++++
 softfloat/internals.h         |   6 ++
 softfloat/s_roundPackToBF16.c | 113 ++++++++++++++++++++++++++++++++++
 softfloat/softfloat.h         |   2 +
 softfloat/softfloat.mk.in     |   3 +
 softfloat/softfloat_types.h   |   1 +
 softfloat/specialize.h        |  19 ++++++
 8 files changed, 316 insertions(+)
 create mode 100644 softfloat/bf16_to_f32.c
 create mode 100644 softfloat/f32_to_bf16.c
 create mode 100644 softfloat/s_roundPackToBF16.c

diff --git a/softfloat/bf16_to_f32.c b/softfloat/bf16_to_f32.c
new file mode 100644
index 0000000000..7e49002915
--- /dev/null
+++ b/softfloat/bf16_to_f32.c
@@ -0,0 +1,80 @@
+
+/*============================================================================
+
+This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
+Package, Release 3d, by John R. Hauser.
+
+Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the
+University of California.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions, and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions, and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ 3. Neither the name of the University nor the names of its contributors may
+    be used to endorse or promote products derived from this software without
+    specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
+DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=============================================================================*/
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "platform.h"
+#include "internals.h"
+#include "specialize.h"
+#include "softfloat.h"
+
+float32_t bf16_to_f32( bfloat16_t a )
+{
+    union ui16_f16 uA;
+    uint_fast16_t uiA;
+    bool sign;
+    int_fast16_t exp;
+    uint_fast16_t frac;
+    struct commonNaN commonNaN;
+    uint_fast32_t uiZ;
+    union ui32_f32 uZ;
+
+    /*------------------------------------------------------------------------
+    *------------------------------------------------------------------------*/
+    uA.f = a;
+    uiA = uA.ui;
+    sign = signBF16UI( uiA );
+    exp  = expBF16UI( uiA );
+    frac = fracBF16UI( uiA );
+    /*------------------------------------------------------------------------
+    *------------------------------------------------------------------------*/
+    if ( exp == 0xFF ) {
+        if ( frac ) {
+            softfloat_bf16UIToCommonNaN( uiA, &commonNaN );
+            uiZ = softfloat_commonNaNToF32UI( &commonNaN );
+        } else {
+            uiZ = packToF32UI( sign, 0xFF, 0 );
+        }
+        goto uiZ;
+    }
+    /*------------------------------------------------------------------------
+    *------------------------------------------------------------------------*/
+    uiZ = packToF32UI( sign, exp, (uint_fast32_t) frac<<16 );
+ uiZ:
+    uZ.ui = uiZ;
+    return uZ.f;
+
+}
diff --git a/softfloat/f32_to_bf16.c b/softfloat/f32_to_bf16.c
new file mode 100644
index 0000000000..92a2e6d850
--- /dev/null
+++ b/softfloat/f32_to_bf16.c
@@ -0,0 +1,92 @@
+
+/*============================================================================
+
+This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
+Package, Release 3d, by John R. Hauser.
+
+Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of
+California.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions, and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions, and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ 3. Neither the name of the University nor the names of its contributors may
+    be used to endorse or promote products derived from this software without
+    specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
+DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=============================================================================*/
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "platform.h"
+#include "internals.h"
+#include "specialize.h"
+#include "softfloat.h"
+
+bfloat16_t f32_to_bf16( float32_t a )
+{
+    union ui32_f32 uA;
+    uint_fast32_t uiA;
+    bool sign;
+    int_fast16_t exp;
+    uint_fast32_t frac;
+    struct commonNaN commonNaN;
+    struct exp16_sig32 normExpSig;
+    uint_fast16_t uiZ, frac16;
+    union ui16_f16 uZ;
+
+    /*------------------------------------------------------------------------
+    *------------------------------------------------------------------------*/
+    uA.f = a;
+    uiA = uA.ui;
+    sign = signF32UI( uiA );
+    exp  = expF32UI( uiA );
+    frac = fracF32UI( uiA );
+    /*------------------------------------------------------------------------
+    *------------------------------------------------------------------------*/
+    if ( exp == 0xFF ) {
+        if ( frac ) {
+            softfloat_f32UIToCommonNaN( uiA, &commonNaN );
+            uiZ = softfloat_commonNaNToBF16UI( &commonNaN );
+        } else {
+            uiZ = packToBF16UI( sign, 0xFF, 0 );
+        }
+        goto uiZ;
+    }
+    /*------------------------------------------------------------------------
+    *------------------------------------------------------------------------*/
+    if ( ! (exp | frac) ) {
+        uiZ = packToBF16UI( sign, 0, 0 );
+        goto uiZ;
+    } else if ( !exp ) {
+        normExpSig = softfloat_normSubnormalF32Sig( frac );
+        exp = normExpSig.exp;
+        frac = normExpSig.sig;
+    }
+    frac16 = frac>>9 | ((frac & 0x1FF) != 0);
+    /*------------------------------------------------------------------------
+    *------------------------------------------------------------------------*/
+    return softfloat_roundPackToBF16( sign, exp - 1, frac16 | 0x4000 );
+ uiZ:
+    uZ.ui = uiZ;
+    return uZ.f;
+
+}
diff --git a/softfloat/internals.h b/softfloat/internals.h
index 55585e967e..ae94427f07 100644
--- a/softfloat/internals.h
+++ b/softfloat/internals.h
@@ -89,6 +89,11 @@ int_fast64_t softfloat_roundMToI64( bool, uint32_t *, uint_fast8_t, bool );
 #define fracF16UI( a ) ((a) & 0x03FF)
 #define packToF16UI( sign, exp, sig ) (((uint16_t) (sign)<<15) + ((uint16_t) (exp)<<10) + (sig))
 
+#define signBF16UI( a ) ((bool) ((uint16_t) (a)>>15))
+#define expBF16UI( a ) ((int_fast16_t) ((a)>>7) & 0xFF)
+#define fracBF16UI( a ) ((a) & 0x07F)
+#define packToBF16UI( sign, exp, sig ) (((uint16_t) (sign)<<15) + ((uint16_t) (exp)<<7) + (sig))
+
 #define isNaNF16UI( a ) (((~(a) & 0x7C00) == 0) && ((a) & 0x03FF))
 
 struct exp8_sig16 { int_fast8_t exp; uint_fast16_t sig; };
@@ -103,6 +108,7 @@ float16_t
  softfloat_mulAddF16(
      uint_fast16_t, uint_fast16_t, uint_fast16_t, uint_fast8_t );
 
+bfloat16_t softfloat_roundPackToBF16( bool, int_fast16_t, uint_fast16_t );
 /*----------------------------------------------------------------------------
 *----------------------------------------------------------------------------*/
 #define signF32UI( a ) ((bool) ((uint32_t) (a)>>31))
diff --git a/softfloat/s_roundPackToBF16.c b/softfloat/s_roundPackToBF16.c
new file mode 100644
index 0000000000..f3d0b75f43
--- /dev/null
+++ b/softfloat/s_roundPackToBF16.c
@@ -0,0 +1,113 @@
+
+/*============================================================================
+
+This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
+Package, Release 3d, by John R. Hauser.
+
+Copyright 2011, 2012, 2013, 2014, 2015, 2017 The Regents of the University of
+California.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions, and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions, and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ 3. Neither the name of the University nor the names of its contributors may
+    be used to endorse or promote products derived from this software without
+    specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
+DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=============================================================================*/
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "platform.h"
+#include "internals.h"
+#include "softfloat.h"
+
+bfloat16_t
+ softfloat_roundPackToBF16( bool sign, int_fast16_t exp, uint_fast16_t sig )
+{
+    uint_fast8_t roundingMode;
+    bool roundNearEven;
+    uint_fast8_t roundIncrement, roundBits;
+    bool isTiny;
+    uint_fast16_t uiZ;
+    union ui16_f16 uZ;
+
+    /*------------------------------------------------------------------------
+    *------------------------------------------------------------------------*/
+    roundingMode = softfloat_roundingMode;
+    roundNearEven = (roundingMode == softfloat_round_near_even);
+    roundIncrement = 0x40;
+    if ( ! roundNearEven && (roundingMode != softfloat_round_near_maxMag) ) {
+        roundIncrement =
+            (roundingMode
+                 == (sign ? softfloat_round_min : softfloat_round_max))
+                ? 0x7F
+                : 0;
+    }
+    roundBits = sig & 0x7F;
+    /*------------------------------------------------------------------------
+    *------------------------------------------------------------------------*/
+    if ( 0xFD <= (unsigned int) exp ) {
+        if ( exp < 0 ) {
+            /*----------------------------------------------------------------
+            *----------------------------------------------------------------*/
+            isTiny =
+                (softfloat_detectTininess == softfloat_tininess_beforeRounding)
+                    || (exp < -1) || (sig + roundIncrement < 0x8000);
+            sig = softfloat_shiftRightJam32( sig, -exp );
+            exp = 0;
+            roundBits = sig & 0x7F;
+            if ( isTiny && roundBits ) {
+                softfloat_raiseFlags( softfloat_flag_underflow );
+            }
+        } else if ( (0xFD < exp) || (0x8000 <= sig + roundIncrement) ) {
+            /*----------------------------------------------------------------
+            *----------------------------------------------------------------*/
+            softfloat_raiseFlags(
+                softfloat_flag_overflow | softfloat_flag_inexact );
+            uiZ = packToBF16UI( sign, 0xFF, 0 ) - ! roundIncrement;
+            goto uiZ;
+        }
+    }
+    /*------------------------------------------------------------------------
+    *------------------------------------------------------------------------*/
+    sig = (sig + roundIncrement)>>7;
+    if ( roundBits ) {
+        softfloat_exceptionFlags |= softfloat_flag_inexact;
+#ifdef SOFTFLOAT_ROUND_ODD
+        if ( roundingMode == softfloat_round_odd ) {
+            sig |= 1;
+            goto packReturn;
+        }
+#endif
+    }
+    sig &= ~(uint_fast16_t) (! (roundBits ^ 0x40) & roundNearEven);
+    if ( ! sig ) exp = 0;
+    /*------------------------------------------------------------------------
+    *------------------------------------------------------------------------*/
+ packReturn:
+    uiZ = packToBF16UI( sign, exp, sig );
+ uiZ:
+    uZ.ui = uiZ;
+    return uZ.f;
+
+}
+
diff --git a/softfloat/softfloat.h b/softfloat/softfloat.h
index bdac1be263..eb78d74de7 100644
--- a/softfloat/softfloat.h
+++ b/softfloat/softfloat.h
@@ -154,6 +154,7 @@ uint_fast64_t f16_to_ui64_r_minMag( float16_t, bool );
 int_fast32_t f16_to_i32_r_minMag( float16_t, bool );
 int_fast64_t f16_to_i64_r_minMag( float16_t, bool );
 float32_t f16_to_f32( float16_t );
+float32_t bf16_to_f32( bfloat16_t );
 float64_t f16_to_f64( float16_t );
 #ifdef SOFTFLOAT_FAST_INT64
 extFloat80_t f16_to_extF80( float16_t );
@@ -196,6 +197,7 @@ uint_fast64_t f32_to_ui64_r_minMag( float32_t, bool );
 int_fast32_t f32_to_i32_r_minMag( float32_t, bool );
 int_fast64_t f32_to_i64_r_minMag( float32_t, bool );
 float16_t f32_to_f16( float32_t );
+bfloat16_t f32_to_bf16( float32_t );
 float64_t f32_to_f64( float32_t );
 #ifdef SOFTFLOAT_FAST_INT64
 extFloat80_t f32_to_extF80( float32_t );
diff --git a/softfloat/softfloat.mk.in b/softfloat/softfloat.mk.in
index e7f4a3e415..9c780ac751 100644
--- a/softfloat/softfloat.mk.in
+++ b/softfloat/softfloat.mk.in
@@ -45,6 +45,7 @@ softfloat_c_srcs = \
 	f16_sqrt.c \
 	f16_sub.c \
 	f16_to_f128.c \
+	bf16_to_f32.c \
 	f16_to_f32.c \
 	f16_to_f64.c \
 	f16_to_i8.c \
@@ -76,6 +77,7 @@ softfloat_c_srcs = \
 	f32_sqrt.c \
 	f32_sub.c \
 	f32_to_f128.c \
+	f32_to_bf16.c \
 	f32_to_f16.c \
 	f32_to_f64.c \
 	f32_to_i16.c \
@@ -181,6 +183,7 @@ softfloat_c_srcs = \
 	s_roundMToUI64.c \
 	s_roundPackMToI64.c \
 	s_roundPackMToUI64.c \
+	s_roundPackToBF16.c \
 	s_roundPackToF128.c \
 	s_roundPackToF16.c \
 	s_roundPackToF32.c \
diff --git a/softfloat/softfloat_types.h b/softfloat/softfloat_types.h
index af1888f9b9..34c518f438 100644
--- a/softfloat/softfloat_types.h
+++ b/softfloat/softfloat_types.h
@@ -48,6 +48,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | (typically 'float' and 'double', and possibly 'long double').
 *----------------------------------------------------------------------------*/
 typedef struct { uint16_t v; } float16_t;
+typedef float16_t bfloat16_t;
 typedef struct { uint32_t v; } float32_t;
 typedef struct { uint64_t v; } float64_t;
 typedef struct { uint64_t v[2]; } float128_t;
diff --git a/softfloat/specialize.h b/softfloat/specialize.h
index 19504b6b62..fb3761d7c7 100644
--- a/softfloat/specialize.h
+++ b/softfloat/specialize.h
@@ -98,6 +98,11 @@ struct commonNaN { char _unused; };
 *----------------------------------------------------------------------------*/
 #define defaultNaNF16UI 0x7E00
 
+/*----------------------------------------------------------------------------
+| The bit pattern for a default generated BF16 (bfloat16) floating-point NaN.
+*----------------------------------------------------------------------------*/
+#define defaultNaNBF16UI 0x7FC0
+
 /*----------------------------------------------------------------------------
 | Returns true when 16-bit unsigned integer `uiA' has the bit pattern of a
 | 16-bit floating-point signaling NaN.
@@ -113,6 +118,20 @@ struct commonNaN { char _unused; };
 *----------------------------------------------------------------------------*/
 #define softfloat_f16UIToCommonNaN( uiA, zPtr ) if ( ! ((uiA) & 0x0200) ) (void) (zPtr), softfloat_raiseFlags( softfloat_flag_invalid )
 
+/*----------------------------------------------------------------------------
+| Assuming `uiA' has the bit pattern of a BF16 (bfloat16) floating-point NaN, converts
+| this NaN to the common NaN form, and stores the resulting common NaN at the
+| location pointed to by `zPtr'.  If the NaN is a signaling NaN, the invalid
+| exception is raised.
+*----------------------------------------------------------------------------*/
+#define softfloat_bf16UIToCommonNaN( uiA, zPtr ) if ( ! ((uiA) & 0x040) ) (void) (zPtr), softfloat_raiseFlags( softfloat_flag_invalid )
+
+/*----------------------------------------------------------------------------
+| Converts the common NaN pointed to by `aPtr' into a BF16 (bfloat16) floating-point
+| NaN, and returns the bit pattern of this value as an unsigned integer.
+*----------------------------------------------------------------------------*/
+#define softfloat_commonNaNToBF16UI( aPtr ) ((uint_fast16_t) defaultNaNBF16UI)
+
 /*----------------------------------------------------------------------------
 | Converts the common NaN pointed to by `aPtr' into a 16-bit floating-point
 | NaN, and returns the bit pattern of this value as an unsigned integer.

From fecdad2b62cbc4a6baa51d301f0a49d89b81d675 Mon Sep 17 00:00:00 2001
From: Weiwei Li <liweiwei@iscas.ac.cn>
Date: Fri, 14 Apr 2023 22:36:39 +0800
Subject: [PATCH 045/110] Add isa string support for Zfbfmin/Zvfbfmin/Zvfbfwma

---
 riscv/isa_parser.cc | 14 ++++++++++++++
 riscv/isa_parser.h  |  3 +++
 2 files changed, 17 insertions(+)

diff --git a/riscv/isa_parser.cc b/riscv/isa_parser.cc
index 7335a147e4..be5e51b7be 100644
--- a/riscv/isa_parser.cc
+++ b/riscv/isa_parser.cc
@@ -139,6 +139,8 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
     } else if (ext_str == "zdinx") {
       extension_table[EXT_ZFINX] = true;
       extension_table[EXT_ZDINX] = true;
+    } else if (ext_str == "zfbfmin") {
+      extension_table[EXT_ZFBFMIN] = true;
     } else if (ext_str == "zfinx") {
       extension_table[EXT_ZFINX] = true;
     } else if (ext_str == "zhinx") {
@@ -232,6 +234,10 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
       extension_table[EXT_ZICOND] = true;
     } else if (ext_str == "zihpm") {
       extension_table[EXT_ZIHPM] = true;
+    } else if (ext_str == "zvfbfmin") {
+      extension_table[EXT_ZVFBFMIN] = true;
+    } else if (ext_str == "zvfbfwma") {
+      extension_table[EXT_ZVFBFWMA] = true;
     } else if (ext_str == "sstc") {
         extension_table[EXT_SSTC] = true;
     } else if (ext_str[0] == 'x') {
@@ -279,6 +285,14 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
     bad_isa_string(str, ("can't parse: " + std::string(p)).c_str());
   }
 
+  if (extension_table[EXT_ZFBFMIN] && !extension_table['F']) {
+    bad_isa_string(str, "'Zfbfmin' extension requires 'F' extension");
+  }
+
+  if ((extension_table[EXT_ZVFBFMIN] || extension_table[EXT_ZVFBFWMA]) && !extension_table['V']) {
+    bad_isa_string(str, "'Zvfbfmin/Zvfbfwma' extension requires 'V' extension");
+  }
+
   if (extension_table['C']) {
     extension_table[EXT_ZCA] = true;
     if (extension_table['F'] && max_xlen == 32)
diff --git a/riscv/isa_parser.h b/riscv/isa_parser.h
index 9effd164d8..04859b6ec4 100644
--- a/riscv/isa_parser.h
+++ b/riscv/isa_parser.h
@@ -49,6 +49,7 @@ typedef enum {
   EXT_SVINVAL,
   EXT_ZDINX,
   EXT_ZFA,
+  EXT_ZFBFMIN,
   EXT_ZFINX,
   EXT_ZHINX,
   EXT_ZHINXMIN,
@@ -57,6 +58,8 @@ typedef enum {
   EXT_ZICNTR,
   EXT_ZICOND,
   EXT_ZIHPM,
+  EXT_ZVFBFMIN,
+  EXT_ZVFBFWMA,
   EXT_XZBP,
   EXT_XZBS,
   EXT_XZBE,

From c12d0782173ba00531bd48f653238d81cb9c3484 Mon Sep 17 00:00:00 2001
From: Weiwei Li <liweiwei@iscas.ac.cn>
Date: Fri, 14 Apr 2023 22:38:49 +0800
Subject: [PATCH 046/110] Update encoding.h to add instructions for BF16
 extensions

---
 riscv/encoding.h | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/riscv/encoding.h b/riscv/encoding.h
index 48cb5c0058..e39f535ceb 100644
--- a/riscv/encoding.h
+++ b/riscv/encoding.h
@@ -4,7 +4,7 @@
 
 /*
  * This file is auto-generated by running 'make' in
- * https://github.com/riscv/riscv-opcodes (5adef50)
+ * https://github.com/riscv/riscv-opcodes (8d70e77)
  */
 
 #ifndef RISCV_CSR_ENCODING_H
@@ -751,6 +751,8 @@
 #define MASK_FCLASS_Q 0xfff0707f
 #define MATCH_FCLASS_S 0xe0001053
 #define MASK_FCLASS_S 0xfff0707f
+#define MATCH_FCVT_BF16_S 0x44800053
+#define MASK_FCVT_BF16_S 0xfff0007f
 #define MATCH_FCVT_D_H 0x42200053
 #define MASK_FCVT_D_H 0xfff0007f
 #define MATCH_FCVT_D_L 0xd2200053
@@ -809,6 +811,8 @@
 #define MASK_FCVT_Q_W 0xfff0007f
 #define MATCH_FCVT_Q_WU 0xd6100053
 #define MASK_FCVT_Q_WU 0xfff0007f
+#define MATCH_FCVT_S_BF16 0x40600053
+#define MASK_FCVT_S_BF16 0xfff0007f
 #define MATCH_FCVT_S_D 0x40100053
 #define MASK_FCVT_S_D 0xfff0007f
 #define MATCH_FCVT_S_H 0x40200053
@@ -2165,6 +2169,8 @@
 #define MASK_VFNCVT_X_F_W 0xfc0ff07f
 #define MATCH_VFNCVT_XU_F_W 0x48081057
 #define MASK_VFNCVT_XU_F_W 0xfc0ff07f
+#define MATCH_VFNCVTBF16_F_F_W 0x480e9057
+#define MASK_VFNCVTBF16_F_F_W 0xfc0ff07f
 #define MATCH_VFNMACC_VF 0xb4005057
 #define MASK_VFNMACC_VF 0xfc00707f
 #define MATCH_VFNMACC_VV 0xb4001057
@@ -2241,10 +2247,16 @@
 #define MASK_VFWCVT_X_F_V 0xfc0ff07f
 #define MATCH_VFWCVT_XU_F_V 0x48041057
 #define MASK_VFWCVT_XU_F_V 0xfc0ff07f
+#define MATCH_VFWCVTBF16_F_F_V 0x48069057
+#define MASK_VFWCVTBF16_F_F_V 0xfc0ff07f
 #define MATCH_VFWMACC_VF 0xf0005057
 #define MASK_VFWMACC_VF 0xfc00707f
 #define MATCH_VFWMACC_VV 0xf0001057
 #define MASK_VFWMACC_VV 0xfc00707f
+#define MATCH_VFWMACCBF16_VF 0xec005057
+#define MASK_VFWMACCBF16_VF 0xfc00707f
+#define MATCH_VFWMACCBF16_VV 0xec001057
+#define MASK_VFWMACCBF16_VV 0xfc00707f
 #define MATCH_VFWMSAC_VF 0xf8005057
 #define MASK_VFWMSAC_VF 0xfc00707f
 #define MATCH_VFWMSAC_VV 0xf8001057
@@ -3392,8 +3404,11 @@
 #define INSN_FIELD_AMOOP 0xf8000000
 #define INSN_FIELD_NF 0xe0000000
 #define INSN_FIELD_SIMM5 0xf8000
+#define INSN_FIELD_ZIMM5 0xf8000
 #define INSN_FIELD_ZIMM10 0x3ff00000
 #define INSN_FIELD_ZIMM11 0x7ff00000
+#define INSN_FIELD_ZIMM6HI 0x4000000
+#define INSN_FIELD_ZIMM6LO 0xf8000
 #define INSN_FIELD_C_NZUIMM10 0x1fe0
 #define INSN_FIELD_C_UIMM7LO 0x60
 #define INSN_FIELD_C_UIMM7HI 0x1c00
@@ -3636,6 +3651,7 @@ DECLARE_INSN(fclass_d, MATCH_FCLASS_D, MASK_FCLASS_D)
 DECLARE_INSN(fclass_h, MATCH_FCLASS_H, MASK_FCLASS_H)
 DECLARE_INSN(fclass_q, MATCH_FCLASS_Q, MASK_FCLASS_Q)
 DECLARE_INSN(fclass_s, MATCH_FCLASS_S, MASK_FCLASS_S)
+DECLARE_INSN(fcvt_bf16_s, MATCH_FCVT_BF16_S, MASK_FCVT_BF16_S)
 DECLARE_INSN(fcvt_d_h, MATCH_FCVT_D_H, MASK_FCVT_D_H)
 DECLARE_INSN(fcvt_d_l, MATCH_FCVT_D_L, MASK_FCVT_D_L)
 DECLARE_INSN(fcvt_d_lu, MATCH_FCVT_D_LU, MASK_FCVT_D_LU)
@@ -3665,6 +3681,7 @@ DECLARE_INSN(fcvt_q_lu, MATCH_FCVT_Q_LU, MASK_FCVT_Q_LU)
 DECLARE_INSN(fcvt_q_s, MATCH_FCVT_Q_S, MASK_FCVT_Q_S)
 DECLARE_INSN(fcvt_q_w, MATCH_FCVT_Q_W, MASK_FCVT_Q_W)
 DECLARE_INSN(fcvt_q_wu, MATCH_FCVT_Q_WU, MASK_FCVT_Q_WU)
+DECLARE_INSN(fcvt_s_bf16, MATCH_FCVT_S_BF16, MASK_FCVT_S_BF16)
 DECLARE_INSN(fcvt_s_d, MATCH_FCVT_S_D, MASK_FCVT_S_D)
 DECLARE_INSN(fcvt_s_h, MATCH_FCVT_S_H, MASK_FCVT_S_H)
 DECLARE_INSN(fcvt_s_l, MATCH_FCVT_S_L, MASK_FCVT_S_L)
@@ -4343,6 +4360,7 @@ DECLARE_INSN(vfncvt_rtz_x_f_w, MATCH_VFNCVT_RTZ_X_F_W, MASK_VFNCVT_RTZ_X_F_W)
 DECLARE_INSN(vfncvt_rtz_xu_f_w, MATCH_VFNCVT_RTZ_XU_F_W, MASK_VFNCVT_RTZ_XU_F_W)
 DECLARE_INSN(vfncvt_x_f_w, MATCH_VFNCVT_X_F_W, MASK_VFNCVT_X_F_W)
 DECLARE_INSN(vfncvt_xu_f_w, MATCH_VFNCVT_XU_F_W, MASK_VFNCVT_XU_F_W)
+DECLARE_INSN(vfncvtbf16_f_f_w, MATCH_VFNCVTBF16_F_F_W, MASK_VFNCVTBF16_F_F_W)
 DECLARE_INSN(vfnmacc_vf, MATCH_VFNMACC_VF, MASK_VFNMACC_VF)
 DECLARE_INSN(vfnmacc_vv, MATCH_VFNMACC_VV, MASK_VFNMACC_VV)
 DECLARE_INSN(vfnmadd_vf, MATCH_VFNMADD_VF, MASK_VFNMADD_VF)
@@ -4381,8 +4399,11 @@ DECLARE_INSN(vfwcvt_rtz_x_f_v, MATCH_VFWCVT_RTZ_X_F_V, MASK_VFWCVT_RTZ_X_F_V)
 DECLARE_INSN(vfwcvt_rtz_xu_f_v, MATCH_VFWCVT_RTZ_XU_F_V, MASK_VFWCVT_RTZ_XU_F_V)
 DECLARE_INSN(vfwcvt_x_f_v, MATCH_VFWCVT_X_F_V, MASK_VFWCVT_X_F_V)
 DECLARE_INSN(vfwcvt_xu_f_v, MATCH_VFWCVT_XU_F_V, MASK_VFWCVT_XU_F_V)
+DECLARE_INSN(vfwcvtbf16_f_f_v, MATCH_VFWCVTBF16_F_F_V, MASK_VFWCVTBF16_F_F_V)
 DECLARE_INSN(vfwmacc_vf, MATCH_VFWMACC_VF, MASK_VFWMACC_VF)
 DECLARE_INSN(vfwmacc_vv, MATCH_VFWMACC_VV, MASK_VFWMACC_VV)
+DECLARE_INSN(vfwmaccbf16_vf, MATCH_VFWMACCBF16_VF, MASK_VFWMACCBF16_VF)
+DECLARE_INSN(vfwmaccbf16_vv, MATCH_VFWMACCBF16_VV, MASK_VFWMACCBF16_VV)
 DECLARE_INSN(vfwmsac_vf, MATCH_VFWMSAC_VF, MASK_VFWMSAC_VF)
 DECLARE_INSN(vfwmsac_vv, MATCH_VFWMSAC_VV, MASK_VFWMSAC_VV)
 DECLARE_INSN(vfwmul_vf, MATCH_VFWMUL_VF, MASK_VFWMUL_VF)

From 40dce7899b7a42d06413071c542606d4c0249174 Mon Sep 17 00:00:00 2001
From: Weiwei Li <liweiwei@iscas.ac.cn>
Date: Fri, 14 Apr 2023 22:44:31 +0800
Subject: [PATCH 047/110] Add support for new instructions of Zfbfmin extension

---
 riscv/decode_macros.h     |  2 ++
 riscv/insns/fcvt_bf16_s.h |  5 +++++
 riscv/insns/fcvt_s_bf16.h |  5 +++++
 riscv/riscv.mk.in         | 12 ++++++++++--
 4 files changed, 22 insertions(+), 2 deletions(-)
 create mode 100644 riscv/insns/fcvt_bf16_s.h
 create mode 100644 riscv/insns/fcvt_s_bf16.h

diff --git a/riscv/decode_macros.h b/riscv/decode_macros.h
index 6bdd574989..7ba132c196 100644
--- a/riscv/decode_macros.h
+++ b/riscv/decode_macros.h
@@ -74,6 +74,7 @@ typedef unsigned __int128 uint128_t;
 #define FRS2 READ_FREG(insn.rs2())
 #define FRS3 READ_FREG(insn.rs3())
 #define FRS1_H READ_FREG_H(insn.rs1())
+#define FRS1_BF FRS1_H
 #define FRS1_F READ_FREG_F(insn.rs1())
 #define FRS1_D READ_FREG_D(insn.rs1())
 #define FRS2_H READ_FREG_H(insn.rs2())
@@ -95,6 +96,7 @@ do { \
     WRITE_FRD(value); \
   } \
 } while (0)
+#define WRITE_FRD_BF WRITE_FRD_H
 #define WRITE_FRD_F(value) \
 do { \
   if (p->extension_enabled(EXT_ZFINX)) \
diff --git a/riscv/insns/fcvt_bf16_s.h b/riscv/insns/fcvt_bf16_s.h
new file mode 100644
index 0000000000..d625df893d
--- /dev/null
+++ b/riscv/insns/fcvt_bf16_s.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFBFMIN);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD_BF(f32_to_bf16(FRS1_F));
+set_fp_exceptions;
diff --git a/riscv/insns/fcvt_s_bf16.h b/riscv/insns/fcvt_s_bf16.h
new file mode 100644
index 0000000000..59a55cb191
--- /dev/null
+++ b/riscv/insns/fcvt_s_bf16.h
@@ -0,0 +1,5 @@
+require_extension(EXT_ZFBFMIN);
+require_fp;
+softfloat_roundingMode = RM;
+WRITE_FRD_F(bf16_to_f32(FRS1_BF));
+set_fp_exceptions;
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index 55fadc0258..9e49c89d86 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1356,8 +1356,15 @@ riscv_insn_ext_cmo = \
 	cbo_zero \
 
 riscv_insn_ext_zicond = \
-        czero_eqz \
-        czero_nez \
+	czero_eqz \
+	czero_nez \
+
+riscv_insn_ext_zfbfmin = \
+	fcvt_bf16_s \
+	fcvt_s_bf16 \
+
+riscv_insn_ext_bf16 = \
+	$(riscv_insn_ext_zfbfmin) \
 
 riscv_insn_list = \
 	$(riscv_insn_ext_a) \
@@ -1383,6 +1390,7 @@ riscv_insn_list = \
 	$(riscv_insn_smrnmi) \
 	$(riscv_insn_ext_cmo) \
 	$(riscv_insn_ext_zicond) \
+	$(riscv_insn_ext_bf16) \
 
 riscv_gen_srcs = $(addsuffix .cc,$(riscv_insn_list))
 

From 8aacc4effde92122a25beadac594162187767d7e Mon Sep 17 00:00:00 2001
From: Weiwei Li <liweiwei@iscas.ac.cn>
Date: Fri, 14 Apr 2023 22:47:51 +0800
Subject: [PATCH 048/110] Add support for new instructions of Zvfbfmin
 extension

---
 riscv/insns/vfncvtbf16_f_f_w.h |  5 +++++
 riscv/insns/vfwcvtbf16_f_f_v.h |  5 +++++
 riscv/riscv.mk.in              |  5 +++++
 riscv/v_ext_macros.h           | 22 ++++++++++++++++++++++
 4 files changed, 37 insertions(+)
 create mode 100644 riscv/insns/vfncvtbf16_f_f_w.h
 create mode 100644 riscv/insns/vfwcvtbf16_f_f_v.h

diff --git a/riscv/insns/vfncvtbf16_f_f_w.h b/riscv/insns/vfncvtbf16_f_f_w.h
new file mode 100644
index 0000000000..4708802518
--- /dev/null
+++ b/riscv/insns/vfncvtbf16_f_f_w.h
@@ -0,0 +1,5 @@
+// vfncvtbf16.f.f.w vd, vs2, vm
+VI_VFP_NCVT_BF16_TO_FP(
+  { vd = f32_to_bf16(vs2); },           // BODY16
+  { require_extension(EXT_ZVFBFMIN); }  // CHECK16
+)
diff --git a/riscv/insns/vfwcvtbf16_f_f_v.h b/riscv/insns/vfwcvtbf16_f_f_v.h
new file mode 100644
index 0000000000..ee9a59ca92
--- /dev/null
+++ b/riscv/insns/vfwcvtbf16_f_f_v.h
@@ -0,0 +1,5 @@
+// vfwcvtbf16.f.f.v vd, vs2, vm
+VI_VFP_WCVT_FP_TO_BF16(
+  { vd = bf16_to_f32(vs2); },           // BODY16
+  { require_extension(EXT_ZVFBFMIN); }  // CHECK16
+)
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index 9e49c89d86..a83bec280c 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1363,8 +1363,13 @@ riscv_insn_ext_zfbfmin = \
 	fcvt_bf16_s \
 	fcvt_s_bf16 \
 
+riscv_insn_ext_zvfbfmin = \
+	vfncvtbf16_f_f_w \
+	vfwcvtbf16_f_f_v \
+
 riscv_insn_ext_bf16 = \
 	$(riscv_insn_ext_zfbfmin) \
+	$(riscv_insn_ext_zvfbfmin) \
 
 riscv_insn_list = \
 	$(riscv_insn_ext_a) \
diff --git a/riscv/v_ext_macros.h b/riscv/v_ext_macros.h
index 8b0d0fde11..376c3307b9 100644
--- a/riscv/v_ext_macros.h
+++ b/riscv/v_ext_macros.h
@@ -1980,6 +1980,17 @@ reg_t index[P.VU.vlmax]; \
       break; \
   }
 
+#define VI_VFP_WCVT_FP_TO_BF16(BODY, CHECK) \
+  VI_CHECK_DSS(false); \
+  switch (P.VU.vsew) { \
+    case e16: \
+      { VI_VFP_CVT_LOOP(CVT_FP_TO_FP_PARAMS(16, 32), CHECK, BODY); } \
+      break; \
+    default: \
+      require(0); \
+      break; \
+  }
+
 #define VI_VFP_WCVT_INT_TO_FP(BODY8, BODY16, BODY32, \
                               CHECK8, CHECK16, CHECK32, \
                               sign) \
@@ -2030,6 +2041,17 @@ reg_t index[P.VU.vlmax]; \
       break; \
   }
 
+#define VI_VFP_NCVT_BF16_TO_FP(BODY, CHECK) \
+  VI_CHECK_SDS(false); \
+  switch (P.VU.vsew) { \
+    case e16: \
+      { VI_VFP_CVT_LOOP(CVT_FP_TO_FP_PARAMS(32, 16), CHECK, BODY); } \
+      break; \
+    default: \
+      require(0); \
+      break; \
+  }
+
 #define VI_VFP_NCVT_INT_TO_FP(BODY32, BODY64, \
                               CHECK32, CHECK64, \
                               sign) \

From 48f66191758f3bca04e6d7e85348f266df148c14 Mon Sep 17 00:00:00 2001
From: Weiwei Li <liweiwei@iscas.ac.cn>
Date: Fri, 14 Apr 2023 22:49:24 +0800
Subject: [PATCH 049/110] Add support for new instructions of Zvfbfwma
 extension

---
 riscv/insns/vfwmaccbf16_vf.h |  5 ++++
 riscv/insns/vfwmaccbf16_vv.h |  5 ++++
 riscv/riscv.mk.in            |  5 ++++
 riscv/v_ext_macros.h         | 54 ++++++++++++++++++++++++++++++++++++
 4 files changed, 69 insertions(+)
 create mode 100644 riscv/insns/vfwmaccbf16_vf.h
 create mode 100644 riscv/insns/vfwmaccbf16_vv.h

diff --git a/riscv/insns/vfwmaccbf16_vf.h b/riscv/insns/vfwmaccbf16_vf.h
new file mode 100644
index 0000000000..2c77b3be18
--- /dev/null
+++ b/riscv/insns/vfwmaccbf16_vf.h
@@ -0,0 +1,5 @@
+// vfwmaccbf16.vf vd, vs2, rs1
+VI_VFP_BF16_VF_LOOP_WIDE
+({
+  vd = f32_mulAdd(rs1, vs2, vd);
+})
diff --git a/riscv/insns/vfwmaccbf16_vv.h b/riscv/insns/vfwmaccbf16_vv.h
new file mode 100644
index 0000000000..bd8f30505d
--- /dev/null
+++ b/riscv/insns/vfwmaccbf16_vv.h
@@ -0,0 +1,5 @@
+// vfwmaccbf16.vv vd, vs2, vs1
+VI_VFP_BF16_VV_LOOP_WIDE
+({
+  vd = f32_mulAdd(vs1, vs2, vd);
+})
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index a83bec280c..1cfe6275f0 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1367,9 +1367,14 @@ riscv_insn_ext_zvfbfmin = \
 	vfncvtbf16_f_f_w \
 	vfwcvtbf16_f_f_v \
 
+riscv_insn_ext_zvfbfwma = \
+	vfwmaccbf16_vv \
+	vfwmaccbf16_vf \
+
 riscv_insn_ext_bf16 = \
 	$(riscv_insn_ext_zfbfmin) \
 	$(riscv_insn_ext_zvfbfmin) \
+	$(riscv_insn_ext_zvfbfwma) \
 
 riscv_insn_list = \
 	$(riscv_insn_ext_a) \
diff --git a/riscv/v_ext_macros.h b/riscv/v_ext_macros.h
index 376c3307b9..41256c7a59 100644
--- a/riscv/v_ext_macros.h
+++ b/riscv/v_ext_macros.h
@@ -1488,11 +1488,27 @@ reg_t index[P.VU.vlmax]; \
   reg_t UNUSED rs2_num = insn.rs2(); \
   softfloat_roundingMode = STATE.frm->read();
 
+#define VI_VFP_BF16_COMMON \
+  require_fp; \
+  require((P.VU.vsew == e16 && p->extension_enabled(EXT_ZVFBFWMA))); \
+  require_vector(true); \
+  require(STATE.frm->read() < 0x5); \
+  reg_t UNUSED vl = P.VU.vl->read(); \
+  reg_t UNUSED rd_num = insn.rd(); \
+  reg_t UNUSED rs1_num = insn.rs1(); \
+  reg_t UNUSED rs2_num = insn.rs2(); \
+  softfloat_roundingMode = STATE.frm->read();
+
 #define VI_VFP_LOOP_BASE \
   VI_VFP_COMMON \
   for (reg_t i = P.VU.vstart->read(); i < vl; ++i) { \
     VI_LOOP_ELEMENT_SKIP();
 
+#define VI_VFP_BF16_LOOP_BASE \
+  VI_VFP_BF16_COMMON \
+  for (reg_t i = P.VU.vstart->read(); i < vl; ++i) { \
+    VI_LOOP_ELEMENT_SKIP();
+
 #define VI_VFP_LOOP_CMP_BASE \
   VI_VFP_COMMON \
   for (reg_t i = P.VU.vstart->read(); i < vl; ++i) { \
@@ -1818,6 +1834,25 @@ reg_t index[P.VU.vlmax]; \
   DEBUG_RVV_FP_VV; \
   VI_VFP_LOOP_END
 
+#define VI_VFP_BF16_VF_LOOP_WIDE(BODY) \
+  VI_CHECK_DSS(false); \
+  VI_VFP_BF16_LOOP_BASE \
+  switch (P.VU.vsew) { \
+    case e16: { \
+      float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
+      float32_t vs2 = bf16_to_f32(P.VU.elt<bfloat16_t>(rs2_num, i)); \
+      float32_t rs1 = bf16_to_f32(FRS1_BF); \
+      BODY; \
+      set_fp_exceptions; \
+      break; \
+    } \
+    default: \
+      require(0); \
+      break; \
+  }; \
+  DEBUG_RVV_FP_VV; \
+  VI_VFP_LOOP_END
+
 #define VI_VFP_VV_LOOP_WIDE(BODY16, BODY32) \
   VI_CHECK_DSS(true); \
   VI_VFP_LOOP_BASE \
@@ -1845,6 +1880,25 @@ reg_t index[P.VU.vlmax]; \
   DEBUG_RVV_FP_VV; \
   VI_VFP_LOOP_END
 
+#define VI_VFP_BF16_VV_LOOP_WIDE(BODY) \
+  VI_CHECK_DSS(true); \
+  VI_VFP_BF16_LOOP_BASE \
+  switch (P.VU.vsew) { \
+    case e16: { \
+      float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
+      float32_t vs2 = bf16_to_f32(P.VU.elt<bfloat16_t>(rs2_num, i)); \
+      float32_t vs1 = bf16_to_f32(P.VU.elt<bfloat16_t>(rs1_num, i)); \
+      BODY; \
+      set_fp_exceptions; \
+      break; \
+    } \
+    default: \
+      require(0); \
+      break; \
+  }; \
+  DEBUG_RVV_FP_VV; \
+  VI_VFP_LOOP_END
+
 #define VI_VFP_WF_LOOP_WIDE(BODY16, BODY32) \
   VI_CHECK_DDS(false); \
   VI_VFP_LOOP_BASE \

From 62478900e5f7fd872a2fea5c7b4098a570776e18 Mon Sep 17 00:00:00 2001
From: Weiwei Li <liweiwei@iscas.ac.cn>
Date: Fri, 14 Apr 2023 22:50:43 +0800
Subject: [PATCH 050/110] Add flh/fsh/fmv_h_x/fmv_x_h instructions to
 Zfbfmin/Zvfbfmin extensions

---
 riscv/insns/flh.h     | 2 +-
 riscv/insns/fmv_h_x.h | 2 +-
 riscv/insns/fmv_x_h.h | 2 +-
 riscv/insns/fsh.h     | 2 +-
 riscv/isa_parser.cc   | 4 ++++
 riscv/isa_parser.h    | 1 +
 6 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/riscv/insns/flh.h b/riscv/insns/flh.h
index befff2cd3a..67b538a797 100644
--- a/riscv/insns/flh.h
+++ b/riscv/insns/flh.h
@@ -1,3 +1,3 @@
-require_extension(EXT_ZFHMIN);
+require_extension(EXT_INTERNAL_ZFH_MOVE);
 require_fp;
 WRITE_FRD(f16(MMU.load<uint16_t>(RS1 + insn.i_imm())));
diff --git a/riscv/insns/fmv_h_x.h b/riscv/insns/fmv_h_x.h
index e55d607b17..bc2155cd21 100644
--- a/riscv/insns/fmv_h_x.h
+++ b/riscv/insns/fmv_h_x.h
@@ -1,3 +1,3 @@
-require_extension(EXT_ZFHMIN);
+require_extension(EXT_INTERNAL_ZFH_MOVE);
 require_fp;
 WRITE_FRD(f16(RS1));
diff --git a/riscv/insns/fmv_x_h.h b/riscv/insns/fmv_x_h.h
index 7a2e5ff6bf..ca823c17e5 100644
--- a/riscv/insns/fmv_x_h.h
+++ b/riscv/insns/fmv_x_h.h
@@ -1,3 +1,3 @@
-require_extension(EXT_ZFHMIN);
+require_extension(EXT_INTERNAL_ZFH_MOVE);
 require_fp;
 WRITE_RD(sext32((int16_t)(FRS1.v[0])));
diff --git a/riscv/insns/fsh.h b/riscv/insns/fsh.h
index dfd6bc5ca9..142d4d41de 100644
--- a/riscv/insns/fsh.h
+++ b/riscv/insns/fsh.h
@@ -1,3 +1,3 @@
-require_extension(EXT_ZFHMIN);
+require_extension(EXT_INTERNAL_ZFH_MOVE);
 require_fp;
 MMU.store<uint16_t>(RS1 + insn.s_imm(), FRS2.v[0]);
diff --git a/riscv/isa_parser.cc b/riscv/isa_parser.cc
index be5e51b7be..bd73b0c39f 100644
--- a/riscv/isa_parser.cc
+++ b/riscv/isa_parser.cc
@@ -293,6 +293,10 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
     bad_isa_string(str, "'Zvfbfmin/Zvfbfwma' extension requires 'V' extension");
   }
 
+  if (extension_table[EXT_ZFBFMIN] || extension_table[EXT_ZVFBFMIN] || extension_table[EXT_ZFHMIN]) {
+    extension_table[EXT_INTERNAL_ZFH_MOVE] = true;
+  }
+
   if (extension_table['C']) {
     extension_table[EXT_ZCA] = true;
     if (extension_table['F'] && max_xlen == 32)
diff --git a/riscv/isa_parser.h b/riscv/isa_parser.h
index 04859b6ec4..7558116869 100644
--- a/riscv/isa_parser.h
+++ b/riscv/isa_parser.h
@@ -69,6 +69,7 @@ typedef enum {
   EXT_XZBR,
   EXT_XZBT,
   EXT_SSTC,
+  EXT_INTERNAL_ZFH_MOVE,
   NUM_ISA_EXTENSIONS
 } isa_extension_t;
 

From a2e8ad3d5f0bc8856da947df7c216d114179dc34 Mon Sep 17 00:00:00 2001
From: Weiwei Li <liweiwei@iscas.ac.cn>
Date: Fri, 14 Apr 2023 22:51:19 +0800
Subject: [PATCH 051/110] Add disasm support for BF16 extensions

---
 disasm/disasm.cc | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/disasm/disasm.cc b/disasm/disasm.cc
index fef9facab4..d797a91dc1 100644
--- a/disasm/disasm.cc
+++ b/disasm/disasm.cc
@@ -1184,14 +1184,17 @@ void disassembler_t::add_instructions(const isa_parser_t* isa)
   }
 
   if (isa->extension_enabled(EXT_ZFHMIN)) {
-    DEFINE_FLOAD(flh)
-    DEFINE_FSTORE(fsh)
     DEFINE_FR1TYPE(fcvt_h_s);
     DEFINE_FR1TYPE(fcvt_h_d);
     DEFINE_FR1TYPE(fcvt_h_q);
     DEFINE_FR1TYPE(fcvt_s_h);
     DEFINE_FR1TYPE(fcvt_d_h);
     DEFINE_FR1TYPE(fcvt_q_h);
+  }
+
+  if (isa->extension_enabled(EXT_INTERNAL_ZFH_MOVE)) {
+    DEFINE_FLOAD(flh)
+    DEFINE_FSTORE(fsh)
     DEFINE_XFTYPE(fmv_h_x);
     DEFINE_FXTYPE(fmv_x_h);
   }
@@ -1239,6 +1242,11 @@ void disassembler_t::add_instructions(const isa_parser_t* isa)
     DEFINE_FX2TYPE(fle_q);
   }
 
+  if (isa->extension_enabled(EXT_ZFBFMIN)) {
+    DEFINE_FR1TYPE(fcvt_bf16_s);
+    DEFINE_FR1TYPE(fcvt_s_bf16);
+  }
+
   // ext-h
   if (isa->extension_enabled('H')) {
     DEFINE_XLOAD_BASE(hlv_b)
@@ -1787,6 +1795,16 @@ void disassembler_t::add_instructions(const isa_parser_t* isa)
     }
   }
 
+  if (isa->extension_enabled(EXT_ZVFBFMIN)) {
+    DEFINE_VECTOR_V(vfncvtbf16_f_f_w);
+    DEFINE_VECTOR_V(vfwcvtbf16_f_f_v);
+  }
+
+  if (isa->extension_enabled(EXT_ZVFBFWMA)) {
+    DEFINE_VECTOR_VV(vfwmaccbf16_vv);
+    DEFINE_VECTOR_VF(vfwmaccbf16_vf);
+  }
+
 #define DEFINE_PI3TYPE(code) add_pitype3_insn(this, #code, match_##code, mask_##code);
 #define DEFINE_PI4TYPE(code) add_pitype4_insn(this, #code, match_##code, mask_##code);
 #define DEFINE_PI5TYPE(code) add_pitype5_insn(this, #code, match_##code, mask_##code);

From 8e800d05a4e4322048c942a9925684d6ce4f9de4 Mon Sep 17 00:00:00 2001
From: Weiwei Li <liweiwei@iscas.ac.cn>
Date: Mon, 29 May 2023 09:04:10 +0800
Subject: [PATCH 052/110] Add BF16 extensions to README.md

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 9455bfae6e..8d5dc45b94 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,9 @@ Spike supports the following RISC-V ISA features:
   - Zcd extension, v1.0
   - Zcmp extension, v1.0
   - Zcmt extension, v1.0
+  - Zfbfmin extension, v0.6
+  - Zvfbfmin extension, v0.6
+  - Zvfbfwma extension, v0.6
 
 As a Spike extension, the remainder of the proposed
 [Bit-Manipulation Extensions](https://github.com/riscv/riscv-bitmanip)

From bb101c7a2c1bd751a34d65ea441faede408fa3d7 Mon Sep 17 00:00:00 2001
From: Tim Newsome <tim@sifive.com>
Date: Thu, 1 Jun 2023 13:19:37 -0700
Subject: [PATCH 053/110] dcsr.ebreakh is now dcsr.ebreakv[su]

This change was made ages ago in the spec.

I did not actually test that the new privilege checks in ebreak and
c.ebreak are correct, but all the existing debug tests still pass.
---
 riscv/csrs.cc          |  9 ++++++---
 riscv/csrs.h           |  3 ++-
 riscv/insns/c_ebreak.h | 10 ++++++----
 riscv/insns/ebreak.h   | 10 ++++++----
 4 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/riscv/csrs.cc b/riscv/csrs.cc
index 95b5e22348..7ea07d104c 100644
--- a/riscv/csrs.cc
+++ b/riscv/csrs.cc
@@ -1232,9 +1232,10 @@ dcsr_csr_t::dcsr_csr_t(processor_t* const proc, const reg_t addr):
   prv(0),
   step(false),
   ebreakm(false),
-  ebreakh(false),
   ebreaks(false),
   ebreaku(false),
+  ebreakvs(false),
+  ebreakvu(false),
   halt(false),
   v(false),
   cause(0) {
@@ -1250,9 +1251,10 @@ reg_t dcsr_csr_t::read() const noexcept {
   reg_t result = 0;
   result = set_field(result, DCSR_XDEBUGVER, 1);
   result = set_field(result, DCSR_EBREAKM, ebreakm);
-  result = set_field(result, DCSR_EBREAKH, ebreakh);
   result = set_field(result, DCSR_EBREAKS, ebreaks);
   result = set_field(result, DCSR_EBREAKU, ebreaku);
+  result = set_field(result, CSR_DCSR_EBREAKVS, ebreakvs);
+  result = set_field(result, CSR_DCSR_EBREAKVU, ebreakvu);
   result = set_field(result, DCSR_STOPCYCLE, 0);
   result = set_field(result, DCSR_STOPTIME, 0);
   result = set_field(result, DCSR_CAUSE, cause);
@@ -1267,9 +1269,10 @@ bool dcsr_csr_t::unlogged_write(const reg_t val) noexcept {
   step = get_field(val, DCSR_STEP);
   // TODO: ndreset and fullreset
   ebreakm = get_field(val, DCSR_EBREAKM);
-  ebreakh = get_field(val, DCSR_EBREAKH);
   ebreaks = get_field(val, DCSR_EBREAKS);
   ebreaku = get_field(val, DCSR_EBREAKU);
+  ebreakvs = get_field(val, CSR_DCSR_EBREAKVS);
+  ebreakvu = get_field(val, CSR_DCSR_EBREAKVU);
   halt = get_field(val, DCSR_HALT);
   v = proc->extension_enabled('H') ? get_field(val, CSR_DCSR_V) : false;
   return true;
diff --git a/riscv/csrs.h b/riscv/csrs.h
index 19aefca139..07d6d82ac5 100644
--- a/riscv/csrs.h
+++ b/riscv/csrs.h
@@ -663,9 +663,10 @@ class dcsr_csr_t: public csr_t {
   uint8_t prv;
   bool step;
   bool ebreakm;
-  bool ebreakh;
   bool ebreaks;
   bool ebreaku;
+  bool ebreakvs;
+  bool ebreakvu;
   bool halt;
   bool v;
   uint8_t cause;
diff --git a/riscv/insns/c_ebreak.h b/riscv/insns/c_ebreak.h
index 14b5136310..4ea27a751e 100644
--- a/riscv/insns/c_ebreak.h
+++ b/riscv/insns/c_ebreak.h
@@ -1,8 +1,10 @@
 require_extension(EXT_ZCA);
-if (!STATE.debug_mode &&
-    ((STATE.prv == PRV_M && STATE.dcsr->ebreakm) ||
-     (STATE.prv == PRV_S && STATE.dcsr->ebreaks) ||
-     (STATE.prv == PRV_U && STATE.dcsr->ebreaku))) {
+if (!STATE.debug_mode && (
+        (!STATE.v && STATE.prv == PRV_M && STATE.dcsr->ebreakm) ||
+        (!STATE.v && STATE.prv == PRV_S && STATE.dcsr->ebreaks) ||
+        (!STATE.v && STATE.prv == PRV_U && STATE.dcsr->ebreaku) ||
+        (STATE.v && STATE.prv == PRV_S && STATE.dcsr->ebreakvs) ||
+        (STATE.v && STATE.prv == PRV_U && STATE.dcsr->ebreakvu))) {
 	throw trap_debug_mode();
 } else {
 	throw trap_breakpoint(STATE.v, pc);
diff --git a/riscv/insns/ebreak.h b/riscv/insns/ebreak.h
index 227ab93527..0cd2f190fe 100644
--- a/riscv/insns/ebreak.h
+++ b/riscv/insns/ebreak.h
@@ -1,7 +1,9 @@
-if (!STATE.debug_mode &&
-    ((STATE.prv == PRV_M && STATE.dcsr->ebreakm) ||
-     (STATE.prv == PRV_S && STATE.dcsr->ebreaks) ||
-     (STATE.prv == PRV_U && STATE.dcsr->ebreaku))) {
+if (!STATE.debug_mode && (
+        (!STATE.v && STATE.prv == PRV_M && STATE.dcsr->ebreakm) ||
+        (!STATE.v && STATE.prv == PRV_S && STATE.dcsr->ebreaks) ||
+        (!STATE.v && STATE.prv == PRV_U && STATE.dcsr->ebreaku) ||
+        (STATE.v && STATE.prv == PRV_S && STATE.dcsr->ebreakvs) ||
+        (STATE.v && STATE.prv == PRV_U && STATE.dcsr->ebreakvu))) {
 	throw trap_debug_mode();
 } else {
 	throw trap_breakpoint(STATE.v, pc);

From 047491581cd6437620a19b51594a35c158a53466 Mon Sep 17 00:00:00 2001
From: Weiwei Li <liweiwei@iscas.ac.cn>
Date: Fri, 2 Jun 2023 23:41:49 +0800
Subject: [PATCH 054/110] Fix bugs in disassembling code for cm.mva01s/mvsa01
 instructions. (Resolved issue #1370)

---
 disasm/disasm.cc | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/disasm/disasm.cc b/disasm/disasm.cc
index fef9facab4..2fce1a1d10 100644
--- a/disasm/disasm.cc
+++ b/disasm/disasm.cc
@@ -236,6 +236,18 @@ struct : public arg_t {
   }
 } rvc_rs2s;
 
+struct : public arg_t {
+  std::string to_string(insn_t insn) const {
+    return xpr_name[RVC_R1S];
+  }
+} rvc_r1s;
+
+struct : public arg_t {
+  std::string to_string(insn_t insn) const {
+    return xpr_name[RVC_R2S];
+  }
+} rvc_r2s;
+
 struct : public arg_t {
   std::string to_string(insn_t insn) const {
     return fpr_name[insn.rvc_rs2s()];
@@ -1347,8 +1359,8 @@ void disassembler_t::add_instructions(const isa_parser_t* isa)
       DISASM_INSN("cm.popretz", cm_popretz, 0, {&rvcm_pushpop_rlist, &rvcm_pop_stack_adj_64});
     }
 
-    DISASM_INSN("cm.mva01s", cm_mva01s, 0, {&rvc_rs1s, &rvc_rs2s});
-    DISASM_INSN("cm.mvsa01", cm_mvsa01, 0, {&rvc_rs1s, &rvc_rs2s});
+    DISASM_INSN("cm.mva01s", cm_mva01s, 0, {&rvc_r1s, &rvc_r2s});
+    DISASM_INSN("cm.mvsa01", cm_mvsa01, 0, {&rvc_r1s, &rvc_r2s});
   }
 
   if (isa->extension_enabled(EXT_ZCMT)) {

From cf7e434c8005fc79f563be98542aa1d42a85f869 Mon Sep 17 00:00:00 2001
From: "demin.han" <demin.han@starfivetech.com>
Date: Thu, 8 Jun 2023 10:51:31 +0800
Subject: [PATCH 055/110] Replace ternary operator with std::min

---
 riscv/vector_unit.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/riscv/vector_unit.cc b/riscv/vector_unit.cc
index ff3dd82fb9..9128df63ee 100644
--- a/riscv/vector_unit.cc
+++ b/riscv/vector_unit.cc
@@ -54,11 +54,11 @@ reg_t vectorUnit_t::vectorUnit_t::set_vl(int rd, int rs1, reg_t reqVL, reg_t new
   if (vlmax == 0) {
     vl->write_raw(0);
   } else if (rd == 0 && rs1 == 0) {
-    vl->write_raw(vl->read() > vlmax ? vlmax : vl->read());
+    vl->write_raw(std::min(vl->read(), vlmax));
   } else if (rd != 0 && rs1 == 0) {
     vl->write_raw(vlmax);
   } else if (rs1 != 0) {
-    vl->write_raw(reqVL > vlmax ? vlmax : reqVL);
+    vl->write_raw(std::min(reqVL, vlmax));
   }
 
   vstart->write_raw(0);

From 03b47351e69ae954e5b078e18cc10bf21df4712c Mon Sep 17 00:00:00 2001
From: YenHaoChen <howard25336284@gmail.com>
Date: Fri, 9 Jun 2023 09:28:15 +0800
Subject: [PATCH 056/110] Fix PMP checking region of cache-block management
 instructions

The spec says "The PMP access control bits shall be the same for all
physical addresses in the cache block [... else] the behavior of a CBO
instruction is UNSPECIFIED."

Thus, we only need to check the byte rs1 points to
(instead of the entire cache block).
---
 riscv/mmu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/riscv/mmu.h b/riscv/mmu.h
index 5a4835c3ce..efc6e9de14 100644
--- a/riscv/mmu.h
+++ b/riscv/mmu.h
@@ -214,7 +214,7 @@ class mmu_t
 
   void clean_inval(reg_t addr, bool clean, bool inval) {
     convert_load_traps_to_store_traps({
-        const reg_t paddr = translate(generate_access_info(addr, LOAD, {false, false, false}), blocksz) & ~(blocksz - 1);
+      const reg_t paddr = translate(generate_access_info(addr, LOAD, {false, false, false}), 1);
       if (sim->reservable(paddr)) {
         if (tracer.interested_in_range(paddr, paddr + PGSIZE, LOAD))
           tracer.clean_invalidate(paddr, blocksz, clean, inval);

From cfe79e06fbfbe2d598693e7aa035a1f6e823d71c Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Mon, 12 Jun 2023 15:52:45 -0700
Subject: [PATCH 057/110] ci: CI should check each commit in a PR

test
---
 .github/workflows/continuous-integration.yml | 22 ++++++++++++++++----
 ci-tests/build-spike                         |  2 ++
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/continuous-integration.yml b/.github/workflows/continuous-integration.yml
index 51b65a1e05..d8c9a0255d 100644
--- a/.github/workflows/continuous-integration.yml
+++ b/.github/workflows/continuous-integration.yml
@@ -21,23 +21,37 @@ jobs:
     runs-on: ubuntu-20.04
     steps:
       - uses: actions/checkout@v2
+        with:
+          # checkout full tree
+          fetch-depth: 0
 
       - name: Install Dependencies
         run: sudo xargs apt-get install -y < .github/workflows/apt-packages.txt
 
       - run: |
-          ci-tests/build-spike
-          ci-tests/test-spike
+          for commit in $(git rev-list origin/master..HEAD); do
+            git checkout $commit
+            echo "Checking commit $commit"
+            ci-tests/build-spike
+            ci-tests/test-spike
+          done
 
   test-macos:
     name: Test Spike build (MacOS)
     runs-on: macos-12
     steps:
       - uses: actions/checkout@v2
+        with:
+          # checkout full tree
+          fetch-depth: 0
 
       - name: Install Dependencies
         run: xargs brew install < .github/workflows/brew-packages.txt
 
       - run: |
-          ci-tests/build-spike
-          ci-tests/test-spike
+          for commit in $(git rev-list origin/master..HEAD); do
+            git checkout $commit
+            echo "Checking commit $commit"
+            ci-tests/build-spike
+            ci-tests/test-spike
+          done
diff --git a/ci-tests/build-spike b/ci-tests/build-spike
index 5eb7b58030..9c3fb373a3 100755
--- a/ci-tests/build-spike
+++ b/ci-tests/build-spike
@@ -3,6 +3,8 @@ set -e
 
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
+rm -rf build
+
 mkdir build
 cd build
 mkdir install

From 903ec29f902da41537411e210e7b6002eed7fb7e Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Sat, 17 Jun 2023 16:49:27 -0700
Subject: [PATCH 058/110] Remove legacy debug test

These are now tested in CI using the riscv-tests repository.
---
 tests/ebreak.py  |  26 -----------
 tests/ebreak.s   |   5 --
 tests/testlib.py | 116 -----------------------------------------------
 3 files changed, 147 deletions(-)
 delete mode 100755 tests/ebreak.py
 delete mode 100644 tests/ebreak.s
 delete mode 100644 tests/testlib.py

diff --git a/tests/ebreak.py b/tests/ebreak.py
deleted file mode 100755
index dd7e65878b..0000000000
--- a/tests/ebreak.py
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/python
-
-import os
-import testlib
-import unittest
-import tempfile
-import time
-
-class EbreakTest(unittest.TestCase):
-    def setUp(self):
-        self.binary = testlib.compile("ebreak.s")
-
-    def test_noport(self):
-        """Make sure that we can run past ebreak when --gdb-port isn't used."""
-        spike = testlib.Spike(self.binary, with_gdb=False, timeout=10)
-        result = spike.wait()
-        self.assertEqual(result, 0)
-
-    def test_nogdb(self):
-        """Make sure that we can run past ebreak when gdb isn't attached."""
-        spike = testlib.Spike(self.binary, timeout=10)
-        result = spike.wait()
-        self.assertEqual(result, 0)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/tests/ebreak.s b/tests/ebreak.s
deleted file mode 100644
index 99f3e07ccd..0000000000
--- a/tests/ebreak.s
+++ /dev/null
@@ -1,5 +0,0 @@
-        .global main
-main:
-        li      a0, 0
-        ebreak
-        ret
diff --git a/tests/testlib.py b/tests/testlib.py
deleted file mode 100644
index d5e8d795c9..0000000000
--- a/tests/testlib.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import os.path
-import pexpect
-import subprocess
-import tempfile
-import testlib
-import unittest
-
-# Note that gdb comes with its own testsuite. I was unable to figure out how to
-# run that testsuite against the spike simulator.
-
-def find_file(path):
-    for directory in (os.getcwd(), os.path.dirname(testlib.__file__)):
-        fullpath = os.path.join(directory, path)
-        if os.path.exists(fullpath):
-            return fullpath
-    return None
-
-def compile(*args):
-    """Compile a single .c file into a binary."""
-    dst = os.path.splitext(args[0])[0]
-    cc = os.path.expandvars("$RISCV/bin/riscv64-unknown-elf-gcc")
-    cmd = [cc, "-g", "-O", "-o", dst]
-    for arg in args:
-        found = find_file(arg)
-        if found:
-            cmd.append(found)
-        else:
-            cmd.append(arg)
-    cmd = " ".join(cmd)
-    result = os.system(cmd)
-    assert result == 0, "%r failed" % cmd
-    return dst
-
-def unused_port():
-    # http://stackoverflow.com/questions/2838244/get-open-tcp-port-in-python/2838309#2838309
-    import socket
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.bind(("",0))
-    port = s.getsockname()[1]
-    s.close()
-    return port
-
-class Spike(object):
-    def __init__(self, binary, halted=False, with_gdb=True, timeout=None):
-        """Launch spike. Return tuple of its process and the port it's running on."""
-        cmd = []
-        if timeout:
-            cmd += ["timeout", str(timeout)]
-
-        cmd += [find_file("spike")]
-        if halted:
-            cmd.append('-H')
-        if with_gdb:
-            self.port = unused_port()
-            cmd += ['--gdb-port', str(self.port)]
-        cmd.append('pk')
-        if binary:
-            cmd.append(binary)
-        logfile = open("spike.log", "w")
-        self.process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=logfile,
-                stderr=logfile)
-
-    def __del__(self):
-        try:
-            self.process.kill()
-            self.process.wait()
-        except OSError:
-            pass
-
-    def wait(self, *args, **kwargs):
-        return self.process.wait(*args, **kwargs)
-
-class Gdb(object):
-    def __init__(self):
-        path = os.path.expandvars("$RISCV/bin/riscv64-unknown-elf-gdb")
-        self.child = pexpect.spawn(path)
-        self.child.logfile = file("gdb.log", "w")
-        self.wait()
-        self.command("set width 0")
-        self.command("set height 0")
-        # Force consistency.
-        self.command("set print entry-values no")
-
-    def wait(self):
-        """Wait for prompt."""
-        self.child.expect("\(gdb\)")
-
-    def command(self, command, timeout=-1):
-        self.child.sendline(command)
-        self.child.expect("\n", timeout=timeout)
-        self.child.expect("\(gdb\)", timeout=timeout)
-        return self.child.before.strip()
-
-    def c(self, wait=True):
-        if wait:
-            return self.command("c")
-        else:
-            self.child.sendline("c")
-            self.child.expect("Continuing")
-
-    def interrupt(self):
-        self.child.send("\003");
-        self.child.expect("\(gdb\)")
-
-    def x(self, address, size='w'):
-        output = self.command("x/%s %s" % (size, address))
-        value = int(output.split(':')[1].strip(), 0)
-        return value
-
-    def p(self, obj):
-        output = self.command("p %s" % obj)
-        value = int(output.split('=')[-1].strip())
-        return value
-
-    def stepi(self):
-        return self.command("stepi")

From 396c61f54e54a2be846c98a0a3489c107cbd8bbb Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Sat, 17 Jun 2023 16:49:57 -0700
Subject: [PATCH 059/110] Restore MCPPBS unit-testing flow

---
 Makefile.in | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/Makefile.in b/Makefile.in
index c922e849bd..69f0405d7b 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -256,7 +256,7 @@ $(2)_test_objs      := $$(patsubst %.cc, %.o, $$($(2)_test_srcs))
 $(2)_test_deps      := $$(patsubst %.o, %.d, $$($(2)_test_objs))
 $(2)_test_exes      := $$(patsubst %.t.cc, %-utst, $$($(2)_test_srcs))
 $(2)_test_outs      := $$(patsubst %, %.out, $$($(2)_test_exes))
-$(2)_test_libs      := $(1) $$($(2)_reverse_deps) utst
+$(2)_test_libs      := $(1) $$($(2)_reverse_deps)
 $(2)_test_libnames  := $$(patsubst %, lib%.a, $$($(2)_test_libs))
 $(2)_test_libarg    := $$(patsubst %, -l%, $$($(2)_test_libs))
 
@@ -274,7 +274,8 @@ $(2)_junk += \
 # Run unit tests
 
 $$($(2)_test_outs) : %.out : %
-	$(RUN) $(RUNFLAGS) ./$$< default | tee $$@
+	./$$< default
+	touch $$@
 
 $(2)_junk += $$($(2)_test_outs)
 
@@ -359,20 +360,8 @@ deps : $(deps)
 # Check
 #-------------------------------------------------------------------------
 
-bintest_outs = $(bintests:=.out)
-junk += $(bintest_outs)
-%.out: % all
-	./$* < /dev/null 2>&1 | tee $@
-
-check-cpp : $(test_outs)
-	@echo
-	! grep -h -e'Unit Tests' -e'FAILED' -e'Segmentation' $^ < /dev/null
-	@echo
-
-check-bin : $(bintest_outs)
-	! tail -n 1 $^ < /dev/null 2>&1 | grep FAILED
-
-check : check-cpp check-bin
+check : $(test_outs)
+	echo; grep -h -e'Unit Tests' -e'FAILED' -e'Segmentation' $^ < /dev/null; echo
 
 .PHONY : check
 

From e58d89aa2c38ca40e68ad4e010c91239c4794e00 Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Sat, 17 Jun 2023 16:52:43 -0700
Subject: [PATCH 060/110] Add C.EBREAK, C.JALR, and C.JR to overlap list

This isn't a functional change; we just failed to notate that
C.EBREAK and C.JALR overlap C.ADD, and C.JR overlaps C.MV.
---
 riscv/overlap_list.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/riscv/overlap_list.h b/riscv/overlap_list.h
index fc3b307743..a30c770e60 100644
--- a/riscv/overlap_list.h
+++ b/riscv/overlap_list.h
@@ -9,3 +9,6 @@ DECLARE_OVERLAP_INSN(cm_mva01s, EXT_ZCMP)
 DECLARE_OVERLAP_INSN(cm_mvsa01, EXT_ZCMP)
 DECLARE_OVERLAP_INSN(cm_jalt, EXT_ZCMT)
 DECLARE_OVERLAP_INSN(c_fsd, EXT_ZCD)
+DECLARE_OVERLAP_INSN(c_ebreak, EXT_ZCA)
+DECLARE_OVERLAP_INSN(c_jalr, EXT_ZCA)
+DECLARE_OVERLAP_INSN(c_jr, EXT_ZCA)

From 057cfbcca6dc6c65f1fd69b754e499ccabebe273 Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Sat, 17 Jun 2023 16:53:34 -0700
Subject: [PATCH 061/110] Add test that ensures opcodes don't overlap unless
 explicitly specified

---
 riscv/check-opcode-overlap.t.cc | 57 +++++++++++++++++++++++++++++++++
 riscv/riscv.mk.in               |  3 +-
 2 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100644 riscv/check-opcode-overlap.t.cc

diff --git a/riscv/check-opcode-overlap.t.cc b/riscv/check-opcode-overlap.t.cc
new file mode 100644
index 0000000000..2922001278
--- /dev/null
+++ b/riscv/check-opcode-overlap.t.cc
@@ -0,0 +1,57 @@
+#include "decode.h"
+#include "common.h"
+#include <unordered_set>
+#include <vector>
+#include <string>
+#include <cstdio>
+
+struct opcode {
+  insn_bits_t match;
+  insn_bits_t mask;
+  std::string name;
+};
+
+static void check_overlap(const opcode& a, const opcode& b)
+{
+  if ((a.match & b.mask) == b.match) {
+    fprintf(stderr, "Instruction %s (%" PRIx64 ") overlaps instruction %s (%" PRIx64 ", mask %" PRIx64 ")\n",
+            a.name.c_str(), a.match, b.name.c_str(), b.match, b.mask);
+    exit(-1);
+  }
+}
+
+int main()
+{
+  #define DECLARE_INSN(name, match, mask) \
+    const insn_bits_t UNUSED name##_match = (match), name##_mask = (mask);
+    #include "encoding.h"
+  #undef DECLARE_INSN
+
+  static const opcode static_list[] = {
+    #define DEFINE_INSN(name) \
+      {name##_match, name##_mask, #name},
+      #include "insn_list.h"
+    #undef DEFINE_INSN
+  };
+
+  std::unordered_set<std::string> overlap_list;
+  #define DECLARE_OVERLAP_INSN(name, ext) \
+    overlap_list.insert(std::string(#name));
+    #include "overlap_list.h"
+  #undef DECLARE_OVERLAP_INSN
+
+  std::vector<const opcode*> list;
+  for (size_t i = 0; i < sizeof(static_list) / sizeof(static_list[0]); i++) {
+    if (!overlap_list.count(static_list[i].name))
+      list.push_back(&static_list[i]);
+  }
+
+  for (size_t i = 1; i < list.size(); i++) {
+    for (size_t j = 0; j < i; j++) {
+      check_overlap(*list[i], *list[j]);
+      check_overlap(*list[j], *list[i]);
+    }
+  }
+
+  return 0;
+}
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index 1cfe6275f0..ac45b2896a 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -75,7 +75,8 @@ riscv_srcs = \
 	cfg.cc \
 	$(riscv_gen_srcs) \
 
-riscv_test_srcs =
+riscv_test_srcs = \
+  check-opcode-overlap.t.cc \
 
 riscv_gen_hdrs = \
 	insn_list.h \

From 86b3e7851f6b0dacb17f61cd8ee77f8fc1a103b6 Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Sat, 17 Jun 2023 16:54:31 -0700
Subject: [PATCH 062/110] Run 'make check' in CI

---
 ci-tests/build-spike | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci-tests/build-spike b/ci-tests/build-spike
index 9c3fb373a3..058defdfc6 100755
--- a/ci-tests/build-spike
+++ b/ci-tests/build-spike
@@ -10,6 +10,7 @@ cd build
 mkdir install
 CXXFLAGS="-Wnon-virtual-dtor" CFLAGS="-Werror -Wignored-qualifiers -Wunused-function -Wunused-parameter -Wunused-variable" $DIR/../configure --prefix=`pwd`/install
 make -j"$(nproc 2> /dev/null || sysctl -n hw.ncpu)"
+make check
 make install
 
 # check that help message prints without error

From 3b6732458f5b6b47e87caad8cf9b03be13692f4f Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Sun, 18 Jun 2023 01:20:44 -0700
Subject: [PATCH 063/110] Add CMOV to overlap list, as it overlaps CZERO.EQZ

---
 riscv/overlap_list.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/riscv/overlap_list.h b/riscv/overlap_list.h
index a30c770e60..d8b1225866 100644
--- a/riscv/overlap_list.h
+++ b/riscv/overlap_list.h
@@ -12,3 +12,4 @@ DECLARE_OVERLAP_INSN(c_fsd, EXT_ZCD)
 DECLARE_OVERLAP_INSN(c_ebreak, EXT_ZCA)
 DECLARE_OVERLAP_INSN(c_jalr, EXT_ZCA)
 DECLARE_OVERLAP_INSN(c_jr, EXT_ZCA)
+DECLARE_OVERLAP_INSN(cmov, EXT_XZBT)

From fff2699cfdcf9a2fe7e004ca92c299a2c11c37a1 Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Sat, 17 Jun 2023 14:25:27 -0700
Subject: [PATCH 064/110] Remove instructions that belong only to Xbitmanip

---
 riscv/insns/bcompress.h    |  9 --------
 riscv/insns/bcompressw.h   | 10 ---------
 riscv/insns/bdecompress.h  |  9 --------
 riscv/insns/bdecompressw.h | 10 ---------
 riscv/insns/bfp.h          | 10 ---------
 riscv/insns/bfpw.h         |  9 --------
 riscv/insns/bmatflip.h     | 11 ----------
 riscv/insns/bmator.h       | 29 --------------------------
 riscv/insns/bmatxor.h      | 29 --------------------------
 riscv/insns/clmulhw.h      |  6 ------
 riscv/insns/clmulrw.h      |  6 ------
 riscv/insns/clmulw.h       |  6 ------
 riscv/insns/cmov.h         |  2 --
 riscv/insns/crc32_b.h      |  5 -----
 riscv/insns/crc32_d.h      |  6 ------
 riscv/insns/crc32_h.h      |  5 -----
 riscv/insns/crc32_w.h      |  5 -----
 riscv/insns/crc32c_b.h     |  5 -----
 riscv/insns/crc32c_d.h     |  6 ------
 riscv/insns/crc32c_h.h     |  5 -----
 riscv/insns/crc32c_w.h     |  5 -----
 riscv/insns/fsl.h          |  9 --------
 riscv/insns/fslw.h         | 10 ---------
 riscv/insns/fsriw.h        | 10 ---------
 riscv/insns/gorc.h         | 10 ---------
 riscv/insns/gorciw.h       | 11 ----------
 riscv/insns/gorcw.h        | 10 ---------
 riscv/insns/grev.h         | 10 ---------
 riscv/insns/greviw.h       | 11 ----------
 riscv/insns/grevw.h        | 10 ---------
 riscv/insns/packuw.h       |  5 -----
 riscv/insns/shfl.h         |  9 --------
 riscv/insns/shflw.h        |  9 --------
 riscv/insns/slo.h          |  2 --
 riscv/insns/sloi.h         |  3 ---
 riscv/insns/sloiw.h        |  3 ---
 riscv/insns/slow.h         |  3 ---
 riscv/insns/sro.h          |  2 --
 riscv/insns/sroi.h         |  3 ---
 riscv/insns/sroiw.h        |  3 ---
 riscv/insns/srow.h         |  3 ---
 riscv/insns/unshfl.h       |  9 --------
 riscv/insns/unshflw.h      |  9 --------
 riscv/insns/xperm16.h      |  2 --
 riscv/insns/xperm32.h      |  3 ---
 riscv/overlap_list.h       |  1 -
 riscv/riscv.mk.in          | 42 --------------------------------------
 47 files changed, 390 deletions(-)
 delete mode 100644 riscv/insns/bcompress.h
 delete mode 100644 riscv/insns/bcompressw.h
 delete mode 100644 riscv/insns/bdecompress.h
 delete mode 100644 riscv/insns/bdecompressw.h
 delete mode 100644 riscv/insns/bfp.h
 delete mode 100644 riscv/insns/bfpw.h
 delete mode 100644 riscv/insns/bmatflip.h
 delete mode 100644 riscv/insns/bmator.h
 delete mode 100644 riscv/insns/bmatxor.h
 delete mode 100644 riscv/insns/clmulhw.h
 delete mode 100644 riscv/insns/clmulrw.h
 delete mode 100644 riscv/insns/clmulw.h
 delete mode 100644 riscv/insns/cmov.h
 delete mode 100644 riscv/insns/crc32_b.h
 delete mode 100644 riscv/insns/crc32_d.h
 delete mode 100644 riscv/insns/crc32_h.h
 delete mode 100644 riscv/insns/crc32_w.h
 delete mode 100644 riscv/insns/crc32c_b.h
 delete mode 100644 riscv/insns/crc32c_d.h
 delete mode 100644 riscv/insns/crc32c_h.h
 delete mode 100644 riscv/insns/crc32c_w.h
 delete mode 100644 riscv/insns/fsl.h
 delete mode 100644 riscv/insns/fslw.h
 delete mode 100644 riscv/insns/fsriw.h
 delete mode 100644 riscv/insns/gorc.h
 delete mode 100644 riscv/insns/gorciw.h
 delete mode 100644 riscv/insns/gorcw.h
 delete mode 100644 riscv/insns/grev.h
 delete mode 100644 riscv/insns/greviw.h
 delete mode 100644 riscv/insns/grevw.h
 delete mode 100644 riscv/insns/packuw.h
 delete mode 100644 riscv/insns/shfl.h
 delete mode 100644 riscv/insns/shflw.h
 delete mode 100644 riscv/insns/slo.h
 delete mode 100644 riscv/insns/sloi.h
 delete mode 100644 riscv/insns/sloiw.h
 delete mode 100644 riscv/insns/slow.h
 delete mode 100644 riscv/insns/sro.h
 delete mode 100644 riscv/insns/sroi.h
 delete mode 100644 riscv/insns/sroiw.h
 delete mode 100644 riscv/insns/srow.h
 delete mode 100644 riscv/insns/unshfl.h
 delete mode 100644 riscv/insns/unshflw.h
 delete mode 100644 riscv/insns/xperm16.h
 delete mode 100644 riscv/insns/xperm32.h

diff --git a/riscv/insns/bcompress.h b/riscv/insns/bcompress.h
deleted file mode 100644
index 579346f463..0000000000
--- a/riscv/insns/bcompress.h
+++ /dev/null
@@ -1,9 +0,0 @@
-require_extension(EXT_XZBE);
-uint64_t c = 0, i = 0, data = zext_xlen(RS1), mask = zext_xlen(RS2);
-while (mask) {
-	uint64_t b = mask & ~((mask | (mask-1)) + 1);
-	c |= (data & b) >> (ctz(b) - i);
-	i += popcount(b);
-	mask -= b;
-}
-WRITE_RD(sext_xlen(c));
diff --git a/riscv/insns/bcompressw.h b/riscv/insns/bcompressw.h
deleted file mode 100644
index 2c1017cd17..0000000000
--- a/riscv/insns/bcompressw.h
+++ /dev/null
@@ -1,10 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBE);
-uint64_t c = 0, i = 0, data = zext32(RS1), mask = zext32(RS2);
-while (mask) {
-	uint64_t b = mask & ~((mask | (mask-1)) + 1);
-	c |= (data & b) >> (ctz(b) - i);
-	i += popcount(b);
-	mask -= b;
-}
-WRITE_RD(sext32(c));
diff --git a/riscv/insns/bdecompress.h b/riscv/insns/bdecompress.h
deleted file mode 100644
index 2894be0143..0000000000
--- a/riscv/insns/bdecompress.h
+++ /dev/null
@@ -1,9 +0,0 @@
-require_extension(EXT_XZBE);
-uint64_t c = 0, i = 0, data = zext_xlen(RS1), mask = zext_xlen(RS2);
-while (mask) {
-	uint64_t b = mask & ~((mask | (mask-1)) + 1);
-	c |= (data << (ctz(b) - i)) & b;
-	i += popcount(b);
-	mask -= b;
-}
-WRITE_RD(sext_xlen(c));
diff --git a/riscv/insns/bdecompressw.h b/riscv/insns/bdecompressw.h
deleted file mode 100644
index 468a7260ae..0000000000
--- a/riscv/insns/bdecompressw.h
+++ /dev/null
@@ -1,10 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBE);
-uint64_t c = 0, i = 0, data = zext32(RS1), mask = zext32(RS2);
-while (mask) {
-	uint64_t b = mask & ~((mask | (mask-1)) + 1);
-	c |= (data << (ctz(b) - i)) & b;
-	i += popcount(b);
-	mask -= b;
-}
-WRITE_RD(sext32(c));
diff --git a/riscv/insns/bfp.h b/riscv/insns/bfp.h
deleted file mode 100644
index 886d840531..0000000000
--- a/riscv/insns/bfp.h
+++ /dev/null
@@ -1,10 +0,0 @@
-require_extension(EXT_XZBF);
-reg_t cfg = RS2 >> (xlen/2);
-if ((cfg >> 30) == 2)
-	cfg = cfg >> 16;
-int len = (cfg >> 8) & (xlen/2-1);
-int off = cfg & (xlen-1);
-len = len ? len : xlen/2;
-reg_t mask = ~(~reg_t(0) << len) << off;
-reg_t data = RS2 << off;
-WRITE_RD(sext_xlen((data & mask) | (RS1 & ~mask)));
diff --git a/riscv/insns/bfpw.h b/riscv/insns/bfpw.h
deleted file mode 100644
index 42479e72f4..0000000000
--- a/riscv/insns/bfpw.h
+++ /dev/null
@@ -1,9 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBF);
-reg_t cfg = RS2 >> 16;
-int len = (cfg >> 8) & 15;
-int off = cfg & 31;
-len = len ? len : 16;
-reg_t mask = ~(~reg_t(0) << len) << off;
-reg_t data = RS2 << off;
-WRITE_RD(sext32((data & mask) | (RS1 & ~mask)));
diff --git a/riscv/insns/bmatflip.h b/riscv/insns/bmatflip.h
deleted file mode 100644
index c10df8f9a1..0000000000
--- a/riscv/insns/bmatflip.h
+++ /dev/null
@@ -1,11 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBM);
-reg_t x = RS1;
-for (int i = 0; i < 3; i++) {
-	x = (x & 0xFFFF00000000FFFFLL) | ((x & 0x0000FFFF00000000LL) >> 16) | ((x & 0x00000000FFFF0000LL) << 16);
-	x = (x & 0xFF0000FFFF0000FFLL) | ((x & 0x00FF000000FF0000LL) >>  8) | ((x & 0x0000FF000000FF00LL) <<  8);
-	x = (x & 0xF00FF00FF00FF00FLL) | ((x & 0x0F000F000F000F00LL) >>  4) | ((x & 0x00F000F000F000F0LL) <<  4);
-	x = (x & 0xC3C3C3C3C3C3C3C3LL) | ((x & 0x3030303030303030LL) >>  2) | ((x & 0x0C0C0C0C0C0C0C0CLL) <<  2);
-	x = (x & 0x9999999999999999LL) | ((x & 0x4444444444444444LL) >>  1) | ((x & 0x2222222222222222LL) <<  1);
-}
-WRITE_RD(sext_xlen(x));
diff --git a/riscv/insns/bmator.h b/riscv/insns/bmator.h
deleted file mode 100644
index 33057ca04e..0000000000
--- a/riscv/insns/bmator.h
+++ /dev/null
@@ -1,29 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBM);
-
-// transpose of rs2
-int64_t rs2t = RS2;
-for (int i = 0; i < 3; i++) {
-  rs2t = (rs2t & 0xFFFF00000000FFFFLL) | ((rs2t & 0x0000FFFF00000000LL) >> 16) | ((rs2t & 0x00000000FFFF0000LL) << 16);
-  rs2t = (rs2t & 0xFF0000FFFF0000FFLL) | ((rs2t & 0x00FF000000FF0000LL) >>  8) | ((rs2t & 0x0000FF000000FF00LL) <<  8);
-  rs2t = (rs2t & 0xF00FF00FF00FF00FLL) | ((rs2t & 0x0F000F000F000F00LL) >>  4) | ((rs2t & 0x00F000F000F000F0LL) <<  4);
-  rs2t = (rs2t & 0xC3C3C3C3C3C3C3C3LL) | ((rs2t & 0x3030303030303030LL) >>  2) | ((rs2t & 0x0C0C0C0C0C0C0C0CLL) <<  2);
-  rs2t = (rs2t & 0x9999999999999999LL) | ((rs2t & 0x4444444444444444LL) >>  1) | ((rs2t & 0x2222222222222222LL) <<  1);
-}
-
-int64_t rs1 = RS1;
-uint8_t u[8]; // rows of rs1
-uint8_t v[8]; // cols of rs2
-
-for (int i = 0; i < 8; i++) {
-  u[i] = rs1 >> (i*8);
-  v[i] = rs2t >> (i*8);
-}
-
-uint64_t x = 0;
-for (int i = 0; i < 64; i++) {
-  if ((u[i / 8] & v[i % 8]) != 0)
-    x |= 1LL << i;
-}
-
-WRITE_RD(sext_xlen(x));
diff --git a/riscv/insns/bmatxor.h b/riscv/insns/bmatxor.h
deleted file mode 100644
index ca2d096715..0000000000
--- a/riscv/insns/bmatxor.h
+++ /dev/null
@@ -1,29 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBM);
-
-// transpose of rs2
-int64_t rs2t = RS2;
-for (int i = 0; i < 3; i++) {
-  rs2t = (rs2t & 0xFFFF00000000FFFFLL) | ((rs2t & 0x0000FFFF00000000LL) >> 16) | ((rs2t & 0x00000000FFFF0000LL) << 16);
-  rs2t = (rs2t & 0xFF0000FFFF0000FFLL) | ((rs2t & 0x00FF000000FF0000LL) >>  8) | ((rs2t & 0x0000FF000000FF00LL) <<  8);
-  rs2t = (rs2t & 0xF00FF00FF00FF00FLL) | ((rs2t & 0x0F000F000F000F00LL) >>  4) | ((rs2t & 0x00F000F000F000F0LL) <<  4);
-  rs2t = (rs2t & 0xC3C3C3C3C3C3C3C3LL) | ((rs2t & 0x3030303030303030LL) >>  2) | ((rs2t & 0x0C0C0C0C0C0C0C0CLL) <<  2);
-  rs2t = (rs2t & 0x9999999999999999LL) | ((rs2t & 0x4444444444444444LL) >>  1) | ((rs2t & 0x2222222222222222LL) <<  1);
-}
-
-int64_t rs1 = RS1;
-uint8_t u[8]; // rows of rs1
-uint8_t v[8]; // cols of rs2
-
-for (int i = 0; i < 8; i++) {
-  u[i] = rs1 >> (i*8);
-  v[i] = rs2t >> (i*8);
-}
-
-uint64_t x = 0;
-for (int i = 0; i < 64; i++) {
-  if (popcount(u[i / 8] & v[i % 8]) & 1)
-    x |= 1LL << i;
-}
-
-WRITE_RD(sext_xlen(x));
diff --git a/riscv/insns/clmulhw.h b/riscv/insns/clmulhw.h
deleted file mode 100644
index f41acb0e83..0000000000
--- a/riscv/insns/clmulhw.h
+++ /dev/null
@@ -1,6 +0,0 @@
-require_extension(EXT_XZBC);
-reg_t a = zext32(RS1), b = zext32(RS2), x = 0;
-for (int i = 1; i < 32; i++)
-  if ((b >> i) & 1)
-    x ^= a >> (32-i);
-WRITE_RD(sext32(x));
diff --git a/riscv/insns/clmulrw.h b/riscv/insns/clmulrw.h
deleted file mode 100644
index 784859ae74..0000000000
--- a/riscv/insns/clmulrw.h
+++ /dev/null
@@ -1,6 +0,0 @@
-require_extension(EXT_XZBC);
-reg_t a = zext32(RS1), b = zext32(RS2), x = 0;
-for (int i = 0; i < 32; i++)
-  if ((b >> i) & 1)
-    x ^= a >> (31-i);
-WRITE_RD(sext32(x));
diff --git a/riscv/insns/clmulw.h b/riscv/insns/clmulw.h
deleted file mode 100644
index 5bb753fe3d..0000000000
--- a/riscv/insns/clmulw.h
+++ /dev/null
@@ -1,6 +0,0 @@
-require_extension(EXT_XZBC);
-reg_t a = zext32(RS1), b = zext32(RS2), x = 0;
-for (int i = 0; i < 32; i++)
-  if ((b >> i) & 1)
-    x ^= a << i;
-WRITE_RD(sext32(x));
diff --git a/riscv/insns/cmov.h b/riscv/insns/cmov.h
deleted file mode 100644
index c7551bc645..0000000000
--- a/riscv/insns/cmov.h
+++ /dev/null
@@ -1,2 +0,0 @@
-require_extension(EXT_XZBT);
-WRITE_RD(RS2 ? RS1 : RS3);
diff --git a/riscv/insns/crc32_b.h b/riscv/insns/crc32_b.h
deleted file mode 100644
index 3111fe5728..0000000000
--- a/riscv/insns/crc32_b.h
+++ /dev/null
@@ -1,5 +0,0 @@
-require_extension(EXT_XZBR);
-reg_t x = zext_xlen(RS1);
-for (int i = 0; i < 8; i++)
-  x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
-WRITE_RD(sext_xlen(x));
diff --git a/riscv/insns/crc32_d.h b/riscv/insns/crc32_d.h
deleted file mode 100644
index 7fd7a38f2b..0000000000
--- a/riscv/insns/crc32_d.h
+++ /dev/null
@@ -1,6 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBR);
-reg_t x = zext_xlen(RS1);
-for (int i = 0; i < 64; i++)
-  x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
-WRITE_RD(sext_xlen(x));
diff --git a/riscv/insns/crc32_h.h b/riscv/insns/crc32_h.h
deleted file mode 100644
index 5063fefd6d..0000000000
--- a/riscv/insns/crc32_h.h
+++ /dev/null
@@ -1,5 +0,0 @@
-require_extension(EXT_XZBR);
-reg_t x = zext_xlen(RS1);
-for (int i = 0; i < 16; i++)
-  x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
-WRITE_RD(sext_xlen(x));
diff --git a/riscv/insns/crc32_w.h b/riscv/insns/crc32_w.h
deleted file mode 100644
index 6e425ab8d9..0000000000
--- a/riscv/insns/crc32_w.h
+++ /dev/null
@@ -1,5 +0,0 @@
-require_extension(EXT_XZBR);
-reg_t x = zext_xlen(RS1);
-for (int i = 0; i < 32; i++)
-  x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
-WRITE_RD(sext_xlen(x));
diff --git a/riscv/insns/crc32c_b.h b/riscv/insns/crc32c_b.h
deleted file mode 100644
index d11b0dda87..0000000000
--- a/riscv/insns/crc32c_b.h
+++ /dev/null
@@ -1,5 +0,0 @@
-require_extension(EXT_XZBR);
-reg_t x = zext_xlen(RS1);
-for (int i = 0; i < 8; i++)
-  x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
-WRITE_RD(sext_xlen(x));
diff --git a/riscv/insns/crc32c_d.h b/riscv/insns/crc32c_d.h
deleted file mode 100644
index 81175fd9c1..0000000000
--- a/riscv/insns/crc32c_d.h
+++ /dev/null
@@ -1,6 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBR);
-reg_t x = zext_xlen(RS1);
-for (int i = 0; i < 64; i++)
-  x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
-WRITE_RD(sext_xlen(x));
diff --git a/riscv/insns/crc32c_h.h b/riscv/insns/crc32c_h.h
deleted file mode 100644
index ef5817d99e..0000000000
--- a/riscv/insns/crc32c_h.h
+++ /dev/null
@@ -1,5 +0,0 @@
-require_extension(EXT_XZBR);
-reg_t x = zext_xlen(RS1);
-for (int i = 0; i < 16; i++)
-  x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
-WRITE_RD(sext_xlen(x));
diff --git a/riscv/insns/crc32c_w.h b/riscv/insns/crc32c_w.h
deleted file mode 100644
index 8793540297..0000000000
--- a/riscv/insns/crc32c_w.h
+++ /dev/null
@@ -1,5 +0,0 @@
-require_extension(EXT_XZBR);
-reg_t x = zext_xlen(RS1);
-for (int i = 0; i < 32; i++)
-  x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
-WRITE_RD(sext_xlen(x));
diff --git a/riscv/insns/fsl.h b/riscv/insns/fsl.h
deleted file mode 100644
index 53a21608d9..0000000000
--- a/riscv/insns/fsl.h
+++ /dev/null
@@ -1,9 +0,0 @@
-require_extension(EXT_XZBT);
-int shamt = RS2 & (2*xlen-1);
-reg_t a = RS1, b = RS3;
-if (shamt >= xlen) {
-	a = RS3, b = RS1;
-	shamt -= xlen;
-}
-int rshamt = -shamt & (xlen-1);
-WRITE_RD(sext_xlen(shamt ? (a << shamt) | (zext_xlen(b) >> rshamt) : a));
diff --git a/riscv/insns/fslw.h b/riscv/insns/fslw.h
deleted file mode 100644
index 83940105b8..0000000000
--- a/riscv/insns/fslw.h
+++ /dev/null
@@ -1,10 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBT);
-int shamt = RS2 & 63;
-reg_t a = RS1, b = RS3;
-if (shamt >= 32) {
-	a = RS3, b = RS1;
-	shamt -= 32;
-}
-int rshamt = -shamt & 31;
-WRITE_RD(sext32(shamt ? (a << shamt) | (zext32(b) >> rshamt) : a));
diff --git a/riscv/insns/fsriw.h b/riscv/insns/fsriw.h
deleted file mode 100644
index 7956de7ce7..0000000000
--- a/riscv/insns/fsriw.h
+++ /dev/null
@@ -1,10 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBT);
-int shamt = SHAMT & 63;
-reg_t a = RS1, b = RS3;
-if (shamt >= 32) {
-	a = RS3, b = RS1;
-	shamt -= 32;
-}
-int rshamt = -shamt & 31;
-WRITE_RD(sext32(shamt ? (b << rshamt) | (zext32(a) >> shamt) : a));
diff --git a/riscv/insns/gorc.h b/riscv/insns/gorc.h
deleted file mode 100644
index ffe441347d..0000000000
--- a/riscv/insns/gorc.h
+++ /dev/null
@@ -1,10 +0,0 @@
-require_extension(EXT_XZBP);
-reg_t x = RS1;
-int shamt = RS2 & (xlen-1);
-if (shamt &  1) x |= ((x & 0x5555555555555555LL) <<  1) | ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
-if (shamt &  2) x |= ((x & 0x3333333333333333LL) <<  2) | ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
-if (shamt &  4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) <<  4) | ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
-if (shamt &  8) x |= ((x & 0x00FF00FF00FF00FFLL) <<  8) | ((x & 0xFF00FF00FF00FF00LL) >>  8);
-if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) | ((x & 0xFFFF0000FFFF0000LL) >> 16);
-if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) | ((x & 0xFFFFFFFF00000000LL) >> 32);
-WRITE_RD(sext_xlen(x));
diff --git a/riscv/insns/gorciw.h b/riscv/insns/gorciw.h
deleted file mode 100644
index 44ade807ea..0000000000
--- a/riscv/insns/gorciw.h
+++ /dev/null
@@ -1,11 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBP);
-require(SHAMT < 32);
-reg_t x = RS1;
-int shamt = SHAMT;
-if (shamt &  1) x |= ((x & 0x5555555555555555LL) <<  1) | ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
-if (shamt &  2) x |= ((x & 0x3333333333333333LL) <<  2) | ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
-if (shamt &  4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) <<  4) | ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
-if (shamt &  8) x |= ((x & 0x00FF00FF00FF00FFLL) <<  8) | ((x & 0xFF00FF00FF00FF00LL) >>  8);
-if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) | ((x & 0xFFFF0000FFFF0000LL) >> 16);
-WRITE_RD(sext32(x));
diff --git a/riscv/insns/gorcw.h b/riscv/insns/gorcw.h
deleted file mode 100644
index 611b3caa43..0000000000
--- a/riscv/insns/gorcw.h
+++ /dev/null
@@ -1,10 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBP);
-reg_t x = RS1;
-int shamt = RS2 & 31;
-if (shamt &  1) x |= ((x & 0x5555555555555555LL) <<  1) | ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
-if (shamt &  2) x |= ((x & 0x3333333333333333LL) <<  2) | ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
-if (shamt &  4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) <<  4) | ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
-if (shamt &  8) x |= ((x & 0x00FF00FF00FF00FFLL) <<  8) | ((x & 0xFF00FF00FF00FF00LL) >>  8);
-if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) | ((x & 0xFFFF0000FFFF0000LL) >> 16);
-WRITE_RD(sext32(x));
diff --git a/riscv/insns/grev.h b/riscv/insns/grev.h
deleted file mode 100644
index 7181b3cda8..0000000000
--- a/riscv/insns/grev.h
+++ /dev/null
@@ -1,10 +0,0 @@
-require_extension(EXT_XZBP);
-reg_t x = RS1;
-int shamt = RS2 & (xlen-1);
-if (shamt &  1) x = ((x & 0x5555555555555555LL) <<  1) | ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
-if (shamt &  2) x = ((x & 0x3333333333333333LL) <<  2) | ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
-if (shamt &  4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) <<  4) | ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
-if (shamt &  8) x = ((x & 0x00FF00FF00FF00FFLL) <<  8) | ((x & 0xFF00FF00FF00FF00LL) >>  8);
-if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) | ((x & 0xFFFF0000FFFF0000LL) >> 16);
-if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) | ((x & 0xFFFFFFFF00000000LL) >> 32);
-WRITE_RD(sext_xlen(x));
diff --git a/riscv/insns/greviw.h b/riscv/insns/greviw.h
deleted file mode 100644
index 004ecf347c..0000000000
--- a/riscv/insns/greviw.h
+++ /dev/null
@@ -1,11 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBP);
-require(SHAMT < 32);
-reg_t x = RS1;
-int shamt = SHAMT;
-if (shamt &  1) x = ((x & 0x5555555555555555LL) <<  1) | ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
-if (shamt &  2) x = ((x & 0x3333333333333333LL) <<  2) | ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
-if (shamt &  4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) <<  4) | ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
-if (shamt &  8) x = ((x & 0x00FF00FF00FF00FFLL) <<  8) | ((x & 0xFF00FF00FF00FF00LL) >>  8);
-if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) | ((x & 0xFFFF0000FFFF0000LL) >> 16);
-WRITE_RD(sext32(x));
diff --git a/riscv/insns/grevw.h b/riscv/insns/grevw.h
deleted file mode 100644
index 3fbcf228d2..0000000000
--- a/riscv/insns/grevw.h
+++ /dev/null
@@ -1,10 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBP);
-reg_t x = RS1;
-int shamt = RS2 & 31;
-if (shamt &  1) x = ((x & 0x5555555555555555LL) <<  1) | ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
-if (shamt &  2) x = ((x & 0x3333333333333333LL) <<  2) | ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
-if (shamt &  4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) <<  4) | ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
-if (shamt &  8) x = ((x & 0x00FF00FF00FF00FFLL) <<  8) | ((x & 0xFF00FF00FF00FF00LL) >>  8);
-if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) | ((x & 0xFFFF0000FFFF0000LL) >> 16);
-WRITE_RD(sext32(x));
diff --git a/riscv/insns/packuw.h b/riscv/insns/packuw.h
deleted file mode 100644
index 1b3f7d5f54..0000000000
--- a/riscv/insns/packuw.h
+++ /dev/null
@@ -1,5 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBP);
-reg_t lo = zext32(RS1) >> 16;
-reg_t hi = zext32(RS2) >> 16 << 16;
-WRITE_RD(sext32(lo | hi));
diff --git a/riscv/insns/shfl.h b/riscv/insns/shfl.h
deleted file mode 100644
index 3004871e2c..0000000000
--- a/riscv/insns/shfl.h
+++ /dev/null
@@ -1,9 +0,0 @@
-require_extension(EXT_XZBP);
-reg_t x = RS1;
-int shamt = RS2 & ((xlen-1) >> 1);
-if (shamt & 16) x = (x & 0xFFFF00000000FFFFLL) | ((x & 0x0000FFFF00000000LL) >> 16) | ((x & 0x00000000FFFF0000LL) << 16);
-if (shamt &  8) x = (x & 0xFF0000FFFF0000FFLL) | ((x & 0x00FF000000FF0000LL) >>  8) | ((x & 0x0000FF000000FF00LL) <<  8);
-if (shamt &  4) x = (x & 0xF00FF00FF00FF00FLL) | ((x & 0x0F000F000F000F00LL) >>  4) | ((x & 0x00F000F000F000F0LL) <<  4);
-if (shamt &  2) x = (x & 0xC3C3C3C3C3C3C3C3LL) | ((x & 0x3030303030303030LL) >>  2) | ((x & 0x0C0C0C0C0C0C0C0CLL) <<  2);
-if (shamt &  1) x = (x & 0x9999999999999999LL) | ((x & 0x4444444444444444LL) >>  1) | ((x & 0x2222222222222222LL) <<  1);
-WRITE_RD(sext_xlen(x));
diff --git a/riscv/insns/shflw.h b/riscv/insns/shflw.h
deleted file mode 100644
index 06ee36045e..0000000000
--- a/riscv/insns/shflw.h
+++ /dev/null
@@ -1,9 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBP);
-reg_t x = RS1;
-int shamt = RS2 & 15;
-if (shamt &  8) x = (x & 0xFF0000FFFF0000FFLL) | ((x & 0x00FF000000FF0000LL) >>  8) | ((x & 0x0000FF000000FF00LL) <<  8);
-if (shamt &  4) x = (x & 0xF00FF00FF00FF00FLL) | ((x & 0x0F000F000F000F00LL) >>  4) | ((x & 0x00F000F000F000F0LL) <<  4);
-if (shamt &  2) x = (x & 0xC3C3C3C3C3C3C3C3LL) | ((x & 0x3030303030303030LL) >>  2) | ((x & 0x0C0C0C0C0C0C0C0CLL) <<  2);
-if (shamt &  1) x = (x & 0x9999999999999999LL) | ((x & 0x4444444444444444LL) >>  1) | ((x & 0x2222222222222222LL) <<  1);
-WRITE_RD(sext32(x));
diff --git a/riscv/insns/slo.h b/riscv/insns/slo.h
deleted file mode 100644
index a27ec37e2e..0000000000
--- a/riscv/insns/slo.h
+++ /dev/null
@@ -1,2 +0,0 @@
-require_extension(EXT_XZBP);
-WRITE_RD(sext_xlen(~((~RS1) << (RS2 & (xlen-1)))));
diff --git a/riscv/insns/sloi.h b/riscv/insns/sloi.h
deleted file mode 100644
index 62278b030b..0000000000
--- a/riscv/insns/sloi.h
+++ /dev/null
@@ -1,3 +0,0 @@
-require(SHAMT < xlen);
-require_extension(EXT_XZBP);
-WRITE_RD(sext_xlen(~((~RS1) << SHAMT)));
diff --git a/riscv/insns/sloiw.h b/riscv/insns/sloiw.h
deleted file mode 100644
index 492c94a112..0000000000
--- a/riscv/insns/sloiw.h
+++ /dev/null
@@ -1,3 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBP);
-WRITE_RD(sext32(~((~RS1) << SHAMT)));
diff --git a/riscv/insns/slow.h b/riscv/insns/slow.h
deleted file mode 100644
index 04c90a45d4..0000000000
--- a/riscv/insns/slow.h
+++ /dev/null
@@ -1,3 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBP);
-WRITE_RD(sext32(~((~RS1) << (RS2 & 0x1F))));
diff --git a/riscv/insns/sro.h b/riscv/insns/sro.h
deleted file mode 100644
index 3ac050daff..0000000000
--- a/riscv/insns/sro.h
+++ /dev/null
@@ -1,2 +0,0 @@
-require_extension(EXT_XZBP);
-WRITE_RD(sext_xlen(~((zext_xlen(~RS1)) >> (RS2 & (xlen-1)))));
diff --git a/riscv/insns/sroi.h b/riscv/insns/sroi.h
deleted file mode 100644
index e878892800..0000000000
--- a/riscv/insns/sroi.h
+++ /dev/null
@@ -1,3 +0,0 @@
-require(SHAMT < xlen);
-require_extension(EXT_XZBP);
-WRITE_RD(sext_xlen(~((zext_xlen(~RS1)) >> SHAMT)));
diff --git a/riscv/insns/sroiw.h b/riscv/insns/sroiw.h
deleted file mode 100644
index 83480705fa..0000000000
--- a/riscv/insns/sroiw.h
+++ /dev/null
@@ -1,3 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBP);
-WRITE_RD(sext32(~((~(uint32_t)RS1) >> SHAMT)));
diff --git a/riscv/insns/srow.h b/riscv/insns/srow.h
deleted file mode 100644
index 808af8dbbe..0000000000
--- a/riscv/insns/srow.h
+++ /dev/null
@@ -1,3 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBP);
-WRITE_RD(sext32(~((~(uint32_t)RS1) >> (RS2 & 0x1F))));
diff --git a/riscv/insns/unshfl.h b/riscv/insns/unshfl.h
deleted file mode 100644
index 78990b876f..0000000000
--- a/riscv/insns/unshfl.h
+++ /dev/null
@@ -1,9 +0,0 @@
-require_extension(EXT_XZBP);
-reg_t x = RS1;
-int shamt = RS2 & ((xlen-1) >> 1);
-if (shamt &  1) x = (x & 0x9999999999999999LL) | ((x & 0x4444444444444444LL) >>  1) | ((x & 0x2222222222222222LL) <<  1);
-if (shamt &  2) x = (x & 0xC3C3C3C3C3C3C3C3LL) | ((x & 0x3030303030303030LL) >>  2) | ((x & 0x0C0C0C0C0C0C0C0CLL) <<  2);
-if (shamt &  4) x = (x & 0xF00FF00FF00FF00FLL) | ((x & 0x0F000F000F000F00LL) >>  4) | ((x & 0x00F000F000F000F0LL) <<  4);
-if (shamt &  8) x = (x & 0xFF0000FFFF0000FFLL) | ((x & 0x00FF000000FF0000LL) >>  8) | ((x & 0x0000FF000000FF00LL) <<  8);
-if (shamt & 16) x = (x & 0xFFFF00000000FFFFLL) | ((x & 0x0000FFFF00000000LL) >> 16) | ((x & 0x00000000FFFF0000LL) << 16);
-WRITE_RD(sext_xlen(x));
diff --git a/riscv/insns/unshflw.h b/riscv/insns/unshflw.h
deleted file mode 100644
index 776534e742..0000000000
--- a/riscv/insns/unshflw.h
+++ /dev/null
@@ -1,9 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBP);
-reg_t x = RS1;
-int shamt = RS2 & 15;
-if (shamt &  1) x = (x & 0x9999999999999999LL) | ((x & 0x4444444444444444LL) >>  1) | ((x & 0x2222222222222222LL) <<  1);
-if (shamt &  2) x = (x & 0xC3C3C3C3C3C3C3C3LL) | ((x & 0x3030303030303030LL) >>  2) | ((x & 0x0C0C0C0C0C0C0C0CLL) <<  2);
-if (shamt &  4) x = (x & 0xF00FF00FF00FF00FLL) | ((x & 0x0F000F000F000F00LL) >>  4) | ((x & 0x00F000F000F000F0LL) <<  4);
-if (shamt &  8) x = (x & 0xFF0000FFFF0000FFLL) | ((x & 0x00FF000000FF0000LL) >>  8) | ((x & 0x0000FF000000FF00LL) <<  8);
-WRITE_RD(sext32(x));
diff --git a/riscv/insns/xperm16.h b/riscv/insns/xperm16.h
deleted file mode 100644
index 6b0ad51f0e..0000000000
--- a/riscv/insns/xperm16.h
+++ /dev/null
@@ -1,2 +0,0 @@
-require_extension(EXT_XZBP);
-WRITE_RD(sext_xlen(xperm(RS1, RS2, 4, xlen)));
diff --git a/riscv/insns/xperm32.h b/riscv/insns/xperm32.h
deleted file mode 100644
index 64d90a406d..0000000000
--- a/riscv/insns/xperm32.h
+++ /dev/null
@@ -1,3 +0,0 @@
-require_rv64;
-require_extension(EXT_XZBP);
-WRITE_RD(xperm(RS1, RS2, 5, xlen));
diff --git a/riscv/overlap_list.h b/riscv/overlap_list.h
index d8b1225866..a30c770e60 100644
--- a/riscv/overlap_list.h
+++ b/riscv/overlap_list.h
@@ -12,4 +12,3 @@ DECLARE_OVERLAP_INSN(c_fsd, EXT_ZCD)
 DECLARE_OVERLAP_INSN(c_ebreak, EXT_ZCA)
 DECLARE_OVERLAP_INSN(c_jalr, EXT_ZCA)
 DECLARE_OVERLAP_INSN(c_jr, EXT_ZCA)
-DECLARE_OVERLAP_INSN(cmov, EXT_XZBT)
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index ac45b2896a..db63290205 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -393,15 +393,6 @@ riscv_insn_ext_q_zfa = \
 riscv_insn_ext_b = \
 	add_uw \
 	andn \
-	bdecompress \
-	bdecompressw \
-	bcompress \
-	bcompressw \
-	bfp \
-	bfpw \
-	bmatflip \
-	bmator \
-	bmatxor \
 	sh1add \
 	sh1add_uw \
 	sh2add \
@@ -414,31 +405,13 @@ riscv_insn_ext_b = \
 	clz \
 	clzw \
 	cmix \
-	cmov \
-	crc32_b \
-	crc32c_b \
-	crc32c_d \
-	crc32c_h \
-	crc32c_w \
-	crc32_d \
-	crc32_h \
-	crc32_w \
 	ctz \
 	ctzw \
-	fsl \
-	fslw \
 	fsr \
 	fsri \
-	fsriw \
 	fsrw \
-	gorc \
 	gorci \
-	gorciw \
-	gorcw \
-	grev \
 	grevi \
-	greviw \
-	grevw \
 	max \
 	maxu \
 	min \
@@ -447,7 +420,6 @@ riscv_insn_ext_b = \
 	pack \
 	packh \
 	packu \
-	packuw \
 	packw \
 	cpop \
 	cpopw \
@@ -467,26 +439,12 @@ riscv_insn_ext_b = \
 	bseti \
 	sext_b \
 	sext_h \
-	shfl \
 	shfli \
-	shflw \
 	slli_uw \
-	slo \
-	sloi \
-	sloiw \
-	slow \
-	sro \
-	sroi \
-	sroiw \
-	srow \
-	unshfl \
 	unshfli \
-	unshflw \
 	xnor \
 	xperm4 \
 	xperm8 \
-	xperm16 \
-	xperm32 \
 
 # Scalar Crypto ISE
 riscv_insn_ext_k = \

From b043cc1d7430d6a4d982aa4d2b07b44dd4b2366c Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Sat, 17 Jun 2023 14:33:42 -0700
Subject: [PATCH 065/110] Remove Xbitmanip from instructions that belong to
 multiple extensions

---
 riscv/insns/cmix.h    | 2 +-
 riscv/insns/fsr.h     | 3 ++-
 riscv/insns/fsri.h    | 3 ++-
 riscv/insns/fsrw.h    | 2 +-
 riscv/insns/gorci.h   | 3 +--
 riscv/insns/grevi.h   | 3 +--
 riscv/insns/pack.h    | 6 +-----
 riscv/insns/packh.h   | 5 +----
 riscv/insns/packu.h   | 4 +---
 riscv/insns/packw.h   | 5 +----
 riscv/insns/shfli.h   | 3 +--
 riscv/insns/unshfli.h | 3 +--
 riscv/insns/xperm4.h  | 2 +-
 riscv/insns/xperm8.h  | 2 +-
 14 files changed, 16 insertions(+), 30 deletions(-)

diff --git a/riscv/insns/cmix.h b/riscv/insns/cmix.h
index 98eb0bca21..f3b79773a9 100644
--- a/riscv/insns/cmix.h
+++ b/riscv/insns/cmix.h
@@ -1,2 +1,2 @@
-require_either_extension(EXT_ZBPBO, EXT_XZBT);
+require_extension(EXT_ZBPBO);
 WRITE_RD((RS1 & RS2) | (RS3 & ~RS2));
diff --git a/riscv/insns/fsr.h b/riscv/insns/fsr.h
index dfb26f11e1..d94f922e96 100644
--- a/riscv/insns/fsr.h
+++ b/riscv/insns/fsr.h
@@ -1,4 +1,5 @@
-require_either_extension(xlen == 32 ? EXT_ZBPBO : EXT_XZBT, EXT_XZBT);
+require_rv32;
+require_extension(EXT_ZBPBO);
 int shamt = RS2 & (2*xlen-1);
 reg_t a = RS1, b = RS3;
 if (shamt >= xlen) {
diff --git a/riscv/insns/fsri.h b/riscv/insns/fsri.h
index f7186f1b6a..ced23642a9 100644
--- a/riscv/insns/fsri.h
+++ b/riscv/insns/fsri.h
@@ -1,4 +1,5 @@
-require_either_extension(xlen == 32 ? EXT_ZBPBO : EXT_XZBT, EXT_XZBT);
+require_rv32;
+require_extension(EXT_ZBPBO);
 int shamt = SHAMT & (2*xlen-1);
 reg_t a = RS1, b = RS3;
 if (shamt >= xlen) {
diff --git a/riscv/insns/fsrw.h b/riscv/insns/fsrw.h
index 494fe260cc..9471e36dcc 100644
--- a/riscv/insns/fsrw.h
+++ b/riscv/insns/fsrw.h
@@ -1,5 +1,5 @@
 require_rv64;
-require_either_extension(EXT_ZBPBO, EXT_XZBT);
+require_extension(EXT_ZBPBO);
 int shamt = RS2 & 63;
 reg_t a = RS1, b = RS3;
 if (shamt >= 32) {
diff --git a/riscv/insns/gorci.h b/riscv/insns/gorci.h
index d3017f499e..a4656faa03 100644
--- a/riscv/insns/gorci.h
+++ b/riscv/insns/gorci.h
@@ -1,6 +1,5 @@
 // Zbb contains orc.b but not general gorci
-require(((SHAMT == 7) && p->extension_enabled(EXT_ZBB))
-  || p->extension_enabled(EXT_XZBP));
+require(((SHAMT == 7) && p->extension_enabled(EXT_ZBB)));
 require(SHAMT < xlen);
 reg_t x = RS1;
 int shamt = SHAMT;
diff --git a/riscv/insns/grevi.h b/riscv/insns/grevi.h
index d4718145b4..c37f59b467 100644
--- a/riscv/insns/grevi.h
+++ b/riscv/insns/grevi.h
@@ -4,8 +4,7 @@ int shamt = SHAMT;
 require(((shamt == xlen - 8) && (p->extension_enabled(EXT_ZBB) || p->extension_enabled(EXT_ZBKB))) //rev8
   || ((shamt == 7) && p->extension_enabled(EXT_ZBKB)) // rev8.b
   || ((shamt == 8) && p->extension_enabled(EXT_ZPN)) // rev8.h
-  || ((shamt == xlen - 1) && p->extension_enabled(EXT_ZPN)) // rev
-  || p->extension_enabled(EXT_XZBP));
+  || ((shamt == xlen - 1) && p->extension_enabled(EXT_ZPN)));
 require(shamt < xlen);
 reg_t x = RS1;
 if (shamt &  1) x = ((x & 0x5555555555555555LL) <<  1) | ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
diff --git a/riscv/insns/pack.h b/riscv/insns/pack.h
index 2140f918d0..0622b92291 100644
--- a/riscv/insns/pack.h
+++ b/riscv/insns/pack.h
@@ -1,11 +1,7 @@
 // RV32Zbb contains zext.h but not general pack
 require(((xlen == 32) && (insn.rs2() == 0) && p->extension_enabled(EXT_ZBB))
   || p->extension_enabled(EXT_ZPN)
-  || p->extension_enabled(EXT_ZBKB)
-  || p->extension_enabled(EXT_XZBP)
-  || p->extension_enabled(EXT_XZBE)
-  || p->extension_enabled(EXT_XZBF)
-  || ((xlen == 64) && p->extension_enabled(EXT_XZBM)));
+  || p->extension_enabled(EXT_ZBKB));
 reg_t lo = zext_xlen(RS1 << (xlen/2)) >> (xlen/2);
 reg_t hi = zext_xlen(RS2 << (xlen/2));
 WRITE_RD(sext_xlen(lo | hi));
diff --git a/riscv/insns/packh.h b/riscv/insns/packh.h
index 82886e3293..0f3de5b974 100644
--- a/riscv/insns/packh.h
+++ b/riscv/insns/packh.h
@@ -1,7 +1,4 @@
-require(p->extension_enabled(EXT_ZBKB) ||
-        p->extension_enabled(EXT_XZBP) ||
-        p->extension_enabled(EXT_XZBE) ||
-        p->extension_enabled(EXT_XZBF));
+require_extension(EXT_ZBKB);
 reg_t lo = zext_xlen(RS1 << (xlen-8)) >> (xlen-8);
 reg_t hi = zext_xlen(RS2 << (xlen-8)) >> (xlen-16);
 WRITE_RD(sext_xlen(lo | hi));
diff --git a/riscv/insns/packu.h b/riscv/insns/packu.h
index 441207c32e..0676429f80 100644
--- a/riscv/insns/packu.h
+++ b/riscv/insns/packu.h
@@ -1,6 +1,4 @@
-require(p->extension_enabled(EXT_ZPN) ||
-        p->extension_enabled(EXT_XZBP) ||
-        ((xlen == 64) && p->extension_enabled(EXT_XZBM)));
+require_extension(EXT_ZPN);
 reg_t lo = zext_xlen(RS1) >> (xlen/2);
 reg_t hi = zext_xlen(RS2) >> (xlen/2) << (xlen/2);
 WRITE_RD(sext_xlen(lo | hi));
diff --git a/riscv/insns/packw.h b/riscv/insns/packw.h
index 084c190d0d..dd78717c05 100644
--- a/riscv/insns/packw.h
+++ b/riscv/insns/packw.h
@@ -1,9 +1,6 @@
 // RV64Zbb contains zext.h but not general packw
 require(((insn.rs2() == 0) && p->extension_enabled(EXT_ZBB))
-  || p->extension_enabled(EXT_ZBKB)
-  || p->extension_enabled(EXT_XZBP)
-  || p->extension_enabled(EXT_XZBE)
-  || p->extension_enabled(EXT_XZBF));
+  || p->extension_enabled(EXT_ZBKB));
 require_rv64;
 reg_t lo = zext32(RS1 << 16) >> 16;
 reg_t hi = zext32(RS2 << 16);
diff --git a/riscv/insns/shfli.h b/riscv/insns/shfli.h
index f8636190f0..bb21d2c9d0 100644
--- a/riscv/insns/shfli.h
+++ b/riscv/insns/shfli.h
@@ -1,6 +1,5 @@
 // Zbkb contains zip but not general shfli
-require(((insn.rs2() == (xlen / 2 - 1)) && p->extension_enabled(EXT_ZBKB))
-  || p->extension_enabled(EXT_XZBP));
+require(((insn.rs2() == (xlen / 2 - 1)) && p->extension_enabled(EXT_ZBKB)));
 require(SHAMT < (xlen/2));
 reg_t x = RS1;
 int shamt = SHAMT & ((xlen-1) >> 1);
diff --git a/riscv/insns/unshfli.h b/riscv/insns/unshfli.h
index 26920f1403..5a9cff1c45 100644
--- a/riscv/insns/unshfli.h
+++ b/riscv/insns/unshfli.h
@@ -1,6 +1,5 @@
 // Zbkb contains unzip but not general unshfli
-require(((insn.rs2() == (xlen / 2 - 1)) && p->extension_enabled(EXT_ZBKB))
-  || p->extension_enabled(EXT_XZBP));
+require(((insn.rs2() == (xlen / 2 - 1)) && p->extension_enabled(EXT_ZBKB)));
 require(SHAMT < (xlen/2));
 reg_t x = RS1;
 int shamt = SHAMT & ((xlen-1) >> 1);
diff --git a/riscv/insns/xperm4.h b/riscv/insns/xperm4.h
index 38800f3bfb..a9d685f3f6 100644
--- a/riscv/insns/xperm4.h
+++ b/riscv/insns/xperm4.h
@@ -1,2 +1,2 @@
-require_either_extension(EXT_ZBKX, EXT_XZBP);
+require_extension(EXT_ZBKX);
 WRITE_RD(sext_xlen(xperm(RS1, RS2, 2, xlen)));
diff --git a/riscv/insns/xperm8.h b/riscv/insns/xperm8.h
index c272d66949..1ba48efc6c 100644
--- a/riscv/insns/xperm8.h
+++ b/riscv/insns/xperm8.h
@@ -1,2 +1,2 @@
-require_either_extension(EXT_ZBKX, EXT_XZBP);
+require_extension(EXT_ZBKX);
 WRITE_RD(sext_xlen(xperm(RS1, RS2, 3, xlen)));

From 961d6def2131fa0a831083684fc77b83592a2175 Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Sat, 17 Jun 2023 14:34:22 -0700
Subject: [PATCH 066/110] Remove Xbitmanip from disassembler

---
 disasm/disasm.cc | 34 +---------------------------------
 1 file changed, 1 insertion(+), 33 deletions(-)

diff --git a/disasm/disasm.cc b/disasm/disasm.cc
index 25de783ae3..8722cdb977 100644
--- a/disasm/disasm.cc
+++ b/disasm/disasm.cc
@@ -2144,38 +2144,6 @@ void disassembler_t::add_instructions(const isa_parser_t* isa)
     }
   }
 
-  if (isa->extension_enabled(EXT_XZBP)) {
-    DEFINE_ITYPE_SHIFT(grevi);
-    DEFINE_ITYPE_SHIFT(gorci);
-    DEFINE_RTYPE(pack);
-    DEFINE_RTYPE(packh);
-    DEFINE_RTYPE(packu);
-    DEFINE_RTYPE(grev);
-    DEFINE_RTYPE(gorc);
-    DEFINE_RTYPE(xperm4);
-    DEFINE_RTYPE(xperm8);
-    DEFINE_RTYPE(xperm16);
-    DEFINE_RTYPE(xperm32);
-  }
-
-  if (isa->extension_enabled(EXT_XZBP) ||
-      isa->extension_enabled(EXT_XZBE) ||
-      isa->extension_enabled(EXT_XZBF)) {
-    if(isa->get_max_xlen() == 64) {
-      DEFINE_RTYPE(packw);
-    }
-  }
-
-  if (isa->extension_enabled(EXT_XZBT)) {
-    DEFINE_R3TYPE(cmix);
-    DEFINE_R3TYPE(fsr);
-    DEFINE_R3TYPE(fsri);
-    if(isa->get_max_xlen() == 64) {
-      DEFINE_R3TYPE(fsriw);
-      DEFINE_R3TYPE(fsrw);
-    }
-  }
-
   if (isa->extension_enabled(EXT_ZICBOM)) {
     DISASM_INSN("cbo.clean", cbo_clean, 0, {&base_only_address});
     DISASM_INSN("cbo.flush", cbo_flush, 0, {&base_only_address});
@@ -2252,7 +2220,7 @@ disassembler_t::disassembler_t(const isa_parser_t *isa)
 
   // next-highest priority: other instructions in same base ISA
   std::string fallback_isa_string = std::string("rv") + std::to_string(isa->get_max_xlen()) +
-    "gqchv_zfh_zba_zbb_zbc_zbs_zcb_zicbom_zicboz_zkn_zkr_zks_svinval_xbitmanip";
+    "gqchv_zfh_zba_zbb_zbc_zbs_zcb_zicbom_zicboz_zkn_zkr_zks_svinval";
   isa_parser_t fallback_isa(fallback_isa_string.c_str(), DEFAULT_PRIV);
   add_instructions(&fallback_isa);
 

From 69389df41cccc2853709e5a18f7c87693f4b0c3d Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Sat, 17 Jun 2023 14:24:37 -0700
Subject: [PATCH 067/110] isa parser: reject Xbitmanip extensions

---
 riscv/isa_parser.cc | 27 +--------------------------
 riscv/isa_parser.h  |  8 --------
 2 files changed, 1 insertion(+), 34 deletions(-)

diff --git a/riscv/isa_parser.cc b/riscv/isa_parser.cc
index bd73b0c39f..8bb8c495b3 100644
--- a/riscv/isa_parser.cc
+++ b/riscv/isa_parser.cc
@@ -242,32 +242,7 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
         extension_table[EXT_SSTC] = true;
     } else if (ext_str[0] == 'x') {
       extension_table['X'] = true;
-      if (ext_str == "xbitmanip") {
-        extension_table[EXT_XZBP] = true;
-        extension_table[EXT_XZBS] = true;
-        extension_table[EXT_XZBE] = true;
-        extension_table[EXT_XZBF] = true;
-        extension_table[EXT_XZBC] = true;
-        extension_table[EXT_XZBM] = true;
-        extension_table[EXT_XZBR] = true;
-        extension_table[EXT_XZBT] = true;
-      } else if (ext_str == "xzbp") {
-        extension_table[EXT_XZBP] = true;
-      } else if (ext_str == "xzbs") {
-        extension_table[EXT_XZBS] = true;
-      } else if (ext_str == "xzbe") {
-        extension_table[EXT_XZBE] = true;
-      } else if (ext_str == "xzbf") {
-        extension_table[EXT_XZBF] = true;
-      } else if (ext_str == "xzbc") {
-        extension_table[EXT_XZBC] = true;
-      } else if (ext_str == "xzbm") {
-        extension_table[EXT_XZBM] = true;
-      } else if (ext_str == "xzbr") {
-        extension_table[EXT_XZBR] = true;
-      } else if (ext_str == "xzbt") {
-        extension_table[EXT_XZBT] = true;
-      } else if (ext_str.size() == 1) {
+      if (ext_str.size() == 1) {
         bad_isa_string(str, "single 'X' is not a proper name");
       } else if (ext_str != "xdummy") {
          extension_t* x = find_extension(ext_str.substr(1).c_str())();
diff --git a/riscv/isa_parser.h b/riscv/isa_parser.h
index 7558116869..4e6856195c 100644
--- a/riscv/isa_parser.h
+++ b/riscv/isa_parser.h
@@ -60,14 +60,6 @@ typedef enum {
   EXT_ZIHPM,
   EXT_ZVFBFMIN,
   EXT_ZVFBFWMA,
-  EXT_XZBP,
-  EXT_XZBS,
-  EXT_XZBE,
-  EXT_XZBF,
-  EXT_XZBC,
-  EXT_XZBM,
-  EXT_XZBR,
-  EXT_XZBT,
   EXT_SSTC,
   EXT_INTERNAL_ZFH_MOVE,
   NUM_ISA_EXTENSIONS

From 58f9ba084c8943de29caa4503f734a6f752b1068 Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Sat, 17 Jun 2023 14:34:32 -0700
Subject: [PATCH 068/110] Remove Xbitmanip from README

---
 README.md | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/README.md b/README.md
index 8d5dc45b94..e850b6e459 100644
--- a/README.md
+++ b/README.md
@@ -53,20 +53,6 @@ Spike supports the following RISC-V ISA features:
   - Zvfbfmin extension, v0.6
   - Zvfbfwma extension, v0.6
 
-As a Spike extension, the remainder of the proposed
-[Bit-Manipulation Extensions](https://github.com/riscv/riscv-bitmanip)
-is provided under the Spike-custom extension name _Xbitmanip_.
-These instructions (and, of course, the extension name) are not RISC-V
-standards.
-
-These proposed bit-manipulation extensions can be split into further
-groups: Zbp, Zbs, Zbe, Zbf, Zbc, Zbm, Zbr, Zbt. Note that Zbc is
-ratified, but the original proposal contained some extra instructions
-(64-bit carryless multiplies) which are captured here.
-
-To enable these extensions individually, use the Spike-custom
-extension names _XZbp_, _XZbs_, _XZbc_, and so on.
-
 Versioning and APIs
 -------------------
 

From 270f408a7be7f574048e0431f172e13140d88045 Mon Sep 17 00:00:00 2001
From: Philipp Tomsich <philipp.tomsich@vrull.eu>
Date: Sun, 18 Jun 2023 22:42:21 +0200
Subject: [PATCH 069/110] Makefile: fix typo in check target

The check target processes the output using grep; however, one of the
patterns misspelled 'Segmentation' as 'Segementation'.  Fix the typo.
---
 Makefile.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.in b/Makefile.in
index 69f0405d7b..01d7baca41 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -319,7 +319,7 @@ $(2)_junk += \
 all-$(1) : lib$(1).a $$($(2)_install_prog_exes)
 
 check-$(1) : $$($(2)_test_outs)
-	echo; grep -h -e'Unit Tests' -e'FAILED' -e'Segementation' $$^; echo
+	echo; grep -h -e'Unit Tests' -e'FAILED' -e'Segmentation' $$^; echo
 
 clean-$(1) :
 	rm -rf $$($(2)_junk)

From 07e7626e5692ae6bb5773ddb5493ba838debca86 Mon Sep 17 00:00:00 2001
From: Gianluca Guida <gianluca@rivosinc.com>
Date: Thu, 25 May 2023 13:19:47 +0100
Subject: [PATCH 070/110] fesvr: support int128_t/uint128_t

Also remove the now-duplicate type definitions from riscv/decode_macros.h.
---
 fesvr/byteorder.h     | 7 +++++++
 riscv/decode_macros.h | 5 -----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/fesvr/byteorder.h b/fesvr/byteorder.h
index 2b1dbf981c..d9e503a271 100644
--- a/fesvr/byteorder.h
+++ b/fesvr/byteorder.h
@@ -15,6 +15,13 @@ static inline int16_t swap(int16_t n) { return int16_t(swap(uint16_t(n))); }
 static inline int32_t swap(int32_t n) { return int32_t(swap(uint32_t(n))); }
 static inline int64_t swap(int64_t n) { return int64_t(swap(uint64_t(n))); }
 
+#ifdef HAVE_INT128
+typedef __int128 int128_t;
+typedef unsigned __int128 uint128_t;
+static inline uint128_t swap(uint128_t n) { return (uint128_t(swap(uint64_t(n))) << 64) | swap(uint64_t(n >> 64)); }
+static inline int128_t swap(int128_t n) { return int128_t(swap(uint128_t(n))); }
+#endif
+
 #ifdef WORDS_BIGENDIAN
 template<typename T> static inline T from_be(T n) { return n; }
 template<typename T> static inline T to_be(T n) { return n; }
diff --git a/riscv/decode_macros.h b/riscv/decode_macros.h
index 7ba132c196..f39149b172 100644
--- a/riscv/decode_macros.h
+++ b/riscv/decode_macros.h
@@ -10,11 +10,6 @@
 #include "softfloat_types.h"
 #include "specialize.h"
 
-#ifdef HAVE_INT128
-typedef __int128 int128_t;
-typedef unsigned __int128 uint128_t;
-#endif
-
 // helpful macros, etc
 #define MMU (*p->get_mmu())
 #define STATE (*p->get_state())

From 4ac7e03dfb7a45d2732fadfe29e1af30ef25bcac Mon Sep 17 00:00:00 2001
From: Gianluca Guida <gianluca@rivosinc.com>
Date: Thu, 25 May 2023 13:27:45 +0100
Subject: [PATCH 071/110] mmu: support load/store longer than 64-bits.

---
 riscv/mmu.cc | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index 358ccd3e42..3f90060e82 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -242,6 +242,11 @@ void mmu_t::load_slow_path(reg_t addr, reg_t len, uint8_t* bytes, xlate_flags_t
       load_slow_path_intrapage(len - len_page0, bytes + len_page0, access_info.split_misaligned_access(len_page0));
   }
 
+  while (len > sizeof(reg_t)) {
+    check_triggers(triggers::OPERATION_LOAD, addr, access_info.effective_virt, reg_from_bytes(sizeof(reg_t), bytes));
+    len -= sizeof(reg_t);
+    bytes += sizeof(reg_t);
+  }
   check_triggers(triggers::OPERATION_LOAD, addr, access_info.effective_virt, reg_from_bytes(len, bytes));
 }
 
@@ -275,8 +280,16 @@ void mmu_t::store_slow_path_intrapage(reg_t len, const uint8_t* bytes, mem_acces
 void mmu_t::store_slow_path(reg_t addr, reg_t len, const uint8_t* bytes, xlate_flags_t xlate_flags, bool actually_store, bool UNUSED require_alignment)
 {
   auto access_info = generate_access_info(addr, STORE, xlate_flags);
-  if (actually_store)
-    check_triggers(triggers::OPERATION_STORE, addr, access_info.effective_virt, reg_from_bytes(len, bytes));
+  if (actually_store) {
+    reg_t trig_len = len;
+    const uint8_t* trig_bytes = bytes;
+    while (trig_len > sizeof(reg_t)) {
+      check_triggers(triggers::OPERATION_STORE, addr, access_info.effective_virt, reg_from_bytes(sizeof(reg_t), trig_bytes));
+      trig_len -= sizeof(reg_t);
+      trig_bytes += sizeof(reg_t);
+    }
+    check_triggers(triggers::OPERATION_STORE, addr, access_info.effective_virt, reg_from_bytes(trig_len, trig_bytes));
+  }
 
   if (addr & (len - 1)) {
     bool gva = access_info.effective_virt;

From bfdc0f8ef7598532c096b2293535fff70218f6a5 Mon Sep 17 00:00:00 2001
From: Gianluca Guida <gianluca@rivosinc.com>
Date: Mon, 19 Jun 2023 17:58:06 +0100
Subject: [PATCH 072/110] regenerate encoding.h

---
 riscv/encoding.h | 134 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 133 insertions(+), 1 deletion(-)

diff --git a/riscv/encoding.h b/riscv/encoding.h
index e39f535ceb..db7b0215b6 100644
--- a/riscv/encoding.h
+++ b/riscv/encoding.h
@@ -4,7 +4,7 @@
 
 /*
  * This file is auto-generated by running 'make' in
- * https://github.com/riscv/riscv-opcodes (8d70e77)
+ * https://github.com/riscv/riscv-opcodes (3ca60c5)
  */
 
 #ifndef RISCV_CSR_ENCODING_H
@@ -421,6 +421,12 @@
 #define MASK_AMOAND_D 0xf800707f
 #define MATCH_AMOAND_W 0x6000202f
 #define MASK_AMOAND_W 0xf800707f
+#define MATCH_AMOCAS_D 0x2800302f
+#define MASK_AMOCAS_D 0xf800707f
+#define MATCH_AMOCAS_Q 0x2800402f
+#define MASK_AMOCAS_Q 0xf800707f
+#define MATCH_AMOCAS_W 0x2800202f
+#define MASK_AMOCAS_W 0xf800707f
 #define MATCH_AMOMAX_D 0xa000302f
 #define MASK_AMOMAX_D 0xf800707f
 #define MATCH_AMOMAX_W 0xa000202f
@@ -1995,6 +2001,28 @@
 #define MASK_VADD_VV 0xfc00707f
 #define MATCH_VADD_VX 0x4057
 #define MASK_VADD_VX 0xfc00707f
+#define MATCH_VAESDF_VS 0xa600a077
+#define MASK_VAESDF_VS 0xfe0ff07f
+#define MATCH_VAESDF_VV 0xa200a077
+#define MASK_VAESDF_VV 0xfe0ff07f
+#define MATCH_VAESDM_VS 0xa6002077
+#define MASK_VAESDM_VS 0xfe0ff07f
+#define MATCH_VAESDM_VV 0xa2002077
+#define MASK_VAESDM_VV 0xfe0ff07f
+#define MATCH_VAESEF_VS 0xa601a077
+#define MASK_VAESEF_VS 0xfe0ff07f
+#define MATCH_VAESEF_VV 0xa201a077
+#define MASK_VAESEF_VV 0xfe0ff07f
+#define MATCH_VAESEM_VS 0xa6012077
+#define MASK_VAESEM_VS 0xfe0ff07f
+#define MATCH_VAESEM_VV 0xa2012077
+#define MASK_VAESEM_VV 0xfe0ff07f
+#define MATCH_VAESKF1_VI 0x8a002077
+#define MASK_VAESKF1_VI 0xfe00707f
+#define MATCH_VAESKF2_VI 0xaa002077
+#define MASK_VAESKF2_VI 0xfe00707f
+#define MATCH_VAESZ_VS 0xa603a077
+#define MASK_VAESZ_VS 0xfe0ff07f
 #define MATCH_VAMOADDEI16_V 0x502f
 #define MASK_VAMOADDEI16_V 0xf800707f
 #define MATCH_VAMOADDEI32_V 0x602f
@@ -2073,6 +2101,10 @@
 #define MASK_VAND_VV 0xfc00707f
 #define MATCH_VAND_VX 0x24004057
 #define MASK_VAND_VX 0xfc00707f
+#define MATCH_VANDN_VV 0x4000057
+#define MASK_VANDN_VV 0xfc00707f
+#define MATCH_VANDN_VX 0x4004057
+#define MASK_VANDN_VX 0xfc00707f
 #define MATCH_VASUB_VV 0x2c002057
 #define MASK_VASUB_VV 0xfc00707f
 #define MATCH_VASUB_VX 0x2c006057
@@ -2081,10 +2113,28 @@
 #define MASK_VASUBU_VV 0xfc00707f
 #define MATCH_VASUBU_VX 0x28006057
 #define MASK_VASUBU_VX 0xfc00707f
+#define MATCH_VBREV8_V 0x48042057
+#define MASK_VBREV8_V 0xfc0ff07f
+#define MATCH_VBREV_V 0x48052057
+#define MASK_VBREV_V 0xfc0ff07f
+#define MATCH_VCLMUL_VV 0x30002057
+#define MASK_VCLMUL_VV 0xfc00707f
+#define MATCH_VCLMUL_VX 0x30006057
+#define MASK_VCLMUL_VX 0xfc00707f
+#define MATCH_VCLMULH_VV 0x34002057
+#define MASK_VCLMULH_VV 0xfc00707f
+#define MATCH_VCLMULH_VX 0x34006057
+#define MASK_VCLMULH_VX 0xfc00707f
+#define MATCH_VCLZ_V 0x48062057
+#define MASK_VCLZ_V 0xfc0ff07f
 #define MATCH_VCOMPRESS_VM 0x5e002057
 #define MASK_VCOMPRESS_VM 0xfe00707f
 #define MATCH_VCPOP_M 0x40082057
 #define MASK_VCPOP_M 0xfc0ff07f
+#define MATCH_VCPOP_V 0x48072057
+#define MASK_VCPOP_V 0xfc0ff07f
+#define MATCH_VCTZ_V 0x4806a057
+#define MASK_VCTZ_V 0xfc0ff07f
 #define MATCH_VDIV_VV 0x84002057
 #define MASK_VDIV_VV 0xfc00707f
 #define MATCH_VDIV_VX 0x84006057
@@ -2285,6 +2335,10 @@
 #define MASK_VFWSUB_WF 0xfc00707f
 #define MATCH_VFWSUB_WV 0xd8001057
 #define MASK_VFWSUB_WV 0xfc00707f
+#define MATCH_VGHSH_VV 0xb2002077
+#define MASK_VGHSH_VV 0xfe00707f
+#define MATCH_VGMUL_VV 0xa208a077
+#define MASK_VGMUL_VV 0xfe0ff07f
 #define MATCH_VID_V 0x5008a057
 #define MASK_VID_V 0xfdfff07f
 #define MATCH_VIOTA_M 0x50082057
@@ -2631,6 +2685,8 @@
 #define MASK_VREMU_VV 0xfc00707f
 #define MATCH_VREMU_VX 0x88006057
 #define MASK_VREMU_VX 0xfc00707f
+#define MATCH_VREV8_V 0x4804a057
+#define MASK_VREV8_V 0xfc0ff07f
 #define MATCH_VRGATHER_VI 0x30003057
 #define MASK_VRGATHER_VI 0xfc00707f
 #define MATCH_VRGATHER_VV 0x30000057
@@ -2639,6 +2695,16 @@
 #define MASK_VRGATHER_VX 0xfc00707f
 #define MATCH_VRGATHEREI16_VV 0x38000057
 #define MASK_VRGATHEREI16_VV 0xfc00707f
+#define MATCH_VROL_VV 0x54000057
+#define MASK_VROL_VV 0xfc00707f
+#define MATCH_VROL_VX 0x54004057
+#define MASK_VROL_VX 0xfc00707f
+#define MATCH_VROR_VI 0x50003057
+#define MASK_VROR_VI 0xf800707f
+#define MATCH_VROR_VV 0x50000057
+#define MASK_VROR_VV 0xfc00707f
+#define MATCH_VROR_VX 0x50004057
+#define MASK_VROR_VX 0xfc00707f
 #define MATCH_VRSUB_VI 0xc003057
 #define MASK_VRSUB_VI 0xfc00707f
 #define MATCH_VRSUB_VX 0xc004057
@@ -2695,6 +2761,12 @@
 #define MASK_VSEXT_VF4 0xfc0ff07f
 #define MATCH_VSEXT_VF8 0x4801a057
 #define MASK_VSEXT_VF8 0xfc0ff07f
+#define MATCH_VSHA2CH_VV 0xba002077
+#define MASK_VSHA2CH_VV 0xfe00707f
+#define MATCH_VSHA2CL_VV 0xbe002077
+#define MASK_VSHA2CL_VV 0xfe00707f
+#define MATCH_VSHA2MS_VV 0xb6002077
+#define MASK_VSHA2MS_VV 0xfe00707f
 #define MATCH_VSLIDE1DOWN_VX 0x3c006057
 #define MASK_VSLIDE1DOWN_VX 0xfc00707f
 #define MATCH_VSLIDE1UP_VX 0x38006057
@@ -2713,6 +2785,16 @@
 #define MASK_VSLL_VV 0xfc00707f
 #define MATCH_VSLL_VX 0x94004057
 #define MASK_VSLL_VX 0xfc00707f
+#define MATCH_VSM3C_VI 0xae002077
+#define MASK_VSM3C_VI 0xfe00707f
+#define MATCH_VSM3ME_VV 0x82002077
+#define MASK_VSM3ME_VV 0xfe00707f
+#define MATCH_VSM4K_VI 0x86002077
+#define MASK_VSM4K_VI 0xfe00707f
+#define MATCH_VSM4R_VS 0xa6082077
+#define MASK_VSM4R_VS 0xfe0ff07f
+#define MATCH_VSM4R_VV 0xa2082077
+#define MASK_VSM4R_VV 0xfe0ff07f
 #define MATCH_VSM_V 0x2b00027
 #define MASK_VSM_V 0xfff0707f
 #define MATCH_VSMUL_VV 0x9c000057
@@ -2849,6 +2931,12 @@
 #define MASK_VWREDSUM_VS 0xfc00707f
 #define MATCH_VWREDSUMU_VS 0xc0000057
 #define MASK_VWREDSUMU_VS 0xfc00707f
+#define MATCH_VWSLL_VI 0xd4003057
+#define MASK_VWSLL_VI 0xfc00707f
+#define MATCH_VWSLL_VV 0xd4000057
+#define MASK_VWSLL_VV 0xfc00707f
+#define MATCH_VWSLL_VX 0xd4004057
+#define MASK_VWSLL_VX 0xfc00707f
 #define MATCH_VWSUB_VV 0xcc002057
 #define MASK_VWSUB_VV 0xfc00707f
 #define MATCH_VWSUB_VX 0xcc006057
@@ -3486,6 +3574,9 @@ DECLARE_INSN(amoadd_d, MATCH_AMOADD_D, MASK_AMOADD_D)
 DECLARE_INSN(amoadd_w, MATCH_AMOADD_W, MASK_AMOADD_W)
 DECLARE_INSN(amoand_d, MATCH_AMOAND_D, MASK_AMOAND_D)
 DECLARE_INSN(amoand_w, MATCH_AMOAND_W, MASK_AMOAND_W)
+DECLARE_INSN(amocas_d, MATCH_AMOCAS_D, MASK_AMOCAS_D)
+DECLARE_INSN(amocas_q, MATCH_AMOCAS_Q, MASK_AMOCAS_Q)
+DECLARE_INSN(amocas_w, MATCH_AMOCAS_W, MASK_AMOCAS_W)
 DECLARE_INSN(amomax_d, MATCH_AMOMAX_D, MASK_AMOMAX_D)
 DECLARE_INSN(amomax_w, MATCH_AMOMAX_W, MASK_AMOMAX_W)
 DECLARE_INSN(amomaxu_d, MATCH_AMOMAXU_D, MASK_AMOMAXU_D)
@@ -4273,6 +4364,17 @@ DECLARE_INSN(vadc_vxm, MATCH_VADC_VXM, MASK_VADC_VXM)
 DECLARE_INSN(vadd_vi, MATCH_VADD_VI, MASK_VADD_VI)
 DECLARE_INSN(vadd_vv, MATCH_VADD_VV, MASK_VADD_VV)
 DECLARE_INSN(vadd_vx, MATCH_VADD_VX, MASK_VADD_VX)
+DECLARE_INSN(vaesdf_vs, MATCH_VAESDF_VS, MASK_VAESDF_VS)
+DECLARE_INSN(vaesdf_vv, MATCH_VAESDF_VV, MASK_VAESDF_VV)
+DECLARE_INSN(vaesdm_vs, MATCH_VAESDM_VS, MASK_VAESDM_VS)
+DECLARE_INSN(vaesdm_vv, MATCH_VAESDM_VV, MASK_VAESDM_VV)
+DECLARE_INSN(vaesef_vs, MATCH_VAESEF_VS, MASK_VAESEF_VS)
+DECLARE_INSN(vaesef_vv, MATCH_VAESEF_VV, MASK_VAESEF_VV)
+DECLARE_INSN(vaesem_vs, MATCH_VAESEM_VS, MASK_VAESEM_VS)
+DECLARE_INSN(vaesem_vv, MATCH_VAESEM_VV, MASK_VAESEM_VV)
+DECLARE_INSN(vaeskf1_vi, MATCH_VAESKF1_VI, MASK_VAESKF1_VI)
+DECLARE_INSN(vaeskf2_vi, MATCH_VAESKF2_VI, MASK_VAESKF2_VI)
+DECLARE_INSN(vaesz_vs, MATCH_VAESZ_VS, MASK_VAESZ_VS)
 DECLARE_INSN(vamoaddei16_v, MATCH_VAMOADDEI16_V, MASK_VAMOADDEI16_V)
 DECLARE_INSN(vamoaddei32_v, MATCH_VAMOADDEI32_V, MASK_VAMOADDEI32_V)
 DECLARE_INSN(vamoaddei64_v, MATCH_VAMOADDEI64_V, MASK_VAMOADDEI64_V)
@@ -4312,12 +4414,23 @@ DECLARE_INSN(vamoxorei8_v, MATCH_VAMOXOREI8_V, MASK_VAMOXOREI8_V)
 DECLARE_INSN(vand_vi, MATCH_VAND_VI, MASK_VAND_VI)
 DECLARE_INSN(vand_vv, MATCH_VAND_VV, MASK_VAND_VV)
 DECLARE_INSN(vand_vx, MATCH_VAND_VX, MASK_VAND_VX)
+DECLARE_INSN(vandn_vv, MATCH_VANDN_VV, MASK_VANDN_VV)
+DECLARE_INSN(vandn_vx, MATCH_VANDN_VX, MASK_VANDN_VX)
 DECLARE_INSN(vasub_vv, MATCH_VASUB_VV, MASK_VASUB_VV)
 DECLARE_INSN(vasub_vx, MATCH_VASUB_VX, MASK_VASUB_VX)
 DECLARE_INSN(vasubu_vv, MATCH_VASUBU_VV, MASK_VASUBU_VV)
 DECLARE_INSN(vasubu_vx, MATCH_VASUBU_VX, MASK_VASUBU_VX)
+DECLARE_INSN(vbrev8_v, MATCH_VBREV8_V, MASK_VBREV8_V)
+DECLARE_INSN(vbrev_v, MATCH_VBREV_V, MASK_VBREV_V)
+DECLARE_INSN(vclmul_vv, MATCH_VCLMUL_VV, MASK_VCLMUL_VV)
+DECLARE_INSN(vclmul_vx, MATCH_VCLMUL_VX, MASK_VCLMUL_VX)
+DECLARE_INSN(vclmulh_vv, MATCH_VCLMULH_VV, MASK_VCLMULH_VV)
+DECLARE_INSN(vclmulh_vx, MATCH_VCLMULH_VX, MASK_VCLMULH_VX)
+DECLARE_INSN(vclz_v, MATCH_VCLZ_V, MASK_VCLZ_V)
 DECLARE_INSN(vcompress_vm, MATCH_VCOMPRESS_VM, MASK_VCOMPRESS_VM)
 DECLARE_INSN(vcpop_m, MATCH_VCPOP_M, MASK_VCPOP_M)
+DECLARE_INSN(vcpop_v, MATCH_VCPOP_V, MASK_VCPOP_V)
+DECLARE_INSN(vctz_v, MATCH_VCTZ_V, MASK_VCTZ_V)
 DECLARE_INSN(vdiv_vv, MATCH_VDIV_VV, MASK_VDIV_VV)
 DECLARE_INSN(vdiv_vx, MATCH_VDIV_VX, MASK_VDIV_VX)
 DECLARE_INSN(vdivu_vv, MATCH_VDIVU_VV, MASK_VDIVU_VV)
@@ -4418,6 +4531,8 @@ DECLARE_INSN(vfwsub_vf, MATCH_VFWSUB_VF, MASK_VFWSUB_VF)
 DECLARE_INSN(vfwsub_vv, MATCH_VFWSUB_VV, MASK_VFWSUB_VV)
 DECLARE_INSN(vfwsub_wf, MATCH_VFWSUB_WF, MASK_VFWSUB_WF)
 DECLARE_INSN(vfwsub_wv, MATCH_VFWSUB_WV, MASK_VFWSUB_WV)
+DECLARE_INSN(vghsh_vv, MATCH_VGHSH_VV, MASK_VGHSH_VV)
+DECLARE_INSN(vgmul_vv, MATCH_VGMUL_VV, MASK_VGMUL_VV)
 DECLARE_INSN(vid_v, MATCH_VID_V, MASK_VID_V)
 DECLARE_INSN(viota_m, MATCH_VIOTA_M, MASK_VIOTA_M)
 DECLARE_INSN(vl1re16_v, MATCH_VL1RE16_V, MASK_VL1RE16_V)
@@ -4591,10 +4706,16 @@ DECLARE_INSN(vrem_vv, MATCH_VREM_VV, MASK_VREM_VV)
 DECLARE_INSN(vrem_vx, MATCH_VREM_VX, MASK_VREM_VX)
 DECLARE_INSN(vremu_vv, MATCH_VREMU_VV, MASK_VREMU_VV)
 DECLARE_INSN(vremu_vx, MATCH_VREMU_VX, MASK_VREMU_VX)
+DECLARE_INSN(vrev8_v, MATCH_VREV8_V, MASK_VREV8_V)
 DECLARE_INSN(vrgather_vi, MATCH_VRGATHER_VI, MASK_VRGATHER_VI)
 DECLARE_INSN(vrgather_vv, MATCH_VRGATHER_VV, MASK_VRGATHER_VV)
 DECLARE_INSN(vrgather_vx, MATCH_VRGATHER_VX, MASK_VRGATHER_VX)
 DECLARE_INSN(vrgatherei16_vv, MATCH_VRGATHEREI16_VV, MASK_VRGATHEREI16_VV)
+DECLARE_INSN(vrol_vv, MATCH_VROL_VV, MASK_VROL_VV)
+DECLARE_INSN(vrol_vx, MATCH_VROL_VX, MASK_VROL_VX)
+DECLARE_INSN(vror_vi, MATCH_VROR_VI, MASK_VROR_VI)
+DECLARE_INSN(vror_vv, MATCH_VROR_VV, MASK_VROR_VV)
+DECLARE_INSN(vror_vx, MATCH_VROR_VX, MASK_VROR_VX)
 DECLARE_INSN(vrsub_vi, MATCH_VRSUB_VI, MASK_VRSUB_VI)
 DECLARE_INSN(vrsub_vx, MATCH_VRSUB_VX, MASK_VRSUB_VX)
 DECLARE_INSN(vs1r_v, MATCH_VS1R_V, MASK_VS1R_V)
@@ -4623,6 +4744,9 @@ DECLARE_INSN(vsetvli, MATCH_VSETVLI, MASK_VSETVLI)
 DECLARE_INSN(vsext_vf2, MATCH_VSEXT_VF2, MASK_VSEXT_VF2)
 DECLARE_INSN(vsext_vf4, MATCH_VSEXT_VF4, MASK_VSEXT_VF4)
 DECLARE_INSN(vsext_vf8, MATCH_VSEXT_VF8, MASK_VSEXT_VF8)
+DECLARE_INSN(vsha2ch_vv, MATCH_VSHA2CH_VV, MASK_VSHA2CH_VV)
+DECLARE_INSN(vsha2cl_vv, MATCH_VSHA2CL_VV, MASK_VSHA2CL_VV)
+DECLARE_INSN(vsha2ms_vv, MATCH_VSHA2MS_VV, MASK_VSHA2MS_VV)
 DECLARE_INSN(vslide1down_vx, MATCH_VSLIDE1DOWN_VX, MASK_VSLIDE1DOWN_VX)
 DECLARE_INSN(vslide1up_vx, MATCH_VSLIDE1UP_VX, MASK_VSLIDE1UP_VX)
 DECLARE_INSN(vslidedown_vi, MATCH_VSLIDEDOWN_VI, MASK_VSLIDEDOWN_VI)
@@ -4632,6 +4756,11 @@ DECLARE_INSN(vslideup_vx, MATCH_VSLIDEUP_VX, MASK_VSLIDEUP_VX)
 DECLARE_INSN(vsll_vi, MATCH_VSLL_VI, MASK_VSLL_VI)
 DECLARE_INSN(vsll_vv, MATCH_VSLL_VV, MASK_VSLL_VV)
 DECLARE_INSN(vsll_vx, MATCH_VSLL_VX, MASK_VSLL_VX)
+DECLARE_INSN(vsm3c_vi, MATCH_VSM3C_VI, MASK_VSM3C_VI)
+DECLARE_INSN(vsm3me_vv, MATCH_VSM3ME_VV, MASK_VSM3ME_VV)
+DECLARE_INSN(vsm4k_vi, MATCH_VSM4K_VI, MASK_VSM4K_VI)
+DECLARE_INSN(vsm4r_vs, MATCH_VSM4R_VS, MASK_VSM4R_VS)
+DECLARE_INSN(vsm4r_vv, MATCH_VSM4R_VV, MASK_VSM4R_VV)
 DECLARE_INSN(vsm_v, MATCH_VSM_V, MASK_VSM_V)
 DECLARE_INSN(vsmul_vv, MATCH_VSMUL_VV, MASK_VSMUL_VV)
 DECLARE_INSN(vsmul_vx, MATCH_VSMUL_VX, MASK_VSMUL_VX)
@@ -4700,6 +4829,9 @@ DECLARE_INSN(vwmulu_vv, MATCH_VWMULU_VV, MASK_VWMULU_VV)
 DECLARE_INSN(vwmulu_vx, MATCH_VWMULU_VX, MASK_VWMULU_VX)
 DECLARE_INSN(vwredsum_vs, MATCH_VWREDSUM_VS, MASK_VWREDSUM_VS)
 DECLARE_INSN(vwredsumu_vs, MATCH_VWREDSUMU_VS, MASK_VWREDSUMU_VS)
+DECLARE_INSN(vwsll_vi, MATCH_VWSLL_VI, MASK_VWSLL_VI)
+DECLARE_INSN(vwsll_vv, MATCH_VWSLL_VV, MASK_VWSLL_VV)
+DECLARE_INSN(vwsll_vx, MATCH_VWSLL_VX, MASK_VWSLL_VX)
 DECLARE_INSN(vwsub_vv, MATCH_VWSUB_VV, MASK_VWSUB_VV)
 DECLARE_INSN(vwsub_vx, MATCH_VWSUB_VX, MASK_VWSUB_VX)
 DECLARE_INSN(vwsub_wv, MATCH_VWSUB_WV, MASK_VWSUB_WV)

From f71bda9637366c2bb06612cf03ed126c628df678 Mon Sep 17 00:00:00 2001
From: Gianluca Guida <gianluca@rivosinc.com>
Date: Tue, 23 May 2023 13:47:07 +0100
Subject: [PATCH 073/110] Implement Zacas extension.

---
 disasm/disasm.cc       |  6 ++++++
 riscv/insns/amocas_d.h | 37 +++++++++++++++++++++++++++++++++++++
 riscv/insns/amocas_q.h | 34 ++++++++++++++++++++++++++++++++++
 riscv/insns/amocas_w.h |  2 ++
 riscv/isa_parser.cc    |  6 ++++++
 riscv/isa_parser.h     |  1 +
 riscv/mmu.h            | 11 +++++++++++
 riscv/processor.cc     |  5 +++++
 riscv/riscv.mk.in      |  6 ++++++
 9 files changed, 108 insertions(+)
 create mode 100644 riscv/insns/amocas_d.h
 create mode 100644 riscv/insns/amocas_q.h
 create mode 100644 riscv/insns/amocas_w.h

diff --git a/disasm/disasm.cc b/disasm/disasm.cc
index 8722cdb977..940fa66c16 100644
--- a/disasm/disasm.cc
+++ b/disasm/disasm.cc
@@ -815,6 +815,12 @@ void disassembler_t::add_instructions(const isa_parser_t* isa)
     DEFINE_XAMO(sc_d)
   }
 
+  if (isa->extension_enabled(EXT_ZACAS)) {
+    DEFINE_XAMO(amocas_w)
+    DEFINE_XAMO(amocas_d)
+    DEFINE_XAMO(amocas_q)
+  }
+
   add_insn(new disasm_insn_t("j", match_jal, mask_jal | mask_rd, {&jump_target}));
   add_insn(new disasm_insn_t("jal", match_jal | match_rd_ra, mask_jal | mask_rd, {&jump_target}));
   add_insn(new disasm_insn_t("jal", match_jal, mask_jal, {&xrd, &jump_target}));
diff --git a/riscv/insns/amocas_d.h b/riscv/insns/amocas_d.h
new file mode 100644
index 0000000000..e002e6ab75
--- /dev/null
+++ b/riscv/insns/amocas_d.h
@@ -0,0 +1,37 @@
+require_extension(EXT_ZACAS);
+
+if (xlen == 32) {
+  // RV32: the spec defines two 32-bit comparisons. Since we're
+  // loading 64 bits from memory we have to adjust for endianness.
+  uint64_t comp, swap, res;
+
+  require_align(insn.rd(), 2);
+  require_align(insn.rs2(), 2);
+  if (insn.rd() == 0) {
+    comp = 0;
+  } else if (MMU.is_target_big_endian()) {
+    comp = (uint32_t)READ_REG(insn.rd() + 1) | (RD << 32);
+  } else {
+    comp = (uint32_t)RD | (READ_REG(insn.rd() + 1) << 32);
+  }
+  if (insn.rs2() == 0) {
+    swap = 0;
+  } else if (MMU.is_target_big_endian()) {
+    swap = (uint32_t)READ_REG(insn.rs2() + 1) | (RS2 << 32);
+  } else {
+    swap = (uint32_t)RS2 | (READ_REG(insn.rs2() + 1) << 32);
+  }
+  res = MMU.amo_compare_and_swap<uint64_t>(RS1, comp, swap);
+  if (insn.rd() != 0) {
+    if (MMU.is_target_big_endian()) {
+      WRITE_REG(insn.rd() + 1, sext32((uint32_t)res));
+      WRITE_REG(insn.rd(), sext32(res >> 32));
+    } else {
+      WRITE_REG(insn.rd(), sext32((uint32_t)res));
+      WRITE_REG(insn.rd() + 1, sext32(res >> 32));
+    }
+  }
+ } else {
+  // RV64
+  WRITE_RD(MMU.amo_compare_and_swap<uint64_t>(RS1, RD, RS2));
+}
diff --git a/riscv/insns/amocas_q.h b/riscv/insns/amocas_q.h
new file mode 100644
index 0000000000..0b7593b3dc
--- /dev/null
+++ b/riscv/insns/amocas_q.h
@@ -0,0 +1,34 @@
+require_extension(EXT_ZACAS);
+require_rv64;
+require_align(insn.rd(), 2);
+require_align(insn.rs2(), 2);
+
+// The spec defines two 64-bit comparisons. Since we're loading
+// 128 bits from memory we have to adjust for endianness.
+
+uint128_t comp, swap, res;
+
+if (insn.rd() == 0) {
+  comp = 0;
+} else if (MMU.is_target_big_endian()) {
+  comp = READ_REG(insn.rd() + 1) | ((uint128_t)RD << 64);
+} else  {
+  comp = RD | ((uint128_t)READ_REG(insn.rd() + 1) << 64);
+}
+if (insn.rs2() == 0) {
+  swap = 0;
+} else if (MMU.is_target_big_endian()) {
+  swap = READ_REG(insn.rs2() + 1) | ((uint128_t)RS2 << 64);
+} else {
+  swap = RS2 | ((uint128_t)READ_REG(insn.rs2() + 1) << 64);
+}
+res = MMU.amo_compare_and_swap<uint128_t>(RS1, comp, swap);
+if (insn.rd() != 0) {
+  if (MMU.is_target_big_endian()) {
+    WRITE_REG(insn.rd(), res >> 64);
+    WRITE_REG(insn.rd() + 1, res);
+  } else {
+    WRITE_REG(insn.rd(), res);
+    WRITE_REG(insn.rd() + 1, res >> 64);
+  }
+}
diff --git a/riscv/insns/amocas_w.h b/riscv/insns/amocas_w.h
new file mode 100644
index 0000000000..a78c21cb73
--- /dev/null
+++ b/riscv/insns/amocas_w.h
@@ -0,0 +1,2 @@
+require_extension(EXT_ZACAS);
+WRITE_RD(sext32(MMU.amo_compare_and_swap<uint32_t>(RS1, RD, RS2)));
diff --git a/riscv/isa_parser.cc b/riscv/isa_parser.cc
index 8bb8c495b3..1c4300c958 100644
--- a/riscv/isa_parser.cc
+++ b/riscv/isa_parser.cc
@@ -120,6 +120,8 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
       // HINTs encoded in base-ISA instructions are always present.
     } else if (ext_str == "zihintntl") {
       // HINTs encoded in base-ISA instructions are always present.
+    } else if (ext_str == "zacas") {
+      extension_table[EXT_ZACAS] = true;
     } else if (ext_str == "zmmul") {
       extension_table[EXT_ZMMUL] = true;
     } else if (ext_str == "zba") {
@@ -301,6 +303,10 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
     bad_isa_string(str, "'Zcf/Zcd/Zcb/Zcmp/Zcmt' extensions require 'Zca' extension");
   }
 
+  if (extension_table[EXT_ZACAS] && !extension_table['A']) {
+    bad_isa_string(str, "'Zacas' extension requires 'A' extension");
+  }
+
   std::string lowercase = strtolower(priv);
   bool user = false, supervisor = false;
 
diff --git a/riscv/isa_parser.h b/riscv/isa_parser.h
index 4e6856195c..3cbee7dea0 100644
--- a/riscv/isa_parser.h
+++ b/riscv/isa_parser.h
@@ -61,6 +61,7 @@ typedef enum {
   EXT_ZVFBFMIN,
   EXT_ZVFBFWMA,
   EXT_SSTC,
+  EXT_ZACAS,
   EXT_INTERNAL_ZFH_MOVE,
   NUM_ISA_EXTENSIONS
 } isa_extension_t;
diff --git a/riscv/mmu.h b/riscv/mmu.h
index efc6e9de14..46c54ce88a 100644
--- a/riscv/mmu.h
+++ b/riscv/mmu.h
@@ -187,6 +187,17 @@ class mmu_t
     })
   }
 
+  template<typename T>
+  T amo_compare_and_swap(reg_t addr, T comp, T swap) {
+    convert_load_traps_to_store_traps({
+      store_slow_path(addr, sizeof(T), nullptr, {false, false, false}, false, true);
+      auto lhs = load<T>(addr);
+      if (lhs == comp)
+        store<T>(addr, swap);
+      return lhs;
+    })
+  }
+
   void store_float128(reg_t addr, float128_t val)
   {
     if (unlikely(addr & (sizeof(float128_t)-1)) && !is_misaligned_enabled()) {
diff --git a/riscv/processor.cc b/riscv/processor.cc
index a75b0ff6f1..1d5675a51a 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -47,6 +47,11 @@ processor_t::processor_t(const isa_parser_t *isa, const cfg_t *cfg,
     fprintf(stderr, "V extension is not supported on platforms without __int128 type\n");
     abort();
   }
+
+  if (isa->extension_enabled(EXT_ZACAS) && isa->get_max_xlen() == 64) {
+    fprintf(stderr, "Zacas extension is not supported on 64-bit platforms without __int128 type\n");
+    abort();
+  }
 #endif
 
   parse_varch_string(cfg->varch());
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index db63290205..6472982ed5 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1335,6 +1335,11 @@ riscv_insn_ext_bf16 = \
 	$(riscv_insn_ext_zvfbfmin) \
 	$(riscv_insn_ext_zvfbfwma) \
 
+riscv_insn_ext_zacas = \
+	amocas_w \
+	amocas_d \
+	$(if $(HAVE_INT128),amocas_q)
+
 riscv_insn_list = \
 	$(riscv_insn_ext_a) \
 	$(riscv_insn_ext_c) \
@@ -1360,6 +1365,7 @@ riscv_insn_list = \
 	$(riscv_insn_ext_cmo) \
 	$(riscv_insn_ext_zicond) \
 	$(riscv_insn_ext_bf16) \
+	$(riscv_insn_ext_zacas) \
 
 riscv_gen_srcs = $(addsuffix .cc,$(riscv_insn_list))
 

From d6d919ee5b061add58b159b6551235241744c91f Mon Sep 17 00:00:00 2001
From: Philipp Tomsich <philipp.tomsich@vrull.eu>
Date: Sun, 18 Jun 2023 22:50:46 +0200
Subject: [PATCH 074/110] Add Zicond to disassembler

---
 disasm/disasm.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/disasm/disasm.cc b/disasm/disasm.cc
index 8722cdb977..783479981f 100644
--- a/disasm/disasm.cc
+++ b/disasm/disasm.cc
@@ -2154,6 +2154,11 @@ void disassembler_t::add_instructions(const isa_parser_t* isa)
     DISASM_INSN("cbo.zero", cbo_zero, 0, {&base_only_address});
   }
 
+  if (isa->extension_enabled(EXT_ZICOND)) {
+    DEFINE_RTYPE(czero_eqz);
+    DEFINE_RTYPE(czero_nez);
+  }
+
   if (isa->extension_enabled(EXT_ZKND) ||
       isa->extension_enabled(EXT_ZKNE)) {
     DISASM_INSN("aes64ks1i", aes64ks1i, 0, {&xrd, &xrs1, &rcon});

From 377fb0a11b8ccc28f7d1687523b7d79403e26453 Mon Sep 17 00:00:00 2001
From: Eric Gouriou <ego@rivosinc.com>
Date: Wed, 31 May 2023 13:57:31 -0700
Subject: [PATCH 075/110] List extensions alphabetically in riscv_insn_list

The previous order lacks any obvious logic. Alphabetical order,
while making it difficult to create interesting groupings,
makes it easy to find which extensions are compiled in.

Signed-off-by: Eric Gouriou <ego@rivosinc.com>
---
 riscv/riscv.mk.in | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index 6472982ed5..3b493a06c8 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1341,31 +1341,31 @@ riscv_insn_ext_zacas = \
 	$(if $(HAVE_INT128),amocas_q)
 
 riscv_insn_list = \
+	$(if $(HAVE_INT128),$(riscv_insn_ext_v),) \
 	$(riscv_insn_ext_a) \
+	$(riscv_insn_ext_b) \
+	$(riscv_insn_ext_bf16) \
 	$(riscv_insn_ext_c) \
-	$(riscv_insn_ext_i) \
-	$(riscv_insn_ext_m) \
-	$(riscv_insn_ext_f) \
-	$(riscv_insn_ext_f_zfa) \
+	$(riscv_insn_ext_cmo) \
 	$(riscv_insn_ext_d) \
 	$(riscv_insn_ext_d_zfa) \
-	$(riscv_insn_ext_zfh) \
-	$(riscv_insn_ext_zfh_zfa) \
+	$(riscv_insn_ext_f) \
+	$(riscv_insn_ext_f_zfa) \
+	$(riscv_insn_ext_h) \
+	$(riscv_insn_ext_i) \
+	$(riscv_insn_ext_k) \
+	$(riscv_insn_ext_m) \
+	$(riscv_insn_ext_p) \
 	$(riscv_insn_ext_q) \
 	$(riscv_insn_ext_q_zfa) \
-	$(riscv_insn_ext_b) \
-	$(riscv_insn_ext_k) \
-	$(if $(HAVE_INT128),$(riscv_insn_ext_v),) \
+	$(riscv_insn_ext_zacas) \
 	$(riscv_insn_ext_zce) \
-	$(riscv_insn_ext_h) \
-	$(riscv_insn_ext_p) \
+	$(riscv_insn_ext_zfh) \
+	$(riscv_insn_ext_zfh_zfa) \
+	$(riscv_insn_ext_zicond) \
 	$(riscv_insn_priv) \
-	$(riscv_insn_svinval) \
 	$(riscv_insn_smrnmi) \
-	$(riscv_insn_ext_cmo) \
-	$(riscv_insn_ext_zicond) \
-	$(riscv_insn_ext_bf16) \
-	$(riscv_insn_ext_zacas) \
+	$(riscv_insn_svinval) \
 
 riscv_gen_srcs = $(addsuffix .cc,$(riscv_insn_list))
 

From 1e5a71f99b3432ba9fb543995a466c2d96e96cec Mon Sep 17 00:00:00 2001
From: Eric Gouriou <ego@rivosinc.com>
Date: Thu, 1 Jun 2023 18:04:31 -0700
Subject: [PATCH 076/110] Zvk: extensions parsing

Zvk is the short name for the Vector Cryptography Instruction
Set Extension Specification being defined at
<https://github.com/riscv/riscv-crypto/tree/master/doc/vector>.

This commit adds support for parsing/enabling the Zvk extensions
(Zvbb, Zvbc, Zvkg, Zvkned, Zvknha, Zvknhb, Zvksed, Zvksh, Zvkt)
and the combo extensions (Zvkn, Zvknc, Zvkng, Zvks, Zvksc, Zvksg).

This is an early commit in a series implementing Zvk. No instructions
are actually defined here, only infrastructure that will
support the coming extensions.

The encodings for Zvk instructions have some conflicts with Zpn
encodings. This commit marks those Zpn instructions as overlapping,
and adds checks to error out if conflicting extensions are enabled.

Signed-off-by: Eric Gouriou <ego@rivosinc.com>
---
 riscv/isa_parser.cc  | 57 +++++++++++++++++++++++++++++++++++++++++++-
 riscv/isa_parser.h   | 16 +++++++++++++
 riscv/overlap_list.h |  9 +++++++
 3 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/riscv/isa_parser.cc b/riscv/isa_parser.cc
index 1c4300c958..6fb29aeb5a 100644
--- a/riscv/isa_parser.cc
+++ b/riscv/isa_parser.cc
@@ -236,10 +236,55 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
       extension_table[EXT_ZICOND] = true;
     } else if (ext_str == "zihpm") {
       extension_table[EXT_ZIHPM] = true;
+    } else if (ext_str == "zvbb") {
+      extension_table[EXT_ZVBB] = true;
+    } else if (ext_str == "zvbc") {
+      extension_table[EXT_ZVBC] = true;
     } else if (ext_str == "zvfbfmin") {
       extension_table[EXT_ZVFBFMIN] = true;
     } else if (ext_str == "zvfbfwma") {
       extension_table[EXT_ZVFBFWMA] = true;
+    } else if (ext_str == "zvkg") {
+      extension_table[EXT_ZVKG] = true;
+    } else if (ext_str == "zvkn") {
+      extension_table[EXT_ZVBB] = true;
+      extension_table[EXT_ZVKNED] = true;
+      extension_table[EXT_ZVKNHB] = true;
+    } else if (ext_str == "zvknc") {
+      extension_table[EXT_ZVBB] = true;
+      extension_table[EXT_ZVBC] = true;
+      extension_table[EXT_ZVKNED] = true;
+      extension_table[EXT_ZVKNHB] = true;
+    } else if (ext_str == "zvkng") {
+      extension_table[EXT_ZVBB] = true;
+      extension_table[EXT_ZVKG] = true;
+      extension_table[EXT_ZVKNED] = true;
+      extension_table[EXT_ZVKNHB] = true;
+    } else if (ext_str == "zvkned") {
+      extension_table[EXT_ZVKNED] = true;
+    } else if (ext_str == "zvknha") {
+      extension_table[EXT_ZVKNHA] = true;
+    } else if (ext_str == "zvknhb") {
+      extension_table[EXT_ZVKNHB] = true;
+    } else if (ext_str == "zvks") {
+      extension_table[EXT_ZVBB] = true;
+      extension_table[EXT_ZVKSED] = true;
+      extension_table[EXT_ZVKSH] = true;
+    } else if (ext_str == "zvksc") {
+      extension_table[EXT_ZVBB] = true;
+      extension_table[EXT_ZVBC] = true;
+      extension_table[EXT_ZVKSED] = true;
+      extension_table[EXT_ZVKSH] = true;
+    } else if (ext_str == "zvksg") {
+      extension_table[EXT_ZVBB] = true;
+      extension_table[EXT_ZVKG] = true;
+      extension_table[EXT_ZVKSED] = true;
+      extension_table[EXT_ZVKSH] = true;
+    } else if (ext_str == "zvksed") {
+      extension_table[EXT_ZVKSED] = true;
+    } else if (ext_str == "zvksh") {
+      extension_table[EXT_ZVKSH] = true;
+    } else if (ext_str == "zvkt") {
     } else if (ext_str == "sstc") {
         extension_table[EXT_SSTC] = true;
     } else if (ext_str[0] == 'x') {
@@ -295,7 +340,7 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
   }
 
   if ((extension_table[EXT_ZCMP] || extension_table[EXT_ZCMT]) && extension_table[EXT_ZCD]) {
-    bad_isa_string(str, "Zcmp' and 'Zcmt' exensions are incompatible with 'Zcd' extension");
+    bad_isa_string(str, "Zcmp' and 'Zcmt' extensions are incompatible with 'Zcd' extension");
   }
 
   if ((extension_table[EXT_ZCF] || extension_table[EXT_ZCD] || extension_table[EXT_ZCB] ||
@@ -307,6 +352,16 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
     bad_isa_string(str, "'Zacas' extension requires 'A' extension");
   }
 
+  // Zpn conflicts with Zvknha/Zvknhb in both rv32 and rv64
+  if (extension_table[EXT_ZPN] && (extension_table[EXT_ZVKNHA] || extension_table[EXT_ZVKNHB])) {
+    bad_isa_string(str, "'Zvknha' and 'Zvknhb' extensions are incompatible with 'Zpn' extension");
+  }
+  // In rv64 only, Zpn (rv64_zpn) conflicts with Zvkg/Zvkned/Zvksh
+  if (max_xlen == 64 && extension_table[EXT_ZPN] &&
+      (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNED] || extension_table[EXT_ZVKSH])) {
+    bad_isa_string(str, "'Zvkg', 'Zvkned', and 'Zvksh' extensions are incompatible with 'Zpn' extension in rv64");
+  }
+
   std::string lowercase = strtolower(priv);
   bool user = false, supervisor = false;
 
diff --git a/riscv/isa_parser.h b/riscv/isa_parser.h
index 3cbee7dea0..5b04347520 100644
--- a/riscv/isa_parser.h
+++ b/riscv/isa_parser.h
@@ -58,8 +58,24 @@ typedef enum {
   EXT_ZICNTR,
   EXT_ZICOND,
   EXT_ZIHPM,
+  EXT_ZVBB,
+  EXT_ZVBC,
   EXT_ZVFBFMIN,
   EXT_ZVFBFWMA,
+  EXT_ZVKG,
+  EXT_ZVKNED,
+  EXT_ZVKNHA,
+  EXT_ZVKNHB,
+  EXT_ZVKSED,
+  EXT_ZVKSH,
+  EXT_XZBP,
+  EXT_XZBS,
+  EXT_XZBE,
+  EXT_XZBF,
+  EXT_XZBC,
+  EXT_XZBM,
+  EXT_XZBR,
+  EXT_XZBT,
   EXT_SSTC,
   EXT_ZACAS,
   EXT_INTERNAL_ZFH_MOVE,
diff --git a/riscv/overlap_list.h b/riscv/overlap_list.h
index a30c770e60..2214be4a58 100644
--- a/riscv/overlap_list.h
+++ b/riscv/overlap_list.h
@@ -12,3 +12,12 @@ DECLARE_OVERLAP_INSN(c_fsd, EXT_ZCD)
 DECLARE_OVERLAP_INSN(c_ebreak, EXT_ZCA)
 DECLARE_OVERLAP_INSN(c_jalr, EXT_ZCA)
 DECLARE_OVERLAP_INSN(c_jr, EXT_ZCA)
+DECLARE_OVERLAP_INSN(vaesdf_vv, EXT_ZVKNED)
+DECLARE_OVERLAP_INSN(vghsh_vv, EXT_ZVKG)
+DECLARE_OVERLAP_INSN(vsha2ms_vv, EXT_ZVKNHA)
+DECLARE_OVERLAP_INSN(vsha2ms_vv, EXT_ZVKNHB)
+DECLARE_OVERLAP_INSN(vsm3me_vv, EXT_ZVKSH)
+DECLARE_OVERLAP_INSN(rstsa16, EXT_ZPN)
+DECLARE_OVERLAP_INSN(rstsa32, EXT_ZPN)
+DECLARE_OVERLAP_INSN(srli32_u, EXT_ZPN)
+DECLARE_OVERLAP_INSN(umax32, EXT_ZPN)

From d5c0339484323b5a9498576d70ec90eab2e13438 Mon Sep 17 00:00:00 2001
From: Eric Gouriou <ego@rivosinc.com>
Date: Sun, 18 Jun 2023 17:10:53 -0700
Subject: [PATCH 077/110] Zvk: Infrastructure for Zvk extensions, element group
 handling

Introduce types and macros useful across multiple Zvk sub-extensions,
including Zvbb and Zvbc. Those will be used by upcoming
per-sub-extension commits.

In particular we introduce "Element Group" types and loop macros handling
those element groups. The concept of element group is described in
<https://github.com/riscv/riscv-crypto/blob/master/doc/vector/riscv-crypto-vector-element-groups.adoc>.

Note that the element group access method is not implemented
for WORDS_BIGENDIAN setup. As such, isa_parser.cc is modified to emit
an error when WORDS_BIGENDIAN is defined and extensions using element
groups are enabled.

Signed-off-by: Eric Gouriou <ego@rivosinc.com>
---
 riscv/arith.h          |   21 +
 riscv/isa_parser.cc    |   10 +-
 riscv/v_ext_macros.h   |   22 +
 riscv/vector_unit.cc   |   55 +++
 riscv/vector_unit.h    |   19 +-
 riscv/zvk_ext_macros.h | 1023 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 1148 insertions(+), 2 deletions(-)
 create mode 100644 riscv/zvk_ext_macros.h

diff --git a/riscv/arith.h b/riscv/arith.h
index 3b807e9698..20b15047f7 100644
--- a/riscv/arith.h
+++ b/riscv/arith.h
@@ -7,6 +7,7 @@
 #include <cstdint>
 #include <climits>
 #include <cstddef>
+#include <type_traits>
 
 inline uint64_t mulhu(uint64_t a, uint64_t b)
 {
@@ -221,4 +222,24 @@ static inline uint64_t xperm(uint64_t rs1, uint64_t rs2, size_t sz_log2, size_t
   return r;
 }
 
+// Rotates right an unsigned integer by the given number of bits.
+template <typename T>
+static inline T rotate_right(T x, std::size_t shiftamt) {
+  static_assert(std::is_unsigned<T>::value);
+  static constexpr T mask = (8 * sizeof(T)) - 1;
+  const std::size_t rshift = shiftamt & mask;
+  const std::size_t lshift = (-rshift) & mask;
+  return (x << lshift) | (x >> rshift);
+}
+
+// Rotates left an unsigned integer by the given number of bits.
+template <typename T>
+static inline T rotate_left(T x, std::size_t shiftamt) {
+  static_assert(std::is_unsigned<T>::value);
+  static constexpr T mask = (8 * sizeof(T)) - 1;
+  const std::size_t lshift = shiftamt & mask;
+  const std::size_t rshift = (-lshift) & mask;
+  return (x << lshift) | (x >> rshift);
+}
+
 #endif
diff --git a/riscv/isa_parser.cc b/riscv/isa_parser.cc
index 6fb29aeb5a..59472a43f0 100644
--- a/riscv/isa_parser.cc
+++ b/riscv/isa_parser.cc
@@ -361,7 +361,15 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
       (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNED] || extension_table[EXT_ZVKSH])) {
     bad_isa_string(str, "'Zvkg', 'Zvkned', and 'Zvksh' extensions are incompatible with 'Zpn' extension in rv64");
   }
-
+#ifdef WORDS_BIGENDIAN
+  // Access to the vector registers as element groups is unimplemented on big-endian setups.
+  if (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNED] || extension_table[EXT_ZVKNHA] ||
+      extension_table[EXT_ZVKNHB] || extension_table[EXT_ZVKSED] || extension_table[EXT_ZVKSH]) {
+      bad_isa_string(str,
+		     "'Zvkg', 'Zvkned', 'Zvknha', 'Zvknhb', 'Zvksed', and 'Zvksh' "
+		     "extensions are incompatible with WORDS_BIGENDIAN setups.");
+  }
+#endif
   std::string lowercase = strtolower(priv);
   bool user = false, supervisor = false;
 
diff --git a/riscv/v_ext_macros.h b/riscv/v_ext_macros.h
index 41256c7a59..908ff16c28 100644
--- a/riscv/v_ext_macros.h
+++ b/riscv/v_ext_macros.h
@@ -325,6 +325,10 @@ static inline bool is_overlapped_widen(const int astart, int asize,
   type_usew_t<x>::type vs1 = P.VU.elt<type_usew_t<x>::type>(rs1_num, i); \
   type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
 
+#define V_U_PARAMS(x) \
+  type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
+  type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+
 #define VX_U_PARAMS(x) \
   type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
   type_usew_t<x>::type rs1 = (type_usew_t<x>::type)RS1; \
@@ -693,6 +697,24 @@ static inline bool is_overlapped_widen(const int astart, int asize,
   } \
   VI_LOOP_END 
 
+#define VI_V_ULOOP(BODY) \
+  VI_CHECK_SSS(false) \
+  VI_LOOP_BASE \
+  if (sew == e8) { \
+    V_U_PARAMS(e8); \
+    BODY; \
+  } else if (sew == e16) { \
+    V_U_PARAMS(e16); \
+    BODY; \
+  } else if (sew == e32) { \
+    V_U_PARAMS(e32); \
+    BODY; \
+  } else if (sew == e64) { \
+    V_U_PARAMS(e64); \
+    BODY; \
+  } \
+  VI_LOOP_END
+
 #define VI_VX_ULOOP(BODY) \
   VI_CHECK_SSS(false) \
   VI_LOOP_BASE \
diff --git a/riscv/vector_unit.cc b/riscv/vector_unit.cc
index 9128df63ee..08adc6166d 100644
--- a/riscv/vector_unit.cc
+++ b/riscv/vector_unit.cc
@@ -86,6 +86,56 @@ template<class T> T& vectorUnit_t::elt(reg_t vReg, reg_t n, bool UNUSED is_write
   return regStart[n];
 }
 
+// The logic differences between 'elt()' and 'elt_group()' come from
+// the fact that, while 'elt()' requires that the element is fully
+// contained in a single vector register, the element group may span
+// multiple registers in a single register group (LMUL>1).
+//
+// Notes:
+// - We do NOT check that a single element - i.e., the T in the element
+//   group type std::array<T, N> - fits within a single register, or that
+//   T is smaller or equal to VSEW. Implementations of the instructions
+//   sometimes use a different T than what the specification suggests.
+//   Instruction implementations should 'require()' what the specification
+//   dictates.
+// - We do NOT check that 'vReg' is a valid register group, or that
+//   'n+1' element groups fit in the register group 'vReg'. It is
+//   the responsibility of the caller to validate those preconditions.
+template<typename EG> EG&
+vectorUnit_t::elt_group(reg_t vReg, reg_t n, bool UNUSED is_write) {
+#ifdef WORDS_BIGENDIAN
+  fputs("vectorUnit_t::elt_group is not compatible with WORDS_BIGENDIAN setup.\n",
+          stderr);
+  abort();
+#endif
+  using T = typename EG::value_type;
+  constexpr std::size_t N = std::tuple_size<EG>::value;
+  assert(N > 0);
+
+  assert(vsew != 0);
+  constexpr reg_t elt_group_size = N * sizeof(T);
+  const reg_t reg_group_size = (VLEN >> 3) * vflmul;
+  assert(((n + 1) * elt_group_size) <= reg_group_size);
+
+  const reg_t start_byte = n * elt_group_size;
+  const reg_t bytes_per_reg = VLEN >> 3;
+
+  // Inclusive first/last register indices.
+  const reg_t reg_first = vReg + start_byte / bytes_per_reg;
+  const reg_t reg_last = vReg + (start_byte + elt_group_size - 1) / bytes_per_reg;
+
+  // Element groups per register groups
+  for (reg_t vidx = reg_first; vidx <= reg_last; ++vidx) {
+      reg_referenced[vidx] = 1;
+
+      if (unlikely(p->get_log_commits_enabled() && is_write)) {
+          p->get_state()->log_reg_write[(vidx << 4) | 2] = {0, 0};
+      }
+  }
+
+  return *(EG*)((char*)reg_file + vReg * (VLEN >> 3) + start_byte);
+}
+
 template signed char& vectorUnit_t::elt<signed char>(reg_t, reg_t, bool);
 template short& vectorUnit_t::elt<short>(reg_t, reg_t, bool);
 template int& vectorUnit_t::elt<int>(reg_t, reg_t, bool);
@@ -98,3 +148,8 @@ template uint64_t& vectorUnit_t::elt<uint64_t>(reg_t, reg_t, bool);
 template float16_t& vectorUnit_t::elt<float16_t>(reg_t, reg_t, bool);
 template float32_t& vectorUnit_t::elt<float32_t>(reg_t, reg_t, bool);
 template float64_t& vectorUnit_t::elt<float64_t>(reg_t, reg_t, bool);
+
+template EGU32x4_t& vectorUnit_t::elt_group<EGU32x4_t>(reg_t, reg_t, bool);
+template EGU32x8_t& vectorUnit_t::elt_group<EGU32x8_t>(reg_t, reg_t, bool);
+template EGU64x4_t& vectorUnit_t::elt_group<EGU64x4_t>(reg_t, reg_t, bool);
+template EGU8x16_t& vectorUnit_t::elt_group<EGU8x16_t>(reg_t, reg_t, bool);
diff --git a/riscv/vector_unit.h b/riscv/vector_unit.h
index b9f706c53a..a057c62fbe 100644
--- a/riscv/vector_unit.h
+++ b/riscv/vector_unit.h
@@ -2,6 +2,9 @@
 #ifndef _RISCV_VECTOR_UNIT_H
 #define _RISCV_VECTOR_UNIT_H
 
+#include <array>
+#include <cstdint>
+
 #include "decode.h"
 #include "csrs.h"
 
@@ -69,6 +72,17 @@ struct type_sew_t<64>
   using type=int64_t;
 };
 
+// Element group of four 32-bit elements (128 bits total).
+using EGU32x4_t = std::array<uint32_t, 4>;
+
+// Element group of eight 32-bit elements (256 bits total).
+using EGU32x8_t = std::array<uint32_t, 8>;
+
+// Element group of four 64-bit elements (256 bits total).
+using EGU64x4_t = std::array<uint64_t, 4>;
+
+// Element group of sixteen 8-bit elements (128 bits total).
+using EGU8x16_t = std::array<uint8_t, 16>;
 
 class vectorUnit_t
 {
@@ -88,8 +102,11 @@ class vectorUnit_t
   bool vill;
   bool vstart_alu;
 
-  // vector element for varies SEW
+  // vector element for various SEW
   template<class T> T& elt(reg_t vReg, reg_t n, bool is_write = false);
+  // vector element group access, where EG is a std::array<T, N>.
+  template<typename EG> EG&
+  elt_group(reg_t vReg, reg_t n, bool is_write = false);
 
 public:
 
diff --git a/riscv/zvk_ext_macros.h b/riscv/zvk_ext_macros.h
new file mode 100644
index 0000000000..7efbac806f
--- /dev/null
+++ b/riscv/zvk_ext_macros.h
@@ -0,0 +1,1023 @@
+// Helper macros to help implement instructions defined as part of
+// the RISC-V Zvk extension (vector cryptography).
+
+// Note that a good deal of code here would be cleaner/simpler
+// if exposed as C++ functions (including templated ones), however
+// this is not possible in the contexts where those headers are
+// included.
+
+#ifndef RISCV_ZVK_EXT_MACROS_H_
+#define RISCV_ZVK_EXT_MACROS_H_
+
+//
+// Predicate Macros
+//
+
+// Ensures that the ZVBB extension (vector crypto bitmanip) is present,
+// and the vector unit is enabled and in a valid state.
+#define require_zvbb \
+  do { \
+    require_vector(true); \
+    require_extension(EXT_ZVBB); \
+  } while (0)
+
+// Ensures that the ZVBC extension (vector carryless multiplication)
+// is present, and the vector unit is enabled and in a valid state.
+#define require_zvbc \
+  do { \
+    require_vector(true); \
+    require_extension(EXT_ZVBC); \
+  } while (0)
+
+// Ensures that the ZVKG extension (vector Galois Field Multiplication)
+// is present, and the vector unit is enabled and in a valid state.
+#define require_zvkg \
+  do { \
+    require_vector(true); \
+    require_extension(EXT_ZVKG); \
+  } while (0)
+
+// Ensures that a ZVK extension supporting SHA-256 is present.
+// For SHA-256, this support is present in either Zvknha or Zvknhb.
+// Also ensures that the vector unit is enabled and in a valid state.
+#define require_zvknh_256 \
+  do { \
+    require_vector(true); \
+    require_either_extension(EXT_ZVKNHA, EXT_ZVKNHB); \
+  } while (0)
+
+// Ensures that the ZVKNED extension (vector AES single round) is present,
+// and the vector unit is enabled and in a valid state.
+#define require_zvkned \
+  do { \
+    require_vector(true); \
+    require_extension(EXT_ZVKNED); \
+  } while (0)
+
+// Ensures that a ZVK extension supporting SHA-512 is present.
+// For SHA-512, this support is only present in Zvknhb.
+// Also ensures that the vector unit is enabled and in a valid state.
+#define require_zvknh_512 \
+  do { \
+    require_vector(true); \
+    require_extension(EXT_ZVKNHB); \
+  } while (0)
+
+// Ensures that the ZVKSED extension (vector SM4 block cipher)
+// is present, and the vector unit is enabled and in a valid state.
+#define require_zvksed \
+  do { \
+    require_vector(true); \
+    require_extension(EXT_ZVKSED); \
+  } while (0)
+
+// Ensures that the ZVKSH extension (vector SM3 hash) is present,
+// and the vector unit is enabled and in a valid state.
+#define require_zvksh \
+  do { \
+    require_vector(true); \
+    require_extension(EXT_ZVKSH); \
+  } while (0)
+
+// Ensures that the vector instruction is not using a mask.
+#define require_no_vmask  require(insn.v_vm() == 1)
+
+// Ensures that an element group can fit in a register group. That is,
+//    EGW <= (LMUL * VLEN)
+#define require_egw_fits(EGW)  require((EGW) <= (P.VU.VLEN * P.VU.vflmul))
+
+// Checks that the vector unit state (vstart and vl) can be interpreted
+// as element groups with EEW=32, EGS=4 (four 32-bits elements per group),
+// for an effective element group width of EGW=128 bits.
+//
+// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart'
+// are interpreted as a number of EEW-wide elements. They must both
+// be multiples of EGS (potentially 0).
+#define require_element_groups_32x4 \
+  do { \
+    /* 'vstart' must be a multiple of EGS */ \
+    const reg_t vstart = P.VU.vstart->read(); \
+    require(vstart % 4 == 0); \
+    /* 'vl' must be a multiple of EGS */ \
+    const reg_t vl = P.VU.vl->read(); \
+    require(vl % 4 == 0); \
+  } while (0)
+
+// Checks that the vector unit state (vstart and vl) can be interpreted
+// as element groups with EEW=32, EGS=8 (eight 32-bits elements per group),
+// for an effective element group width of EGW=256 bits.
+//
+// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart'
+// are interpreted as a number of EEW-wide elements. They must both
+// be multiples of EGS (potentially 0).
+#define require_element_groups_32x8 \
+  do { \
+    /* 'vstart' must be a multiple of EGS */ \
+    const reg_t vstart = P.VU.vstart->read(); \
+    require(vstart % 8 == 0); \
+    /* 'vl' must be a multiple of EGS */ \
+    const reg_t vl = P.VU.vl->read(); \
+    require(vl % 8 == 0); \
+  } while (0)
+
+// Checks that the vector unit state (vstart and vl) can be interpreted
+// as element groups with EEW=64, EGS=4 (four 64-bits elements per group),
+// for an effective element group width of EGW=256 bits.
+//
+// Per the vector crypto specification, SEW is ignored. 'vl' and 'vstart'
+// are interpreted as a number of EEW-wide elements. They must both
+// be multiples of EGS (potentially 0).
+#define require_element_groups_64x4 \
+  do { \
+    /* 'vstart' must be a multiple of EGS */ \
+    const reg_t vstart = P.VU.vstart->read(); \
+    require(vstart % 4 == 0); \
+    /* 'vl' must be a multiple of EGS */ \
+    const reg_t vl = P.VU.vl->read(); \
+    require(vl % 4 == 0); \
+  } while (0)
+
+//
+// Loop Parameters Macros
+//
+
+// Extracts 32b*4 element groups as EGU32x4_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// 'vs1' and 'vs2' (constant, by value).
+#define VV_VD_VS1_VS2_EGU32x4_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \
+  EGU32x4_t &vd = P.VU.elt_group<EGU32x4_t>((VD_NUM), (EG_IDX), true); \
+  const EGU32x4_t vs1 = P.VU.elt_group<EGU32x4_t>((VS1_NUM), (EG_IDX)); \
+  const EGU32x4_t vs2 = P.VU.elt_group<EGU32x4_t>((VS2_NUM), (EG_IDX))
+
+// Extracts 32b*8 element groups as EGU32x8_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// 'vs1' and 'vs2' (constant, by value).
+#define VV_VD_VS1_VS2_EGU32x8_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \
+  EGU32x8_t &vd = P.VU.elt_group<EGU32x8_t>((VD_NUM), (EG_IDX), true); \
+  const EGU32x8_t vs1 = P.VU.elt_group<EGU32x8_t>((VS1_NUM), (EG_IDX)); \
+  const EGU32x8_t vs2 = P.VU.elt_group<EGU32x8_t>((VS2_NUM), (EG_IDX))
+
+// Extracts 32b*4 element groups as EGU32x4_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// and 'vs2' (constant, by value).
+#define VV_VD_VS2_EGU32x4_PARAMS(VD_NUM, VS2_NUM, EG_IDX) \
+  EGU32x4_t &vd = P.VU.elt_group<EGU32x4_t>((VD_NUM), (EG_IDX), true); \
+  const EGU32x4_t vs2 = P.VU.elt_group<EGU32x4_t>((VS2_NUM), (EG_IDX))
+
+// Extracts 32b*8 element groups as EGU32x8_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// and 'vs2' (constant, by value).
+#define VV_VD_VS2_EGU32x8_PARAMS(VD_NUM, VS2_NUM, EG_IDX) \
+  EGU32x8_t &vd = P.VU.elt_group<EGU32x8_t>((VD_NUM), (EG_IDX), true); \
+  const EGU32x8_t vs2 = P.VU.elt_group<EGU32x8_t>((VS2_NUM), (EG_IDX))
+
+// Extracts 64b*4 element groups as EGU64x4_t variables at the given
+// element group index, from register arguments 'vd' (by reference, mutable),
+// 'vs1' and 'vs2' (constant, by value).
+#define VV_VD_VS1_VS2_EGU64x4_PARAMS(VD_NUM, VS1_NUM, VS2_NUM, EG_IDX) \
+  EGU64x4_t &vd = P.VU.elt_group<EGU64x4_t>((VD_NUM), (EG_IDX), true); \
+  const EGU64x4_t vs1 = P.VU.elt_group<EGU64x4_t>((VS1_NUM), (EG_IDX)); \
+  const EGU64x4_t vs2 = P.VU.elt_group<EGU64x4_t>((VS2_NUM), (EG_IDX))
+
+// Extracts elements from the vector register groups 'vd', 'vs2', and 'vs1',
+// as part of a widening operation where 'vd' has EEW = 2 * SEW. Expects
+// 'rd_num', 'rs1_num', 'rs2_num' and 'i' at the expansion site. Defines
+//  - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+//  - 'vs2', unsigned, SEW width, by value, constant.
+//  - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+//    a widened copy of 'vs2'.
+//  - 'vs1', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VV_WIDENING_U_PARAMS(SEW) \
+  auto &vd_w = P.VU.elt<type_usew_t<2 * (SEW)>::type>(rd_num, i, true); \
+  const auto vs2 = P.VU.elt<type_usew_t<SEW>::type>(rs2_num, i); \
+  const type_usew_t<2 * (SEW)>::type vs2_w = vs2; \
+  const auto vs1 = P.VU.elt<type_usew_t<SEW>::type>(rs1_num, i);
+
+// Extracts elements from the vector register groups 'vd', 'vs2',
+// and the scalar register 'rs1', as part of a widening operation where
+// 'vd' has EEW = 2 * SEW. Expects 'rd_num', 'rs2_num', 'i' and 'RS1'
+// to be available at the expansion site. Defines
+//  - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+//  - 'vs2', unsigned, SEW width, by value, constant.
+//  - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+//    a widened copy of 'vs2'.
+//  - 'rs1', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VX_WIDENING_U_PARAMS(SEW) \
+  auto &vd_w = P.VU.elt<type_usew_t<2 * (SEW)>::type>(rd_num, i, true); \
+  const auto vs2 = P.VU.elt<type_usew_t<SEW>::type>(rs2_num, i); \
+  const type_usew_t<2 * (SEW)>::type vs2_w = vs2; \
+  const auto rs1 = (type_usew_t<SEW>::type)RS1;
+
+// Extracts elements from the vector register groups 'vd', 'vs2',
+// and the 5-bit immediate field 'zimm5', as part of a widening operation
+// where 'vd' has EEW = 2 * SEW. Expects 'rd_num', 'rs2_num', 'i' and
+// 'insn' to be available at the expansion site. Defines
+//  - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+//  - 'vs2', unsigned, SEW width, by value, constant.
+//  - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+//    a widened copy of 'vs2'.
+//  - 'zimm5', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VI_WIDENING_U_PARAMS(SEW) \
+  auto &vd_w = P.VU.elt<type_usew_t<2 * (SEW)>::type>(rd_num, i, true); \
+  const auto vs2 = P.VU.elt<type_usew_t<SEW>::type>(rs2_num, i); \
+  const type_usew_t<2 * (SEW)>::type vs2_w = vs2; \
+  const auto zimm5 = (type_usew_t<SEW>::type)insn.v_zimm5();
+
+//
+// Loop Macros
+//
+
+// NOTES:
+// - Each of the element-group loop macros DO contain an invocation
+//   of the corresponding 'require_element_groups_<bits>x<#elements>;',
+//   because the macro correctness requires proper VL/VSTART values.
+// - Each of the loop macros named "_NOVM_" DO contain an invocation
+//   of the 'require_no_vmask;' macro. Those macros (all of them
+//   at this time) do not support masking (i.e., no skipping
+//   of elements/element groups is performed).
+
+// Processes all 32b*4 element groups available in the vector register
+// operands vd, vs1, and vs2.  This interprets the vectors as containing
+// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while
+// *ignoring* the current SEW setting of the vector unit.
+//
+// IMPORTANT
+//  - This macro contains an invocation of 'require_element_groups_32x4;',
+//    since the "loop" macro correctness depends on invariants that
+//    are checked by the "require" macro.
+//  - This macro does not support masking, and contains an invocation
+//    of 'require_no_vmask;'.
+//  - While the name states "VD_VS1_VS2", many vector instructions
+//    are specified as "op vd, vs2, vs1". This macro does not imply
+//    a specific operand order and can be used with both "op vd, vs2, vs1"
+//    and "op vd, vs1, vs2" instructions.
+//
+// Invokes two statement blocks:
+//  - PRELUDE, invoked once, before any element group. It is executed even
+//    if the vector is empty. It is placed in a "do { } while (0);", hence
+//    any variable declared there is not visible outside.
+//  - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+//   'vd_num': register index of vd
+//   'vs1_num': register index of vs1
+//   'vs2_num': register index of vs2
+//   'vstart_eg': index of the first element group, *in EG units*
+//   'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+//   'idx_eg': index of the current element group.
+//   'vd': EGU32x4_t reference, mutable, content of the current
+//         element group in the 'vd' vector register / register group.
+//   'vs1': EGU32x4_t, content of the current element group
+//          in the 'vs1' vector register / register group.
+//   'vs2': EGU32x4_t, content of the current element group
+//          in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(PRELUDE, EG_BODY) \
+  do { \
+    require_element_groups_32x4; \
+    require_no_vmask; \
+    const reg_t vd_num = insn.rd(); \
+    const reg_t vs1_num = insn.rs1(); \
+    const reg_t vs2_num = insn.rs2(); \
+    const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+    const reg_t vl_eg = P.VU.vl->read() / 4; \
+    do { PRELUDE } while (0); \
+    for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+      VV_VD_VS1_VS2_EGU32x4_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \
+      EG_BODY \
+    } \
+    P.VU.vstart->write(0); \
+  } while (0)
+
+// Processes all 32b*8 element groups available in the vector register
+// operands vd, vs1, and vs2.  This interprets the vectors as containing
+// element groups of 8 uint32_t values (EGW=256, EEW=32, EGS=8), while
+// *ignoring* the current SEW setting of the vector unit.
+//
+// IMPORTANT
+//  - This macro contains an invocation of 'require_element_groups_32x8;',
+//    since the "loop" macro correctness depends on invariants that
+//    are checked by the "require" macro.
+//  - This macro does not support masking, and contains an invocation
+//    of 'require_no_vmask;'.
+//  - While the name states "VD_VS1_VS2", many vector instructions
+//    are specified as "op vd, vs2, vs1". This macro does not imply
+//    a specific operand order and can be used with both "op vd, vs2, vs1"
+//    and "op vd, vs1, vs2" instructions.
+//
+// Invokes two statement blocks:
+//  - PRELUDE, invoked once, before any element group. It is executed even
+//    if the vector is empty. It is placed in a "do { } while (0);", hence
+//    any variable declared there is not visible outside.
+//  - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+//   'vd_num': register index of vd
+//   'vs1_num': register index of vs1
+//   'vs2_num': register index of vs2
+//   'vstart_eg': index of the first element group, *in EG units*
+//   'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+//   'idx_eg': index of the current element group.
+//   'vd': EGU32x8_t reference, mutable, content of the current
+//         element group in the 'vd' vector register / register group.
+//   'vs1': EGU32x8_t, content of the current element group
+//          in the 'vs1' vector register / register group.
+//   'vs2': EGU32x8_t, content of the current element group
+//          in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS1_VS2_EGU32x8_NOVM_LOOP(PRELUDE, EG_BODY) \
+  do { \
+    require_element_groups_32x8; \
+    require_no_vmask; \
+    const reg_t vd_num = insn.rd(); \
+    const reg_t vs1_num = insn.rs1(); \
+    const reg_t vs2_num = insn.rs2(); \
+    const reg_t vstart_eg = P.VU.vstart->read() / 8; \
+    const reg_t vl_eg = P.VU.vl->read() / 8; \
+    do { PRELUDE } while (0); \
+    for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+      VV_VD_VS1_VS2_EGU32x8_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \
+      EG_BODY \
+    } \
+    P.VU.vstart->write(0); \
+  } while (0)
+
+// Processes all 32b*4 element groups available in the vector register
+// operands vd, vs1, and vs2.  This interprets the vectors as containing
+// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while
+// *ignoring* the current SEW setting of the vector unit.
+//
+// Compared to VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP:
+//  - this macro does NOT extract the element groups into EGU32x4_t
+//    variables. It is intended for uses where there is a more natural
+//    type to use (e.g., EGU8x16_t). The type should still be a 128 bits
+//    wide type if extracted via 'P.VU.elt_group<Type>(...)'.
+//  - this macro offers the additional PRELOOP code block argument,
+//    that is executed once if the loop is going to be entered.
+//    This is intended for use with "vector scalar" instructions where
+//    we extract the first element group from one of the operands and
+//    use it for all loop iterations.
+//
+// IMPORTANT
+//  - This macro contains an invocation of 'require_element_groups_32x4;',
+//    since the "loop" macro correctness depends on invariants that
+//    are checked by the "require" macro.
+//  - This macro does not support masking, and contains an invocation
+//    of 'require_no_vmask;'.
+//  - While the name states "VD_VS1_VS2", many vector instructions
+//    are specified as "op vd, vs2, vs1". This macro does not imply
+//    a specific operand order and can be used with both "op vd, vs2, vs1"
+//    and "op vd, vs1, vs2" instructions.
+//
+// Invokes three statement blocks:
+//  - PRELUDE, invoked once, before any element group. It is executed even
+//    if the vector is empty. It is placed in a "do { } while (0);", hence
+//    any variable declared there is not visible outside.
+//  - PRELOOP, invoked once IF there is at least one element group to process.
+//    It is NOT placed in its own scope, variables declared in PRELOOP are
+//    visible when EG_BODY executes.
+//    Pass {} when there is no need for such a pre-loop block.
+//  - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in all statement blocks:
+//   'vd_num': register index of vd
+//   'vs1_num': register index of vs1
+//   'vs2_num': register index of vs2
+//   'vstart_eg': index of the first element group, *in EG units*
+//   'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+//   'idx_eg': index of the current element group.
+//
+#define VI_ZVK_VD_VS1_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(PRELUDE, \
+                                                               PRELOOP, \
+                                                               EG_BODY) \
+  do { \
+    require_element_groups_32x4; \
+    require_no_vmask; \
+    const reg_t vd_num = insn.rd(); \
+    const reg_t vs1_num = insn.rs1(); \
+    const reg_t vs2_num = insn.rs2(); \
+    const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+    const reg_t vl_eg = P.VU.vl->read() / 4; \
+    do { PRELUDE } while (0); \
+    if (vstart_eg < vl_eg) { \
+      PRELOOP \
+      for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+        EG_BODY \
+      } \
+    } \
+    P.VU.vstart->write(0); \
+  } while (0)
+
+// Processes all 32b*4 element groups available in the vector register
+// operands vd and vs2.  This interprets the vectors as containing
+// element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4), while
+// *ignoring* the current SEW setting of the vector unit.
+//
+// Compared to VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP:
+//  - this macro is meant to be used for "op vd, vs2" instructions,
+//    whether vd is output only, or input and output.
+//  - this macro does NOT extract the element groups into EGU32x4_t
+//    variables. It is intended for uses where there is a more natural
+//    type to use (e.g., EGU8x16_t). The type should still be a 128 bits
+//    wide type if extracted via 'P.VU.elt_group<Type>(...)'.
+//  - this macro offers the additional PRELOOP code block argument,
+//    that is executed once if the loop is going to be entered.
+//    This is intended for use with "vector scalar" instructions where
+//    we extract the first element group from one of the operands and
+//    use it for all loop iterations.
+//
+// IMPORTANT
+//  - This macro contains an invocation of 'require_element_groups_32x4;',
+//    since the "loop" macro correctness depends on invariants that
+//    are checked by the "require" macro.
+//  - This macro does not support masking, and contains an invocation
+//    of 'require_no_vmask;'.
+//  - Unlike the "VD_VS1_VS2" loop macros, this macro is for two-operand
+//    "op vd, vs2" instructions: it does not read the 'vs1' field and does
+//    not declare a 'vs1_num' variable. EG_BODY is responsible for
+//    extracting any element group it needs from 'vd' and 'vs2'.
+//
+// Invokes three statement blocks:
+//  - PRELUDE, invoked once, before any element group. It is executed even
+//    if the vector is empty. It is placed in a "do { } while (0);", hence
+//    any variable declared there is not visible outside.
+//  - PRELOOP, invoked once IF there is at least one element group to process.
+//    It is NOT placed in its own scope, variables declared in PRELOOP are
+//    visible when EG_BODY executes.
+//    Pass {} when there is no need for such a pre-loop block.
+//  - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in all statement blocks:
+//   'vd_num': register index of vd
+//   'vs2_num': register index of vs2
+//   'vstart_eg': index of the first element group, *in EG units*
+//   'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+//   'idx_eg': index of the current element group.
+//
+#define VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(PRELUDE, \
+                                                           PRELOOP, \
+                                                           EG_BODY) \
+  do { \
+    require_element_groups_32x4; \
+    require_no_vmask; \
+    const reg_t vd_num = insn.rd(); \
+    const reg_t vs2_num = insn.rs2(); \
+    const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+    const reg_t vl_eg = P.VU.vl->read() / 4; \
+    do { PRELUDE } while (0); \
+    if (vstart_eg < vl_eg) { \
+      PRELOOP \
+      for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+        EG_BODY \
+      } \
+    } \
+    P.VU.vstart->write(0); \
+  } while (0)
+
+// Processes all 32b*4 element groups available in the vector registers
+// vd, vs2.  This interprets the vectors as containing element groups
+// of 4 uint32_t values (EGW=128, EEW=32, EGS=4),
+// *ignoring* the current SEW that applies to the vectors.
+//
+// IMPORTANT
+//  - This macro contains an invocation of 'require_element_groups_32x4;',
+//    since the "loop" macro correctness depends on invariants that
+//    are checked by the "require" macro.
+//  - This macro does not support masking, and contains an invocation
+//    of 'require_no_vmask;'.
+//
+// Invokes two statement blocks:
+//  - PRELUDE, invoked once, before any element group. It is executed even
+//    if the vector is empty. It is placed in a "do { } while (0);", hence
+//    any variable declared there is not visible outside.
+//  - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+//   'vd_num': register index of vd
+//   'vs2_num': register index of vs2
+//   'vstart_eg': index of the first element group, *in EG units*
+//   'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+//   'idx_eg': index of the current element group.
+//   'vd': EGU32x4_t reference, mutable, content of the current
+//         element group in the 'vd' vector register / register group.
+//   'vs2': EGU32x4_t, content of the current element group
+//          in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(PRELUDE, EG_BODY) \
+  do { \
+    require_element_groups_32x4; \
+    require_no_vmask; \
+    const reg_t vd_num = insn.rd(); \
+    const reg_t vs2_num = insn.rs2(); \
+    const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+    const reg_t vl_eg = P.VU.vl->read() / 4; \
+    do { PRELUDE } while (0); \
+    for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+      VV_VD_VS2_EGU32x4_PARAMS(vd_num, vs2_num, idx_eg); \
+      EG_BODY \
+    } \
+    P.VU.vstart->write(0); \
+  } while (0)
+
+// Processes all 32b*4 element groups available in the vector registers
+// vd, vs2, given the 'zimm5' immediate.  This interprets the vectors as
+// containing element groups of 4 uint32_t values (EGW=128, EEW=32, EGS=4),
+// *ignoring* the current SEW that applies to the vectors.
+//
+// IMPORTANT
+//  - This macro contains an invocation of 'require_element_groups_32x4;',
+//    since the "loop" macro correctness depends on invariants that
+//    are checked by the "require" macro.
+//  - This macro does not support masking, and contains an invocation
+//    of 'require_no_vmask;'.
+//
+// Invokes three statement blocks:
+//  - PRELUDE, invoked once, before any element group. It is executed even
+//    if the vector is empty. It is placed in a "do { } while (0);", hence
+//    any variable declared there is not visible outside.
+//  - PRELOOP, invoked once IF there is at least one element group to process.
+//    It is NOT placed in its own scope, variables declared in PRELOOP are
+//    visible when EG_BODY executes.
+//    Pass {} when there is no need for such a pre-loop block.
+//  - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in all statement blocks:
+//   'vd_num': register index of vd
+//   'vs2_num': register index of vs2
+//   'zimm5': 5 bits unsigned immediate
+//   'vstart_eg': index of the first element group, *in EG units*
+//   'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+//   'idx_eg': index of the current element group.
+//   'vd': EGU32x4_t reference, mutable, content of the current
+//         element group in the 'vd' vector register / register group.
+//   'vs2': EGU32x4_t, content of the current element group
+//          in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(PRELUDE, PRELOOP, EG_BODY) \
+  do { \
+    require_element_groups_32x4; \
+    require_no_vmask; \
+    const reg_t vd_num = insn.rd(); \
+    const reg_t vs2_num = insn.rs2(); \
+    const reg_t zimm5 = insn.v_zimm5(); \
+    const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+    const reg_t vl_eg = P.VU.vl->read() / 4; \
+    do { PRELUDE } while (0); \
+    if (vstart_eg < vl_eg) { \
+      PRELOOP \
+      for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+        VV_VD_VS2_EGU32x4_PARAMS(vd_num, vs2_num, idx_eg); \
+        EG_BODY \
+      } \
+    } \
+    P.VU.vstart->write(0); \
+  } while (0)
+
+// Processes all 32b*8 element groups available in the vector registers
+// vd, vs2, given the 'zimm5' immediate.  This interprets the vectors as
+// containing element groups of 8 uint32_t values (EGW=256, EEW=32, EGS=8),
+// *ignoring* the current SEW that applies to the vectors.
+//
+// IMPORTANT
+//  - This macro contains an invocation of 'require_element_groups_32x8;',
+//    since the "loop" macro correctness depends on invariants that
+//    are checked by the "require" macro.
+//  - This macro does not support masking, and contains an invocation
+//    of 'require_no_vmask;'.
+//
+// Invokes three statement blocks:
+//  - PRELUDE, invoked once, before any element group. It is executed even
+//    if the vector is empty. It is placed in a "do { } while (0);", hence
+//    any variable declared there is not visible outside.
+//  - PRELOOP, invoked once IF there is at least one element group to process.
+//    It is NOT placed in its own scope, variables declared in PRELOOP are
+//    visible when EG_BODY executes.
+//    Pass {} when there is no need for such a pre-loop block.
+//  - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in all statement blocks:
+//   'vd_num': register index of vd
+//   'vs2_num': register index of vs2
+//   'zimm5': unsigned 5 bits immediate
+//   'vstart_eg': index of the first element group, *in EG units*
+//   'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+//   'idx_eg': index of the current element group.
+//   'vd': EGU32x8_t reference, mutable, content of the current
+//         element group in the 'vd' vector register / register group.
+//   'vs2': EGU32x8_t, content of the current element group
+//          in the 'vs2' vector register / register group.
+//
+#define VI_ZVK_VD_VS2_ZIMM5_EGU32x8_NOVM_LOOP(PRELUDE, PRELOOP, EG_BODY) \
+  do { \
+    require_element_groups_32x8; \
+    require_no_vmask; \
+    const reg_t vd_num = insn.rd(); \
+    const reg_t vs2_num = insn.rs2(); \
+    const reg_t zimm5 = insn.v_zimm5(); \
+    const reg_t vstart_eg = P.VU.vstart->read() / 8; \
+    const reg_t vl_eg = P.VU.vl->read() / 8; \
+    do { PRELUDE } while (0); \
+    if (vstart_eg < vl_eg) { \
+      PRELOOP \
+      for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+        VV_VD_VS2_EGU32x8_PARAMS(vd_num, vs2_num, idx_eg); \
+        EG_BODY \
+      } \
+    } \
+    P.VU.vstart->write(0); \
+  } while (0)
+
+// Processes all 64b*4 element groups available in the vector registers
+// vd, vs1, and vs2.  This interprets the vectors as containing element groups
+// of 4 uint64_t values (EGW=256, EEW=64, EGS=4), *ignoring* the current
+// SEW that applies to the vectors.
+//
+// IMPORTANT
+//  - This macro contains an invocation of 'require_element_groups_64x4;',
+//    since the "loop" macro correctness depends on invariants that
+//    are checked by the "require" macro.
+//  - This macro does not support masking, and contains an invocation
+//    of 'require_no_vmask;'.
+//  - While the name states "VD_VS1_VS2", many vector instructions
+//    are specified as "op vd, vs2, vs1". This macro does not imply
+//    a specific operand order and can be used with both "op vd, vs2, vs1"
+//    and "op vd, vs1, vs2" instructions.
+//
+// Invokes two statement blocks:
+//  - PRELUDE, invoked once, before any element group. It is executed even
+//    if the vector is empty. It is placed in a "do { } while (0);", hence
+//    any variable declared there is not visible outside.
+//  - EG_BODY, once per element group.
+//
+// Declares the following variables available for use in both statement blocks:
+//   'vd_num': register index of vd
+//   'vs1_num': register index of vs1
+//   'vs2_num': register index of vs2
+//   'vstart_eg': index of the first element group, *in EG units*
+//   'vl_eg': length of the vector, *in EG units*
+//
+// The following variables are available in the EG_BODY block:
+//   'idx_eg': index of the current element group.
+//   'vd': EGU64x4_t reference, content of the current element group
+//         in the 'vd' vector register / vector register group.
+//   'vs1': EGU64x4_t, content of the current element group
+//         in the 'vs1' vector register / vector register group.
+//   'vs2': EGU64x4_t, content of the current element group
+//         in the 'vs2' vector register / vector register group.
+#define VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(PRELUDE, EG_BODY) \
+  do { \
+    require_element_groups_64x4; \
+    require_no_vmask; \
+    const reg_t vd_num = insn.rd(); \
+    const reg_t vs1_num = insn.rs1(); \
+    const reg_t vs2_num = insn.rs2(); \
+    const reg_t vstart_eg = P.VU.vstart->read() / 4; \
+    const reg_t vl_eg = P.VU.vl->read() / 4; \
+    do { PRELUDE } while (0); \
+    for (reg_t idx_eg = vstart_eg; idx_eg < vl_eg; ++idx_eg) { \
+      VV_VD_VS1_VS2_EGU64x4_PARAMS(vd_num, vs1_num, vs2_num, idx_eg); \
+      EG_BODY \
+    } \
+    P.VU.vstart->write(0); \
+  } while (0)
+
+
+// Loop macro for widening instructions taking parameters 'vd, vs2, vs1',
+// with logic processing elements one-at-a-time in those register groups
+// and treating the elements as unsigned integers.
+//
+// Invokes the BODY statement block once per element.
+// As a widening instruction, it is defined for SEW in {8, 16, 32}.
+// A separate copy of BODY is instantiated for each SEW value.
+//
+// Declares the following variables available for use in BODY:
+//  - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+//  - 'vs2', unsigned, SEW width, by value, constant.
+//  - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+//    a widened copy of 'vs2'.
+//  - 'vs1', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VV_WIDENING_ULOOP(BODY) \
+  do { \
+    VI_CHECK_DSS(true); \
+    VI_LOOP_BASE \
+      switch (sew) { \
+        case e8: { \
+          VI_ZVK_VV_WIDENING_U_PARAMS(e8); \
+          BODY \
+          break; \
+        } \
+        case e16: { \
+          VI_ZVK_VV_WIDENING_U_PARAMS(e16); \
+          BODY \
+          break; \
+        } \
+        case e32: { \
+          VI_ZVK_VV_WIDENING_U_PARAMS(e32); \
+          BODY \
+          break; \
+        } \
+      } \
+    VI_LOOP_END \
+  } while (0)
+
+// Loop macro for widening instructions taking parameters 'vd, vs2, rs1',
+// with logic processing elements one-at-a-time in those register groups
+// and treating the elements as unsigned integers.
+//
+// Invokes the BODY statement block once per element.
+// As a widening instruction, it is defined for SEW in {8, 16, 32}.
+// A separate copy of BODY is instantiated for each SEW value.
+//
+// Declares the following variables available for use in BODY:
+//  - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+//  - 'vs2', unsigned, SEW width, by value, constant.
+//  - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+//    a widened copy of 'vs2'.
+//  - 'rs1', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VX_WIDENING_ULOOP(BODY) \
+  do { \
+    VI_CHECK_DSS(true); \
+    VI_LOOP_BASE \
+      switch (sew) { \
+        case e8: { \
+          VI_ZVK_VX_WIDENING_U_PARAMS(e8); \
+          BODY \
+          break; \
+        } \
+        case e16: { \
+          VI_ZVK_VX_WIDENING_U_PARAMS(e16); \
+          BODY \
+          break; \
+        } \
+        case e32: { \
+          VI_ZVK_VX_WIDENING_U_PARAMS(e32); \
+          BODY \
+          break; \
+        } \
+      } \
+    VI_LOOP_END \
+  } while (0)
+
+// Loop macro for widening instructions taking parameters 'vd, vs2, zimm5',
+// with logic processing elements one-at-a-time in those register groups
+// and treating the elements as unsigned integers.
+//
+// Invokes the BODY statement block once per element.
+// As a widening instruction, it is defined for SEW in {8, 16, 32}.
+// A separate copy of BODY is instantiated for each SEW value.
+//
+// Declares the following variables available for use in BODY:
+//  - 'vd_w', unsigned, 2 * SEW width, by reference, mutable.
+//  - 'vs2', unsigned, SEW width, by value, constant.
+//  - 'vs2_w', unsigned, 2 * SEW width, by value, constant,
+//    a widened copy of 'vs2'.
+//  - 'zimm5', unsigned, SEW width, by value, constant.
+#define VI_ZVK_VI_WIDENING_ULOOP(BODY) \
+  do { \
+    VI_CHECK_DSS(true); \
+    VI_LOOP_BASE \
+      switch (sew) { \
+        case e8: { \
+          VI_ZVK_VI_WIDENING_U_PARAMS(e8); \
+          BODY \
+          break; \
+        } \
+        case e16: { \
+          VI_ZVK_VI_WIDENING_U_PARAMS(e16); \
+          BODY \
+          break; \
+        } \
+        case e32: { \
+          VI_ZVK_VI_WIDENING_U_PARAMS(e32); \
+          BODY \
+          break; \
+        } \
+      } \
+    VI_LOOP_END \
+  } while (0)
+
+//
+// Element Group Manipulation Macros
+//
+
+// Extracts 4 uint32_t words from the input EGU32x4_t value
+// into the (mutable) variables named by the W arguments, provided in
+// "Little Endian" (LE) order, i.e., from the least significant (W0)
+// to the most significant (W3).
+#define EXTRACT_EGU32x4_WORDS_LE(X, W0, W1, W2, W3) \
+  uint32_t W0 = (X)[0]; \
+  uint32_t W1 = (X)[1]; \
+  uint32_t W2 = (X)[2]; \
+  uint32_t W3 = (X)[3]; \
+  (void)(0)
+
+// Sets the elements words of given EGU32x4_t variable 'X' to
+// the given 4 uint32_t values provided in "Little Endian" (LE)
+// order, i.e., from the least significant (W0) to the most
+// significant (W3).
+#define SET_EGU32x4_LE(X, W0, W1, W2, W3) \
+  do { \
+    (X)[0] = (W0); \
+    (X)[1] = (W1); \
+    (X)[2] = (W2); \
+    (X)[3] = (W3); \
+  } while (0)
+
+// Extracts 4 uint32_t words from the input EGU32x4_t value
+// into the (mutable) variables named by the W arguments, provided in
+// "Big Endian" (BE) order, i.e., from the most significant (W3)
+// to the least significant (W0).
+#define EXTRACT_EGU32x4_WORDS_BE(X, W3, W2, W1, W0) \
+  uint32_t W0 = (X)[0]; \
+  uint32_t W1 = (X)[1]; \
+  uint32_t W2 = (X)[2]; \
+  uint32_t W3 = (X)[3]; \
+  (void)(0)
+
+// Sets the elements words of given EGU32x4_t variable 'X' to
+// the given 4 uint32_t values provided in "Big Endian" (BE)
+// order, i.e., from the most significant (W3) to the least
+// significant (W0).
+#define SET_EGU32x4_BE(X, W3, W2, W1, W0) \
+  do { \
+    (X)[0] = (W0); \
+    (X)[1] = (W1); \
+    (X)[2] = (W2); \
+    (X)[3] = (W3); \
+  } while (0)
+
+// Byte-swap the bytes of a uint32_t such that the order of bytes
+// is reversed.
+#define ZVK_BSWAP32(x) \
+  ((((uint32_t)((x) >> 24)) & 0xFF) <<  0 | \
+   (((uint32_t)((x) >> 16)) & 0xFF) <<  8 | \
+   (((uint32_t)((x) >>  8)) & 0xFF) << 16 | \
+   (((uint32_t)((x) >>  0)) & 0xFF) << 24)
+
+// Extracts 8 uint32_t words from the input EGU32x8_t value
+// into the (mutable) variables named by the W arguments, provided in
+// "Big Endian" (BE) order, i.e., from the most significant (W7)
+// to the least significant (W0). Each of the words is byte-swapped,
+// from a big-endian representation in the EGU32x8_t to a native/little-endian
+// ordering in the variables.
+#define EXTRACT_EGU32x8_WORDS_BE_BSWAP(X, W7, W6, W5, W4, W3, W2, W1, W0) \
+  uint32_t W0 = ZVK_BSWAP32((X)[0]); \
+  uint32_t W1 = ZVK_BSWAP32((X)[1]); \
+  uint32_t W2 = ZVK_BSWAP32((X)[2]); \
+  uint32_t W3 = ZVK_BSWAP32((X)[3]); \
+  uint32_t W4 = ZVK_BSWAP32((X)[4]); \
+  uint32_t W5 = ZVK_BSWAP32((X)[5]); \
+  uint32_t W6 = ZVK_BSWAP32((X)[6]); \
+  uint32_t W7 = ZVK_BSWAP32((X)[7]); \
+  (void)(0)
+
+// Sets the elements words of given EGU32x8_t variable 'X' to
+// the given 8 uint32_t values provided in "Big Endian" (BE)
+// order, i.e., from the most significant (W7) to the least
+// significant (W0). Each of the words is byte-swapped,
+// from a native/little-endian ordering in the variables to
+// a big-endian representation in the EGU32x8_t.
+#define SET_EGU32x8_WORDS_BE_BSWAP(X, W7, W6, W5, W4, W3, W2, W1, W0) \
+  do { \
+    (X)[0] = ZVK_BSWAP32(W0); \
+    (X)[1] = ZVK_BSWAP32(W1); \
+    (X)[2] = ZVK_BSWAP32(W2); \
+    (X)[3] = ZVK_BSWAP32(W3); \
+    (X)[4] = ZVK_BSWAP32(W4); \
+    (X)[5] = ZVK_BSWAP32(W5); \
+    (X)[6] = ZVK_BSWAP32(W6); \
+    (X)[7] = ZVK_BSWAP32(W7); \
+  } while (0)
+
+// Extracts 4 uint64_t words from the input EGU64x4_t value
+// into the (mutable) variables named by the W arguments, provided in
+// "Big Endian" (BE) order, i.e., from the most significant (W3)
+// to the least significant (W0).
+#define EXTRACT_EGU64x4_WORDS_BE(X, W3, W2, W1, W0) \
+  uint64_t W0 = (X)[0]; \
+  uint64_t W1 = (X)[1]; \
+  uint64_t W2 = (X)[2]; \
+  uint64_t W3 = (X)[3]; \
+  (void)(0)
+
+// Sets the elements words of given EGU64x4_t variable 'X' to
+// the given 4 uint64_t values provided in "Big Endian" (BE)
+// order, i.e., from the most significant (W3) to the least
+// significant (W0).
+#define SET_EGU64x4_BE(X, W3, W2, W1, W0) \
+  do { \
+    (X)[0] = (W0); \
+    (X)[1] = (W1); \
+    (X)[2] = (W2); \
+    (X)[3] = (W3); \
+  } while (0)
+
+// Copies a EGU8x16_t value from 'SRC' into 'DST'.
+#define EGU8x16_COPY(DST, SRC) \
+  for (std::size_t bidx = 0; bidx < 16; ++bidx) { \
+    (DST)[bidx] = (SRC)[bidx]; \
+  }
+
+// Performs  "MUT_A ^= CONST_B;", i.e., xor of the bytes
+// in A (mutated) with the bytes in B (unchanged).
+#define EGU8x16_XOREQ(MUT_A, CONST_B) \
+  for (std::size_t bidx = 0; bidx < 16; ++bidx) { \
+    (MUT_A)[bidx] ^= (CONST_B)[bidx]; \
+  }
+
+// Performs  "MUT_A ^= CONST_B;", i.e., xor of the bytes
+// in A (mutated) with the bytes in B (unchanged).
+#define EGU32x4_XOREQ(MUT_A, CONST_B) \
+  for (std::size_t bidx = 0; bidx < 4; ++bidx) { \
+    (MUT_A)[bidx] ^= (CONST_B)[bidx]; \
+  }
+
+// Performs  "DST = A ^ B;", i.e., DST (overwritten) receives
+// the xor of the bytes in A and B (both unchanged).
+#define EGU8x16_XOR(DST, A, B) \
+  for (std::size_t bidx = 0; bidx < 16; ++bidx) { \
+    (DST)[bidx] = (A)[bidx] ^ (B)[bidx]; \
+  }
+
+//
+// Common bit manipulations logic.
+//
+
+// Form a 64 bit integer with bit X set
+#define ZVK_BIT(X) (1ULL << (X))
+
+// Reverse the order of bits within bytes of a word.
+// This is used to match the data interpretation in NIST SP 800-38D
+// a.k.a the GCM specification.
+#define ZVK_BREV8_32(X) \
+  do { \
+    (X) = (((X) & 0x55555555) << 1) | (((X) & 0xaaaaaaaa) >> 1); \
+    (X) = (((X) & 0x33333333) << 2) | (((X) & 0xcccccccc) >> 2); \
+    (X) = (((X) & 0x0f0f0f0f) << 4) | (((X) & 0xf0f0f0f0) >> 4); \
+  } while (0)
+
+// Rotates right a uint32_t value by N bits.
+//   uint32_t ROR32(uint32_t X, std::size_t N);
+#define ZVK_ROR32(X, N) rotate_right<uint32_t>((X), (N))
+
+// Rotates right a uint64_t value by N bits.
+//   uint64_t ROR64(uint64_t X, std::size_t N);
+#define ZVK_ROR64(X, N) rotate_right<uint64_t>((X), (N))
+
+// Rotates left a uint32_t value by N bits.
+//   uint32_t ROL32(uint32_t X, std::size_t N);
+#define ZVK_ROL32(X, N) rotate_left<uint32_t>((X), (N))
+
+//
+// Element Group Bit Manipulation Macros
+//
+
+// Performs bit reversal in a EGU32x4_t group.
+#define EGU32x4_BREV8(X) \
+  for (std::size_t bidx = 0; bidx < 4; ++bidx) { \
+    ZVK_BREV8_32((X)[bidx]); \
+  }
+
+// Checks if a given bit is set within a EGU32x4_t group.
+// Assumes LE ordering.
+#define EGU32x4_ISSET(X, BIDX) \
+  (((X)[(BIDX) / 32] & ZVK_BIT((BIDX) % 32)) != 0)
+
+// Shifts an EGU32x4_t group left by one bit, in place, treating X[0] as the
+// least significant word; the bit shifted out of X[3] is discarded.
+//
+// The 128-bit value is processed as two 64-bit halves held in a temporary:
+// the high half first, so that bit 31 of X[1] can still be read as its carry.
+#define EGU32x4_LSHIFT(X) \
+  do { \
+    uint64_t dword; \
+    dword = ((uint64_t)(X)[3]) << 32; \
+    dword |= (X)[2]; \
+    dword <<= 1; \
+    if ((X)[1] & ZVK_BIT(31)) { \
+      dword |= ZVK_BIT(0); \
+    } \
+    (X)[2] = dword & UINT32_MAX; \
+    (X)[3] = dword >> 32; \
+    dword = ((uint64_t)(X)[1]) << 32; \
+    dword |= (X)[0]; \
+    dword <<= 1; \
+    (X)[0] = dword & UINT32_MAX; \
+    (X)[1] = dword >> 32; \
+  } while (0)
+
+#endif  // RISCV_ZVK_EXT_MACROS_H_

From e87038ee5e6545a5149cdf4334d220f951534f30 Mon Sep 17 00:00:00 2001
From: Eric Gouriou <ego@rivosinc.com>
Date: Thu, 1 Jun 2023 18:06:55 -0700
Subject: [PATCH 078/110] Zvk: Implement Zvbb, Vector Bit-manipulation for
 Cryptography

Implement the proposed instructions in Zvbb:
 - vandn.{vv,vx}, vector bitwise and-not
 - vbrev.v, vector bit reverse in element
 - vbrev8.v, vector bit reverse in bytes
 - vrev8.v, vector byte reverse
 - vctz.v, vector count trailing zeros
 - vclz.v, vector count leading zeros
 - vcpop.v, vector population count
 - vrol.{vv,vx}, vector rotate left
 - vror.{vi,vv,vx}, vector rotate right
 - vwsll.{vi,vv,vx} vector widening shift left logical

A new instruction field, 'zimm6', is introduced, encoded
in bits [15, 19] and [26]. It is used by "vror.vi" to encode
a shift immediate in [0, 63].

Co-authored-by: Raghav Gupta <rgupta@rivosinc.com>
Co-authored-by: Stanislaw Kardach <kda@semihalf.com>
Signed-off-by: Eric Gouriou <ego@rivosinc.com>
---
 riscv/decode.h         |  1 +
 riscv/insns/vandn_vv.h | 10 ++++++++++
 riscv/insns/vandn_vx.h | 10 ++++++++++
 riscv/insns/vbrev8_v.h | 13 +++++++++++++
 riscv/insns/vbrev_v.h  | 24 ++++++++++++++++++++++++
 riscv/insns/vclz_v.h   | 16 ++++++++++++++++
 riscv/insns/vcpop_v.h  | 16 ++++++++++++++++
 riscv/insns/vctz_v.h   | 16 ++++++++++++++++
 riscv/insns/vrev8_v.h  | 16 ++++++++++++++++
 riscv/insns/vrol_vv.h  | 17 +++++++++++++++++
 riscv/insns/vrol_vx.h  | 18 ++++++++++++++++++
 riscv/insns/vror_vi.h  | 18 ++++++++++++++++++
 riscv/insns/vror_vv.h  | 17 +++++++++++++++++
 riscv/insns/vror_vx.h  | 18 ++++++++++++++++++
 riscv/insns/vwsll_vi.h | 10 ++++++++++
 riscv/insns/vwsll_vv.h | 10 ++++++++++
 riscv/insns/vwsll_vx.h | 10 ++++++++++
 riscv/riscv.mk.in      | 22 ++++++++++++++++++++++
 18 files changed, 262 insertions(+)
 create mode 100644 riscv/insns/vandn_vv.h
 create mode 100644 riscv/insns/vandn_vx.h
 create mode 100644 riscv/insns/vbrev8_v.h
 create mode 100644 riscv/insns/vbrev_v.h
 create mode 100644 riscv/insns/vclz_v.h
 create mode 100644 riscv/insns/vcpop_v.h
 create mode 100644 riscv/insns/vctz_v.h
 create mode 100644 riscv/insns/vrev8_v.h
 create mode 100644 riscv/insns/vrol_vv.h
 create mode 100644 riscv/insns/vrol_vx.h
 create mode 100644 riscv/insns/vror_vi.h
 create mode 100644 riscv/insns/vror_vv.h
 create mode 100644 riscv/insns/vror_vx.h
 create mode 100644 riscv/insns/vwsll_vi.h
 create mode 100644 riscv/insns/vwsll_vv.h
 create mode 100644 riscv/insns/vwsll_vx.h

diff --git a/riscv/decode.h b/riscv/decode.h
index dad32a1e31..cd1c0a1222 100644
--- a/riscv/decode.h
+++ b/riscv/decode.h
@@ -140,6 +140,7 @@ class insn_t
   uint64_t v_vta() { return x(26, 1); }
   uint64_t v_vma() { return x(27, 1); }
   uint64_t v_mew() { return x(28, 1); }
+  uint64_t v_zimm6() { return x(15, 5) + (x(26, 1) << 5); }
 
   uint64_t p_imm2() { return x(20, 2); }
   uint64_t p_imm3() { return x(20, 3); }
diff --git a/riscv/insns/vandn_vv.h b/riscv/insns/vandn_vv.h
new file mode 100644
index 0000000000..d85e47d7fe
--- /dev/null
+++ b/riscv/insns/vandn_vv.h
@@ -0,0 +1,10 @@
+// vandn.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_VV_LOOP
+({
+  vd = vs2 & (~vs1);
+})
diff --git a/riscv/insns/vandn_vx.h b/riscv/insns/vandn_vx.h
new file mode 100644
index 0000000000..1c66a40970
--- /dev/null
+++ b/riscv/insns/vandn_vx.h
@@ -0,0 +1,10 @@
+// vandn.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_VX_LOOP
+({
+  vd = vs2 & (~rs1);
+})
diff --git a/riscv/insns/vbrev8_v.h b/riscv/insns/vbrev8_v.h
new file mode 100644
index 0000000000..a6d3cda744
--- /dev/null
+++ b/riscv/insns/vbrev8_v.h
@@ -0,0 +1,13 @@
+// vbrev8.v vd, vs2, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+  vd = vs2;
+  vd = ((vd & 0x5555555555555555llu) <<  1) | ((vd & 0xAAAAAAAAAAAAAAAAllu) >>  1);
+  vd = ((vd & 0x3333333333333333llu) <<  2) | ((vd & 0xCCCCCCCCCCCCCCCCllu) >>  2);
+  vd = ((vd & 0x0F0F0F0F0F0F0F0Fllu) <<  4) | ((vd & 0xF0F0F0F0F0F0F0F0llu) >>  4);
+})
diff --git a/riscv/insns/vbrev_v.h b/riscv/insns/vbrev_v.h
new file mode 100644
index 0000000000..7f784c2231
--- /dev/null
+++ b/riscv/insns/vbrev_v.h
@@ -0,0 +1,24 @@
+// vbrev.v vd, vs2
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+  reg_t x = vs2;
+
+  // Reverse bits in bytes (vbrev8)
+  x = ((x & 0x5555555555555555llu) <<  1) | ((x & 0xAAAAAAAAAAAAAAAAllu) >>  1);
+  x = ((x & 0x3333333333333333llu) <<  2) | ((x & 0xCCCCCCCCCCCCCCCCllu) >>  2);
+  x = ((x & 0x0F0F0F0F0F0F0F0Fllu) <<  4) | ((x & 0xF0F0F0F0F0F0F0F0llu) >>  4);
+  // Re-order bytes (vrev8)
+  if (P.VU.vsew > 8)
+    x = ((x & 0x00FF00FF00FF00FFllu) <<  8) | ((x & 0xFF00FF00FF00FF00llu) >>  8);
+  if (P.VU.vsew > 16)
+    x = ((x & 0x0000FFFF0000FFFFllu) << 16) | ((x & 0xFFFF0000FFFF0000llu) >> 16);
+  if (P.VU.vsew > 32)
+    x = ((x & 0x00000000FFFFFFFFllu) << 32) | ((x & 0xFFFFFFFF00000000llu) >> 32);
+
+  vd = x;
+})
diff --git a/riscv/insns/vclz_v.h b/riscv/insns/vclz_v.h
new file mode 100644
index 0000000000..5f7f03c86c
--- /dev/null
+++ b/riscv/insns/vclz_v.h
@@ -0,0 +1,16 @@
+// vclz.v vd, vs2
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+  unsigned int i = 0;  // number of zero bits seen so far, scanning from the MSB
+  for (; i < P.VU.vsew; ++i) {
+    if (1 & (vs2 >> (P.VU.vsew - 1 - i))) {
+      break;  // first set bit found
+    }
+  }
+  vd = i;  // equals vsew when vs2 has no set bit
+})
diff --git a/riscv/insns/vcpop_v.h b/riscv/insns/vcpop_v.h
new file mode 100644
index 0000000000..52b29c695c
--- /dev/null
+++ b/riscv/insns/vcpop_v.h
@@ -0,0 +1,16 @@
+// vcpop.v vd, vs2
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+  reg_t count = 0;
+  for (std::size_t i = 0; i < P.VU.vsew; ++i) {
+    if (1 & (vs2 >> i)) {
+      count++;
+    }
+  }
+  vd = count;
+})
diff --git a/riscv/insns/vctz_v.h b/riscv/insns/vctz_v.h
new file mode 100644
index 0000000000..b63dd019f7
--- /dev/null
+++ b/riscv/insns/vctz_v.h
@@ -0,0 +1,16 @@
+// vctz.v vd, vs2
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+  unsigned int i = 0;  // number of zero bits seen so far, scanning from the LSB
+  for (; i < P.VU.vsew; ++i) {
+    if (1 & (vs2 >> i)) {
+      break;  // first set bit found
+    }
+  }
+  vd = i;  // equals vsew when vs2 has no set bit
+})
diff --git a/riscv/insns/vrev8_v.h b/riscv/insns/vrev8_v.h
new file mode 100644
index 0000000000..f26c5a0502
--- /dev/null
+++ b/riscv/insns/vrev8_v.h
@@ -0,0 +1,16 @@
+// vrev8.v vd, vs2, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_V_ULOOP
+({
+  vd = vs2;
+  if (P.VU.vsew > 8)
+    vd = ((vd & 0x00FF00FF00FF00FFllu) <<  8) | ((vd & 0xFF00FF00FF00FF00llu) >>  8);
+  if (P.VU.vsew > 16)
+    vd = ((vd & 0x0000FFFF0000FFFFllu) << 16) | ((vd & 0xFFFF0000FFFF0000llu) >> 16);
+  if (P.VU.vsew > 32)
+    vd = ((vd & 0x00000000FFFFFFFFllu) << 32) | ((vd & 0xFFFFFFFF00000000llu) >> 32);
+})
diff --git a/riscv/insns/vrol_vv.h b/riscv/insns/vrol_vv.h
new file mode 100644
index 0000000000..fb2e483320
--- /dev/null
+++ b/riscv/insns/vrol_vv.h
@@ -0,0 +1,17 @@
+// vrol.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+// 'mask' selects the low log2(vsew) bits of the shift amount,
+// to limit the maximum shift to "vsew - 1" bits.
+const reg_t mask = P.VU.vsew - 1;
+
+VI_VV_ULOOP
+({
+  // For .vv, the shift amount comes from the vs1 element.
+  const reg_t lshift = vs1 & mask;
+  const reg_t rshift = (-lshift) & mask;
+  vd = (vs2 << lshift) | (vs2 >> rshift);
+})
diff --git a/riscv/insns/vrol_vx.h b/riscv/insns/vrol_vx.h
new file mode 100644
index 0000000000..b0c89a27b7
--- /dev/null
+++ b/riscv/insns/vrol_vx.h
@@ -0,0 +1,18 @@
+// vrol.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+// 'mask' selects the low log2(vsew) bits of the shift amount,
+// to limit the maximum shift to "vsew - 1" bits.
+const reg_t mask = P.VU.vsew - 1;
+
+// For .vx, the shift amount comes from rs1.
+const reg_t lshift = ((reg_t)RS1) & mask;
+const reg_t rshift = (-lshift) & mask;
+
+VI_V_ULOOP
+({
+  vd = (vs2 << lshift) | (vs2 >> rshift);
+})
diff --git a/riscv/insns/vror_vi.h b/riscv/insns/vror_vi.h
new file mode 100644
index 0000000000..1269c3d477
--- /dev/null
+++ b/riscv/insns/vror_vi.h
@@ -0,0 +1,18 @@
+// vror.vi vd, vs2, zimm6, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+// 'mask' selects the low log2(vsew) bits of the shift amount,
+// to limit the maximum shift to "vsew - 1" bits.
+const reg_t mask = P.VU.vsew - 1;
+
+// For .vi, the shift amount comes from bits [26,19-15].
+const reg_t rshift = insn.v_zimm6() & mask;
+const reg_t lshift = (-rshift) & mask;
+
+VI_V_ULOOP
+({
+  vd = (vs2 << lshift) | (vs2 >> rshift);
+})
diff --git a/riscv/insns/vror_vv.h b/riscv/insns/vror_vv.h
new file mode 100644
index 0000000000..c649c6d97f
--- /dev/null
+++ b/riscv/insns/vror_vv.h
@@ -0,0 +1,17 @@
+// vror.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+// 'mask' selects the low log2(vsew) bits of the shift amount,
+// to limit the maximum shift to "vsew - 1" bits.
+const reg_t mask = P.VU.vsew - 1;
+
+VI_VV_ULOOP
+({
+  // For .vv, the shift amount comes from the vs1 element.
+  const reg_t rshift = vs1 & mask;
+  const reg_t lshift = (-rshift) & mask;
+  vd = (vs2 << lshift) | (vs2 >> rshift);
+})
diff --git a/riscv/insns/vror_vx.h b/riscv/insns/vror_vx.h
new file mode 100644
index 0000000000..50c8e5c94a
--- /dev/null
+++ b/riscv/insns/vror_vx.h
@@ -0,0 +1,18 @@
+// vror.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+// 'mask' selects the low log2(vsew) bits of the shift amount,
+// to limit the maximum shift to "vsew - 1" bits.
+const reg_t mask = P.VU.vsew - 1;
+
+// For .vx, the shift amount comes from rs1.
+const reg_t rshift = ((reg_t)RS1) & mask;
+const reg_t lshift = (-rshift) & mask;
+
+VI_V_ULOOP
+({
+  vd = (vs2 << lshift) | (vs2 >> rshift);
+})
diff --git a/riscv/insns/vwsll_vi.h b/riscv/insns/vwsll_vi.h
new file mode 100644
index 0000000000..13b5eb4a5b
--- /dev/null
+++ b/riscv/insns/vwsll_vi.h
@@ -0,0 +1,10 @@
+// vwsll.vi vd, vs2, zimm5, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_ZVK_VI_WIDENING_ULOOP({
+  const reg_t shift = zimm5 & ((2 * sew) - 1);
+  vd_w = vs2_w << shift;
+});
diff --git a/riscv/insns/vwsll_vv.h b/riscv/insns/vwsll_vv.h
new file mode 100644
index 0000000000..5a64c6c06a
--- /dev/null
+++ b/riscv/insns/vwsll_vv.h
@@ -0,0 +1,10 @@
+// vwsll.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_ZVK_VV_WIDENING_ULOOP({
+  const reg_t shift = (vs1 & ((2 * sew) - 1));
+  vd_w = vs2_w << shift;
+});
diff --git a/riscv/insns/vwsll_vx.h b/riscv/insns/vwsll_vx.h
new file mode 100644
index 0000000000..5264e80eac
--- /dev/null
+++ b/riscv/insns/vwsll_vx.h
@@ -0,0 +1,10 @@
+// vwsll.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbb;
+
+VI_ZVK_VX_WIDENING_ULOOP({
+  const reg_t shift = (rs1 & ((2 * sew) - 1));
+  vd_w = vs2_w << shift;
+});
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index 3b493a06c8..4aa23e3c1c 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1340,6 +1340,27 @@ riscv_insn_ext_zacas = \
 	amocas_d \
 	$(if $(HAVE_INT128),amocas_q)
 
+riscv_insn_ext_zvbb = \
+	vandn_vv \
+	vandn_vx \
+	vbrev8_v \
+	vbrev_v \
+	vclz_v \
+	vcpop_v \
+	vctz_v \
+	vrev8_v \
+	vrol_vv \
+	vrol_vx \
+	vror_vi \
+	vror_vv \
+	vror_vx \
+	vwsll_vi \
+	vwsll_vv \
+	vwsll_vx \
+
+riscv_insn_ext_zvk = \
+	$(riscv_insn_ext_zvbb) \
+
 riscv_insn_list = \
 	$(if $(HAVE_INT128),$(riscv_insn_ext_v),) \
 	$(riscv_insn_ext_a) \
@@ -1363,6 +1384,7 @@ riscv_insn_list = \
 	$(riscv_insn_ext_zfh) \
 	$(riscv_insn_ext_zfh_zfa) \
 	$(riscv_insn_ext_zicond) \
+	$(riscv_insn_ext_zvk) \
 	$(riscv_insn_priv) \
 	$(riscv_insn_smrnmi) \
 	$(riscv_insn_svinval) \

From d633af2b180391b6f73f84f56d8b305a3af7c152 Mon Sep 17 00:00:00 2001
From: Eric Gouriou <ego@rivosinc.com>
Date: Thu, 1 Jun 2023 18:07:04 -0700
Subject: [PATCH 079/110] Zvk: Implement Zvbc extension, vector carryless
 multiplication

Implement the Zvbc instructions
- vclmul.{vv,vx}, vector carryless multiply low
- vclmulh.{vv,vx}, vector carryless multiply high

Signed-off-by: Eric Gouriou <ego@rivosinc.com>
---
 riscv/insns/vclmul_vv.h  | 20 ++++++++++++++++++++
 riscv/insns/vclmul_vx.h  | 20 ++++++++++++++++++++
 riscv/insns/vclmulh_vv.h | 20 ++++++++++++++++++++
 riscv/insns/vclmulh_vx.h | 20 ++++++++++++++++++++
 riscv/riscv.mk.in        |  7 +++++++
 5 files changed, 87 insertions(+)
 create mode 100644 riscv/insns/vclmul_vv.h
 create mode 100644 riscv/insns/vclmul_vx.h
 create mode 100644 riscv/insns/vclmulh_vv.h
 create mode 100644 riscv/insns/vclmulh_vx.h

diff --git a/riscv/insns/vclmul_vv.h b/riscv/insns/vclmul_vv.h
new file mode 100644
index 0000000000..8957738adc
--- /dev/null
+++ b/riscv/insns/vclmul_vv.h
@@ -0,0 +1,20 @@
+// vclmul.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbc;
+require(P.VU.vsew == 64);
+
+VI_VV_ULOOP
+({
+  // Perform a carryless multiplication 64bx64b on each 64b element,
+  // return the low 64b of the 128b product.
+  //   <https://en.wikipedia.org/wiki/Carry-less_product>
+  vd = 0;
+  for (std::size_t bit_idx = 0; bit_idx < sew; ++bit_idx) {
+    const reg_t mask = ((reg_t) 1) << bit_idx;
+    if ((vs1 & mask) != 0) {
+      vd ^= vs2 << bit_idx;
+    }
+  }
+})
diff --git a/riscv/insns/vclmul_vx.h b/riscv/insns/vclmul_vx.h
new file mode 100644
index 0000000000..1df7a3a2a4
--- /dev/null
+++ b/riscv/insns/vclmul_vx.h
@@ -0,0 +1,20 @@
+// vclmul.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbc;
+require(P.VU.vsew == 64);
+
+VI_VX_ULOOP
+({
+  // Perform a carryless multiplication 64bx64b on each 64b element,
+  // return the low 64b of the 128b product.
+  //   <https://en.wikipedia.org/wiki/Carry-less_product>
+  vd = 0;
+  for (std::size_t bit_idx = 0; bit_idx < sew; ++bit_idx) {
+    const reg_t mask = ((reg_t) 1) << bit_idx;
+    if ((rs1 & mask) != 0) {
+        vd ^= vs2 << bit_idx;
+    }
+  }
+})
diff --git a/riscv/insns/vclmulh_vv.h b/riscv/insns/vclmulh_vv.h
new file mode 100644
index 0000000000..6a54bcfaa6
--- /dev/null
+++ b/riscv/insns/vclmulh_vv.h
@@ -0,0 +1,20 @@
+// vclmulh.vv vd, vs2, vs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbc;
+require(P.VU.vsew == 64);
+
+VI_VV_ULOOP
+({
+  // Carryless 64bx64b multiply per element, keeping the high 64b of the 128b
+  // product <https://en.wikipedia.org/wiki/Carry-less_product>. Bit 0 of vs1
+  // only feeds the low 64b, so start at bit 1 (keeps the shift below sew).
+  vd = 0;
+  for (std::size_t bit_idx = 1; bit_idx < sew; ++bit_idx) {
+    const reg_t mask = ((reg_t) 1) << bit_idx;
+    if ((vs1 & mask) != 0) {
+      vd ^= ((reg_t)vs2) >> (sew - bit_idx);
+    }
+  }
+})
diff --git a/riscv/insns/vclmulh_vx.h b/riscv/insns/vclmulh_vx.h
new file mode 100644
index 0000000000..e874d1df68
--- /dev/null
+++ b/riscv/insns/vclmulh_vx.h
@@ -0,0 +1,20 @@
+// vclmulh.vx vd, vs2, rs1, vm
+
+#include "zvk_ext_macros.h"
+
+require_zvbc;
+require(P.VU.vsew == 64);
+
+VI_VX_ULOOP
+({
+  // Carryless 64bx64b multiply per element, keeping the high 64b of the 128b
+  // product <https://en.wikipedia.org/wiki/Carry-less_product>. Bit 0 of rs1
+  // only feeds the low 64b, so start at bit 1 (keeps the shift below sew).
+  vd = 0;
+  for (std::size_t bit_idx = 1; bit_idx < sew; ++bit_idx) {
+    const reg_t mask = ((reg_t) 1) << bit_idx;
+    if ((rs1 & mask) != 0) {
+      vd ^= ((reg_t)vs2) >> (sew - bit_idx);
+    }
+  }
+})
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index 4aa23e3c1c..dcf2640600 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1358,8 +1358,15 @@ riscv_insn_ext_zvbb = \
 	vwsll_vv \
 	vwsll_vx \
 
+riscv_insn_ext_zvbc = \
+	vclmul_vv \
+	vclmul_vx \
+	vclmulh_vv \
+	vclmulh_vx \
+
 riscv_insn_ext_zvk = \
 	$(riscv_insn_ext_zvbb) \
+	$(riscv_insn_ext_zvbc) \
 
 riscv_insn_list = \
 	$(if $(HAVE_INT128),$(riscv_insn_ext_v),) \

From fbd4ca2eef884b6835e848d761b3e375a66fc47a Mon Sep 17 00:00:00 2001
From: Eric Gouriou <ego@rivosinc.com>
Date: Thu, 1 Jun 2023 18:07:22 -0700
Subject: [PATCH 080/110] Zvk: Implement Zvkg, Vector GCM/GMAC instruction

Implement the proposed instruction in Zvkg, vghmac.vv,
Vector Carryless Multiply Accumulate over GHASH Galois-Field.

The instruction performs one step of GHASH routine as described
in "NIST Special Publication 800-38D" a.k.a the AES-GCM specification.
The logic was written to closely track the pseudo-code
in the Zvk specification.

Signed-off-by: Eric Gouriou <ego@rivosinc.com>
Co-authored-by: Kornel Duleba <mindal@semihalf.com>
Signed-off-by: Eric Gouriou <ego@rivosinc.com>
---
 riscv/insns/vghsh_vv.h | 38 ++++++++++++++++++++++++++++++++++++++
 riscv/insns/vgmul_vv.h | 32 ++++++++++++++++++++++++++++++++
 riscv/riscv.mk.in      |  5 +++++
 riscv/zvk_ext_macros.h | 16 ++++++++++++++--
 4 files changed, 89 insertions(+), 2 deletions(-)
 create mode 100644 riscv/insns/vghsh_vv.h
 create mode 100644 riscv/insns/vgmul_vv.h

diff --git a/riscv/insns/vghsh_vv.h b/riscv/insns/vghsh_vv.h
new file mode 100644
index 0000000000..bcbfe74f33
--- /dev/null
+++ b/riscv/insns/vghsh_vv.h
@@ -0,0 +1,38 @@
+// vghsh.vv vd, vs2, vs1
+
+#include "zvk_ext_macros.h"
+
+require_zvkg;
+require(P.VU.vsew == 32);
+require_egw_fits(128);
+
+VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
+  {},
+  {
+    EGU32x4_t Y = vd;   // Current partial hash
+    EGU32x4_t X = vs1;  // Block cipher output
+    EGU32x4_t H = vs2;  // Hash subkey
+
+    EGU32x4_BREV8(H);
+    EGU32x4_t Z = {};
+
+    // S = brev8(Y ^ X)
+    EGU32x4_t S;
+    EGU32x4_XOR(S, Y, X);
+    EGU32x4_BREV8(S);
+
+    for (int bit = 0; bit < 128; bit++) {
+      if (EGU32x4_ISSET(S, bit)) {
+        EGU32x4_XOREQ(Z, H);
+      }
+
+      const bool reduce = EGU32x4_ISSET(H, 127);
+      EGU32x4_LSHIFT(H);  // Left shift by 1.
+      if (reduce) {
+        H[0] ^= 0x87; // Reduce using x^7 + x^2 + x^1 + 1 polynomial
+      }
+    }
+    EGU32x4_BREV8(Z);
+    vd = Z;
+  }
+);
diff --git a/riscv/insns/vgmul_vv.h b/riscv/insns/vgmul_vv.h
new file mode 100644
index 0000000000..820b396e04
--- /dev/null
+++ b/riscv/insns/vgmul_vv.h
@@ -0,0 +1,32 @@
+// vgmul.vv vd, vs2
+
+#include "zvk_ext_macros.h"
+
+require_zvkg;
+require(P.VU.vsew == 32);
+require_egw_fits(128);
+
+VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(
+  {},
+  {
+    EGU32x4_t Y = vd;  // Multiplier
+    EGU32x4_BREV8(Y);
+    EGU32x4_t H = vs2;  // Multiplicand
+    EGU32x4_BREV8(H);
+    EGU32x4_t Z = {};  // Product accumulator, written back to vd at the end
+
+    for (int bit = 0; bit < 128; bit++) {
+      if (EGU32x4_ISSET(Y, bit)) {
+        EGU32x4_XOREQ(Z, H);
+      }
+
+      bool reduce = EGU32x4_ISSET(H, 127);  // Top bit, sampled before the shift
+      EGU32x4_LSHIFT(H);  // Left shift by 1
+      if (reduce) {
+        H[0] ^= 0x87; // Reduce using x^7 + x^2 + x^1 + 1 polynomial
+      }
+    }
+    EGU32x4_BREV8(Z);
+    vd = Z;
+  }
+);
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index dcf2640600..5562c0956d 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1364,9 +1364,14 @@ riscv_insn_ext_zvbc = \
 	vclmulh_vv \
 	vclmulh_vx \
 
+riscv_insn_ext_zvkg= \
+	vghsh_vv \
+	vgmul_vv \
+
 riscv_insn_ext_zvk = \
 	$(riscv_insn_ext_zvbb) \
 	$(riscv_insn_ext_zvbc) \
+	$(riscv_insn_ext_zvkg) \
 
 riscv_insn_list = \
 	$(if $(HAVE_INT128),$(riscv_insn_ext_v),) \
diff --git a/riscv/zvk_ext_macros.h b/riscv/zvk_ext_macros.h
index 7efbac806f..bf893f9f12 100644
--- a/riscv/zvk_ext_macros.h
+++ b/riscv/zvk_ext_macros.h
@@ -942,8 +942,8 @@
 // Performs  "MUT_A ^= CONST_B;", i.e., xor of the bytes
 // in A (mutated) with the bytes in B (unchanged).
 #define EGU32x4_XOREQ(MUT_A, CONST_B) \
-  for (std::size_t bidx = 0; bidx < 4; ++bidx) { \
-    (MUT_A)[bidx] ^= (CONST_B)[bidx]; \
+  for (std::size_t idx = 0; idx < 4; ++idx) { \
+    (MUT_A)[idx] ^= (CONST_B)[idx]; \
   }
 
 // Performs  "DST = A ^ B;", i.e., DST (overwritten) receives
@@ -953,6 +953,18 @@
     (DST)[bidx] = (A)[bidx] ^ (B)[bidx]; \
   }
 
+// Performs  "DST = A ^ B;", i.e., DST (overwritten) receives
+// the xor of the bytes in A and B (both unchanged).
+#define EGU32x4_XOR(DST, A, B) \
+  do { \
+    static_assert(std::is_same<EGU32x4_t, decltype(A)>::value); \
+    static_assert(std::is_same<EGU32x4_t, decltype(B)>::value); \
+    static_assert(std::is_same<EGU32x4_t, decltype(DST)>::value); \
+    for (std::size_t idx = 0; idx < 4; ++idx) { \
+      (DST)[idx] = (A)[idx] ^ (B)[idx]; \
+    } \
+  } while (0)
+
 //
 // Common bit manipulations logic.
 //

From 00873aa61acae4a17c1d269cddf1885e83b50102 Mon Sep 17 00:00:00 2001
From: Eric Gouriou <ego@rivosinc.com>
Date: Thu, 1 Jun 2023 18:07:32 -0700
Subject: [PATCH 081/110] Zvk: Implement Zvknh[ab], NIST Suite: Vector SHA-2

Implement the instructions part of the Zvknha and Zvknhb
sub-extensions:
 - vsha2ms.vv, message schedule
 - vsha2ch.vv / vsha2cl.vv, compression rounds

A header file for common macros is added.

Signed-off-by: Eric Gouriou <ego@rivosinc.com>
---
 riscv/insns/vsha2ch_vv.h |  61 +++++++++++++++
 riscv/insns/vsha2cl_vv.h |  62 ++++++++++++++++
 riscv/insns/vsha2ms_vv.h |  63 ++++++++++++++++
 riscv/riscv.mk.in        |   7 ++
 riscv/zvknh_ext_macros.h | 155 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 348 insertions(+)
 create mode 100644 riscv/insns/vsha2ch_vv.h
 create mode 100644 riscv/insns/vsha2cl_vv.h
 create mode 100644 riscv/insns/vsha2ms_vv.h
 create mode 100644 riscv/zvknh_ext_macros.h

diff --git a/riscv/insns/vsha2ch_vv.h b/riscv/insns/vsha2ch_vv.h
new file mode 100644
index 0000000000..34c6e05fbc
--- /dev/null
+++ b/riscv/insns/vsha2ch_vv.h
@@ -0,0 +1,61 @@
+// vsha2ch.vv vd, vs2, vs1
+
+#include "zvknh_ext_macros.h"
+
+// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2.
+require_vsha2_common_constraints;
+
+switch (P.VU.vsew) {
+  case e32: {
+    require_vsha2_vsew32_constraints;
+
+    VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
+      {},
+      {
+        // {c, d, g, h} <- vd
+        EXTRACT_EGU32x4_WORDS_BE(vd, c, d, g, h);
+        // {a, b, e, f}  <- vs2
+        EXTRACT_EGU32x4_WORDS_BE(vs2, a, b, e, f);
+        // {kw3, kw2, kw1, kw0} <- vs1.  "kw" stands for K+W
+        EXTRACT_EGU32x4_WORDS_BE(vs1, kw3, kw2,
+                                 UNUSED _unused_kw1, UNUSED _unused_kw0);
+
+        ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw2);
+        ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw3);
+
+        // Update the destination register, vd <- {a, b, e, f}.
+        SET_EGU32x4_BE(vd, a, b, e, f);
+      }
+    );
+    break;
+  }
+
+  case e64: {
+    require_vsha2_vsew64_constraints;
+
+    VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(
+      {},
+      {
+        // {c, d, g, h} <- vd
+        EXTRACT_EGU64x4_WORDS_BE(vd, c, d, g, h);
+        // {a, b, e, f}  <- vs2
+        EXTRACT_EGU64x4_WORDS_BE(vs2, a, b, e, f);
+        // {kw3, kw2, kw1, kw0} <- vs1.  "kw" stands for K+W
+        EXTRACT_EGU64x4_WORDS_BE(vs1, kw3, kw2,
+                                 UNUSED _unused_kw1, UNUSED _unused_kw0);
+
+        ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw2);
+        ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw3);
+
+        // Update the destination register, vd <- {a, b, e, f}.
+        SET_EGU64x4_BE(vd, a, b, e, f);
+      }
+    );
+    break;
+  }
+
+  // 'require_vsha2_common_constraints' ensures that
+  // VSEW is either 32 or 64.
+  default:
+    require(false);
+}
diff --git a/riscv/insns/vsha2cl_vv.h b/riscv/insns/vsha2cl_vv.h
new file mode 100644
index 0000000000..4a1df0904b
--- /dev/null
+++ b/riscv/insns/vsha2cl_vv.h
@@ -0,0 +1,62 @@
+// vsha2cl.vv vd, vs2, vs1
+
+#include "zvknh_ext_macros.h"
+
+// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2.
+require_vsha2_common_constraints;
+
+switch (P.VU.vsew) {
+  case e32: {
+    require_vsha2_vsew32_constraints;
+
+    VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
+      {},
+      {
+        // {c, d, g, h} <- vd
+        EXTRACT_EGU32x4_WORDS_BE(vd, c, d, g, h);
+        // {a, b, e, f}  <- vs2
+        EXTRACT_EGU32x4_WORDS_BE(vs2, a, b, e, f);
+        // {kw3, kw2, kw1, kw0} <- vs1.  "kw" stands for K+W
+        EXTRACT_EGU32x4_WORDS_BE(vs1, UNUSED _unused_kw3, UNUSED _unused_kw2,
+                                 kw1, kw0);
+
+        ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw0);
+        ZVK_SHA256_COMPRESS(a, b, c, d, e, f, g, h, kw1);
+
+        // Update the destination register, vd <- {a, b, e, f}.
+        SET_EGU32x4_BE(vd, a, b, e, f);
+      }
+    );
+    break;
+  }
+
+  case e64: {
+    require_vsha2_vsew64_constraints;
+
+    VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(
+      {},
+      {
+        // {c, d, g, h} <- vd
+        EXTRACT_EGU64x4_WORDS_BE(vd, c, d, g, h);
+        // {a, b, e, f}  <- vs2
+        EXTRACT_EGU64x4_WORDS_BE(vs2, a, b, e, f);
+        // {kw3, kw2, kw1, kw0} <- vs1.  "kw" stands for K+W
+        EXTRACT_EGU64x4_WORDS_BE(vs1, UNUSED _unused_kw3, UNUSED _unused_kw2,
+                                 kw1, kw0);
+
+        ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw0);
+        ZVK_SHA512_COMPRESS(a, b, c, d, e, f, g, h, kw1);
+
+        // Update the destination register, vd <- {a, b, e, f}.
+        SET_EGU64x4_BE(vd, a, b, e, f);
+      }
+    );
+    break;
+  }
+
+  // 'require_vsha2_common_constraints' ensures that
+  // VSEW is either 32 or 64.
+  default:
+    require(false);
+}
+
diff --git a/riscv/insns/vsha2ms_vv.h b/riscv/insns/vsha2ms_vv.h
new file mode 100644
index 0000000000..8f1ca085ae
--- /dev/null
+++ b/riscv/insns/vsha2ms_vv.h
@@ -0,0 +1,63 @@
+// vsha2ms.vv vd, vs2, vs1
+
+#include "zvknh_ext_macros.h"
+
+// Ensures VSEW is 32 or 64, and vd doesn't overlap with either vs1 or vs2.
+require_vsha2_common_constraints;
+
+switch (P.VU.vsew) {
+  case e32: {
+    require_vsha2_vsew32_constraints;
+
+    VI_ZVK_VD_VS1_VS2_EGU32x4_NOVM_LOOP(
+      {},
+      {
+        // {w3, w2, w1, w0} <- vd
+        EXTRACT_EGU32x4_WORDS_BE(vd, w3, w2, w1, w0);
+        // {w11, w10, w9, w4} <- vs2
+        EXTRACT_EGU32x4_WORDS_BE(vs2, w11, w10, w9, w4);
+        // {w15, w14, w13, w12} <- vs1
+        EXTRACT_EGU32x4_WORDS_BE(vs1, w15, w14, UNUSED _unused_w13, w12);
+
+        const uint32_t w16 = ZVK_SHA256_SCHEDULE(w14,  w9, w1, w0);
+        const uint32_t w17 = ZVK_SHA256_SCHEDULE(w15, w10, w2, w1);
+        const uint32_t w18 = ZVK_SHA256_SCHEDULE(w16, w11, w3, w2);
+        const uint32_t w19 = ZVK_SHA256_SCHEDULE(w17, w12, w4, w3);
+
+        // Update the destination register.
+        SET_EGU32x4_BE(vd, w19, w18, w17, w16);;
+      }
+    );
+    break;
+  }
+
+  case e64: {
+    require_vsha2_vsew64_constraints;
+
+    VI_ZVK_VD_VS1_VS2_EGU64x4_NOVM_LOOP(
+      {},
+      {
+        // {w3, w2, w1, w0} <- vd
+        EXTRACT_EGU64x4_WORDS_BE(vd, w3, w2, w1, w0);
+        // {w11, w10, w9, w4} <- vs2
+        EXTRACT_EGU64x4_WORDS_BE(vs2, w11, w10, w9, w4);
+        // {w15, w14, w13, w12} <- vs1
+        EXTRACT_EGU64x4_WORDS_BE(vs1, w15, w14, UNUSED _unused_w13, w12);
+
+        const uint64_t w16 = ZVK_SHA512_SCHEDULE(w14,  w9, w1, w0);
+        const uint64_t w17 = ZVK_SHA512_SCHEDULE(w15, w10, w2, w1);
+        const uint64_t w18 = ZVK_SHA512_SCHEDULE(w16, w11, w3, w2);
+        const uint64_t w19 = ZVK_SHA512_SCHEDULE(w17, w12, w4, w3);
+
+        // Update the destination register.
+        SET_EGU64x4_BE(vd, w19, w18, w17, w16);;
+      }
+    );
+    break;
+  }
+
+  // 'require_vsha2_common_constraints' ensures that
+  // VSEW is either 32 or 64.
+  default:
+    require(false);
+}
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index 5562c0956d..4ce088f35f 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1368,10 +1368,17 @@ riscv_insn_ext_zvkg= \
 	vghsh_vv \
 	vgmul_vv \
 
+# Covers both Zvknha and Zvknhb.
+riscv_insn_ext_zvknh = \
+	vsha2cl_vv \
+	vsha2ch_vv \
+	vsha2ms_vv \
+
 riscv_insn_ext_zvk = \
 	$(riscv_insn_ext_zvbb) \
 	$(riscv_insn_ext_zvbc) \
 	$(riscv_insn_ext_zvkg) \
+	$(riscv_insn_ext_zvknh) \
 
 riscv_insn_list = \
 	$(if $(HAVE_INT128),$(riscv_insn_ext_v),) \
diff --git a/riscv/zvknh_ext_macros.h b/riscv/zvknh_ext_macros.h
new file mode 100644
index 0000000000..b50818bdae
--- /dev/null
+++ b/riscv/zvknh_ext_macros.h
@@ -0,0 +1,155 @@
+// Helper macros to help implement instructions defined as part of
+// the RISC-V Zvknh[ab] extensions (vector SHA-256/SHA-512 cryptography).
+
+#include "zvk_ext_macros.h"
+
+#ifndef RISCV_ZVKNH_EXT_MACROS_H_
+#define RISCV_ZVKNH_EXT_MACROS_H_
+
+// Constraints common to all vsha* instructions, across all VSEW:
+//  - VSEW is 32 (SHA-256) or 64 (SHA-512)
+//  - No overlap of vd with vs1 or vs2.
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_..._EGU32x4_..._LOOP and VI_..._EGU64x4_..._LOOP
+// macros.
+#define require_vsha2_common_constraints \
+  do { \
+    require(P.VU.vsew == 32 || P.VU.vsew == 64); \
+    require(insn.rd() != insn.rs1()); \
+    require(insn.rd() != insn.rs2()); \
+  } while (false)
+
+// Constraints on vsha2 instructions that must be verified when VSEW==32.
+// Those are *IN ADDITION* to the constraints checked by
+// 'require_vsha2_common_constraints', which is meant to be run earlier.
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
+#define require_vsha2_vsew32_constraints \
+  do { \
+    require_zvknh_256; \
+    require_egw_fits(128); \
+  } while (false)
+
+// Constraints on vsha2 instructions that must be verified when VSEW==64.
+// Those are *IN ADDITION* to the constraints checked by
+// 'require_vsha2_common_constraints', which is meant to be run earlier.
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU64x4_..._LOOP macros.
+#define require_vsha2_vsew64_constraints \
+  do { \
+    require_zvknh_512; \
+    require_egw_fits(256); \
+  } while (false)
+
+//
+// SHA-256 and SHA-512 common logic
+//
+
+// Ch(x, y, z) = (xy) ⊕ (~xz) = xy | ~xz
+#define ZVK_SHA_CH(X, Y, Z) (((X) & (Y)) ^ ((~(X)) & (Z)))
+
+// Maj(x,y,z)  = (xy) ⊕ (xz) ⊕(yz) = xy | xz | yz
+#define ZVK_SHA_MAJ(X, Y, Z) (((X) & (Y)) ^ ((X) & (Z)) ^ ((Y) & (Z)))
+
+//
+// SHA-256
+//
+
+// sum0(x) = ROTR2(x) ⊕ ROTR13(x) ⊕ ROTR22(x)
+#define ZVK_SHA256_SUM0(X) \
+  (ZVK_ROR32(X, 2) ^ ZVK_ROR32(X, 13) ^ ZVK_ROR32(X, 22))
+
+// sum1(x) = ROTR6(x) ⊕ ROTR11(x) ⊕ ROTR25(x)
+#define ZVK_SHA256_SUM1(X) \
+  (ZVK_ROR32(X, 6) ^ ZVK_ROR32(X, 11) ^ ZVK_ROR32(X, 25))
+
+// sig0(x) = ROTR7(x) ⊕ ROTR18(x) ⊕ SHR3 (x)
+#define ZVK_SHA256_SIG0(X) \
+  (ZVK_ROR32(X, 7) ^ ZVK_ROR32(X, 18) ^ ((X) >> 3))
+
+// sig1(x) = ROTR17(x) ⊕ ROTR19(x) ⊕ SHR10(x)
+#define ZVK_SHA256_SIG1(X)  \
+  (ZVK_ROR32(X, 17) ^ ZVK_ROR32(X, 19) ^ ((X) >> 10))
+
+// Given the schedule words W[t+0], W[t+1], W[t+9], W[t+14], computes
+// W[t+16].
+#define ZVK_SHA256_SCHEDULE(W14, W9, W1, W0) \
+    (ZVK_SHA256_SIG1(W14) + (W9) + ZVK_SHA256_SIG0(W1) + (W0))
+
+// Performs one round of compression (out of the 64 rounds), given the state
+// temporaries A,B,C,...,H, and KW, the sum Kt+Wt.
+// Updates A,B,C,...,H to their new values. KW is not modified.
+//
+// Note that some of the logic could be omitted in vsha2c[hl] since
+// some of the variables are dropped in each of those. However removing
+// those unnecessary updates reduces the opportunities to share this single
+// per-round logic and forces us to move further away from how the logic
+// is expressed in FIPS PUB 180-4.
+#define ZVK_SHA256_COMPRESS(A, B, C, D, E, F, G, H, KW) \
+  { \
+    const uint32_t t1 = (H) + ZVK_SHA256_SUM1(E) + \
+                        ZVK_SHA_CH((E), (F), (G)) + (KW); \
+    const uint32_t t2 = ZVK_SHA256_SUM0(A) + ZVK_SHA_MAJ((A), (B), (C)); \
+    (H) = (G); \
+    (G) = (F); \
+    (F) = (E); \
+    (E) = (D) + t1; \
+    (D) = (C); \
+    (C) = (B); \
+    (B) = (A); \
+    (A) = t1 + t2; \
+  }
+
+//
+// SHA-512
+//
+
+// sum0(x) = ROTR28(x) ⊕ ROTR34(x) ⊕ ROTR39(x)
+#define ZVK_SHA512_SUM0(X) \
+  (ZVK_ROR64(X, 28) ^ ZVK_ROR64(X, 34) ^ ZVK_ROR64(X, 39))
+
+// sum1(x) = ROTR14(x) ⊕ ROTR18(x) ⊕ ROTR41(x)
+#define ZVK_SHA512_SUM1(X) \
+  (ZVK_ROR64(X, 14) ^ ZVK_ROR64(X, 18) ^ ZVK_ROR64(X, 41))
+
+// sig0(x) = ROTR1(x) ⊕ ROTR8(x) ⊕ SHR7(x)
+#define ZVK_SHA512_SIG0(X) \
+  (ZVK_ROR64(X, 1) ^ ZVK_ROR64(X, 8) ^ ((X) >> 7))
+
+// sig1(x) = ROTR19(x) ⊕ ROTR61(x) ⊕ SHR6(x)
+#define ZVK_SHA512_SIG1(X) \
+  (ZVK_ROR64(X, 19) ^ ZVK_ROR64(X, 61) ^ ((X) >> 6))
+
+// Given the schedule words W[t+0], W[t+1], W[t+9], W[t+14], computes
+// W[t+16].
+#define ZVK_SHA512_SCHEDULE(W14, W9, W1, W0) \
+    (ZVK_SHA512_SIG1(W14) + (W9) + ZVK_SHA512_SIG0(W1) + (W0))
+
+// Performs one round of compression (out of the 80 rounds), given the state
+// temporaries A,B,C,...,H, and KW, the sum Kt+Wt.
+// Updates A,B,C,...,H to their new values. KW is not modified.
+//
+// Note that some of the logic could be omitted in vsha2c[hl] since
+// some of the variables are dropped in each of those. However removing
+// those unnecessary updates reduces the opportunities to share this single
+// per-round logic and forces us to move further away from how the logic
+// is expressed in FIPS PUB 180-4.
+#define ZVK_SHA512_COMPRESS(A, B, C, D, E, F, G, H, KW) \
+  { \
+    const uint64_t t1 = (H) + ZVK_SHA512_SUM1(E) + \
+                        ZVK_SHA_CH((E), (F), (G)) + (KW); \
+    const uint64_t t2 = ZVK_SHA512_SUM0(A) + ZVK_SHA_MAJ((A), (B), (C)); \
+    (H) = (G); \
+    (G) = (F); \
+    (F) = (E); \
+    (E) = (D) + t1; \
+    (D) = (C); \
+    (C) = (B); \
+    (B) = (A); \
+    (A) = t1 + t2; \
+  }
+
+#endif  // RISCV_ZVKNH_EXT_MACROS_H_

From eadb0e1129c23e709b0565740f0fc1a3359de7b7 Mon Sep 17 00:00:00 2001
From: Eric Gouriou <ego@rivosinc.com>
Date: Thu, 1 Jun 2023 18:07:38 -0700
Subject: [PATCH 082/110] Zvk: Implement Zvkned, vector AES single round

Implement the Zvkned extension, "NIST Suite: Vector AES Encryption
& Decryption (Single Round)".
 - vaeskf1.vi: AES forward key scheduling, AES-128.
 - vaeskf2.vi: AES forward key scheduling, AES-256.
 - vaesz.vs: AES encryption/decryption, 0-th round.
 - vaesdm.{vs,vv}: AES decryption, middle rounds.
 - vaesdf.{vs,vv}: AES decryption, final round.
 - vaesem.{vs,vv}: AES encryption, middle rounds.
 - vaesef.{vs,vv}: AES encryption, final round.

An extension specific header containing common logic is added.

Co-authored-by: Stanislaw Kardach <kda@semihalf.com>
Signed-off-by: Eric Gouriou <ego@rivosinc.com>
---
 riscv/insns/vaesdf_vs.h   |  43 ++++++
 riscv/insns/vaesdf_vv.h   |  37 ++++++
 riscv/insns/vaesdm_vs.h   |  44 +++++++
 riscv/insns/vaesdm_vv.h   |  38 ++++++
 riscv/insns/vaesef_vs.h   |  43 ++++++
 riscv/insns/vaesef_vv.h   |  37 ++++++
 riscv/insns/vaesem_vs.h   |  44 +++++++
 riscv/insns/vaesem_vv.h   |  38 ++++++
 riscv/insns/vaeskf1_vi.h  |  65 +++++++++
 riscv/insns/vaeskf2_vi.h  |  89 +++++++++++++
 riscv/insns/vaesz_vs.h    |  24 ++++
 riscv/riscv.mk.in         |  14 ++
 riscv/zvkned_ext_macros.h | 270 ++++++++++++++++++++++++++++++++++++++
 13 files changed, 786 insertions(+)
 create mode 100644 riscv/insns/vaesdf_vs.h
 create mode 100644 riscv/insns/vaesdf_vv.h
 create mode 100644 riscv/insns/vaesdm_vs.h
 create mode 100644 riscv/insns/vaesdm_vv.h
 create mode 100644 riscv/insns/vaesef_vs.h
 create mode 100644 riscv/insns/vaesef_vv.h
 create mode 100644 riscv/insns/vaesem_vs.h
 create mode 100644 riscv/insns/vaesem_vv.h
 create mode 100644 riscv/insns/vaeskf1_vi.h
 create mode 100644 riscv/insns/vaeskf2_vi.h
 create mode 100644 riscv/insns/vaesz_vs.h
 create mode 100644 riscv/zvkned_ext_macros.h

diff --git a/riscv/insns/vaesdf_vs.h b/riscv/insns/vaesdf_vs.h
new file mode 100644
index 0000000000..a124278477
--- /dev/null
+++ b/riscv/insns/vaesdf_vs.h
@@ -0,0 +1,43 @@
+// vaesdf.vs vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},
+  // This statement will be executed before the first execution
+  // of the loop, and only if the loop is going to be entered.
+  // We cannot use a block ( { ... } ) since we want the variables declared
+  // here to be visible in the loop block.
+  // We capture the "scalar", vs2's first element, by copy, even though
+  // the "no overlap" constraint means that vs2 should remain constant
+  // during the loop.
+  const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+  {
+    // For AES128, AES192, or AES256, state and key are 128b/16B values:
+    //  - vd contains the input state,
+    //  - vs2 contains the round key,
+    //  - vd does receive the output state.
+    //
+    // While the spec calls for handling the vector as made of EGU32x4
+    // element groups (i.e., 4 uint32_t), it is convenient to treat
+    // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+    // we extract the operands here instead of using the existing LOOP
+    // macro that defines/extracts the operand variables as EGU32x4.
+    EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+
+    // InvShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions.
+    VAES_INV_SHIFT_ROWS(aes_state);
+    // InvSubBytes - Apply S-box to every byte in the state
+    VAES_INV_SUB_BYTES(aes_state);
+    // AddRoundKey (which is also InvAddRoundKey as it's xor)
+    EGU8x16_XOREQ(aes_state, scalar_key);
+    // InvMixColumns is not performed in the final round.
+
+    // Update the destination register.
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    EGU8x16_COPY(vd, aes_state);
+  }
+);
diff --git a/riscv/insns/vaesdf_vv.h b/riscv/insns/vaesdf_vv.h
new file mode 100644
index 0000000000..9fca5722fb
--- /dev/null
+++ b/riscv/insns/vaesdf_vv.h
@@ -0,0 +1,37 @@
+// vaesdf.vv vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vv_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},
+  {},  // No PRELOOP.
+  {
+    // For AES128, AES192, or AES256, state and key are 128b/16B values:
+    //  - vd in contains the input state,
+    //  - vs2 contains the input round key,
+    //  - vd out receives the output state.
+    //
+    // While the spec calls for handling the vector as made of EGU32x4
+    // element groups (i.e., 4 uint32_t), it is convenient to treat
+    // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+    // we extract the operands here instead of using the existing LOOP
+    // macro that defines/extracts the operand variables as EGU32x4.
+    EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+    const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg);
+
+    // InvShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions.
+    VAES_INV_SHIFT_ROWS(aes_state);
+    // InvSubBytes - Apply S-box to every byte in the state
+    VAES_INV_SUB_BYTES(aes_state);
+    // AddRoundKey (which is also InvAddRoundKey as it's xor)
+    EGU8x16_XOREQ(aes_state, round_key);
+    // InvMixColumns is not performed in the final round.
+
+    // Update the destination register.
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    EGU8x16_COPY(vd, aes_state);
+  }
+);
diff --git a/riscv/insns/vaesdm_vs.h b/riscv/insns/vaesdm_vs.h
new file mode 100644
index 0000000000..3c23e69e93
--- /dev/null
+++ b/riscv/insns/vaesdm_vs.h
@@ -0,0 +1,44 @@
+// vaesdm.vs vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},
+  // This statement will be executed before the first execution
+  // of the loop, and only if the loop is going to be entered.
+  // We cannot use a block ( { ... } ) since we want the variables declared
+  // here to be visible in the loop block.
+  // We capture the "scalar", vs2's first element, by copy, even though
+  // the "no overlap" constraint means that vs2 should remain constant
+  // during the loop.
+  const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+  {
+    // For AES128, AES192, or AES256, state and key are 128b/16B values:
+    //  - vd in contains the input state,
+    //  - vs2 contains the input round key,
+    //  - vd out receives the output state.
+    //
+    // While the spec calls for handling the vector as made of EGU32x4
+    // element groups (i.e., 4 uint32_t), it is convenient to treat
+    // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+    // we extract the operands here instead of using the existing LOOP
+    // macro that defines/extracts the operand variables as EGU32x4.
+    EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+
+    // InvShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions.
+    VAES_INV_SHIFT_ROWS(aes_state);
+    // InvSubBytes - Apply S-box to every byte in the state
+    VAES_INV_SUB_BYTES(aes_state);
+    // AddRoundKey (which is also InvAddRoundKey as it's xor)
+    EGU8x16_XOREQ(aes_state, scalar_key);
+    // InvMixColumns
+    VAES_INV_MIX_COLUMNS(aes_state);
+
+    // Update the destination register.
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    EGU8x16_COPY(vd, aes_state);
+  }
+);
diff --git a/riscv/insns/vaesdm_vv.h b/riscv/insns/vaesdm_vv.h
new file mode 100644
index 0000000000..9c29cd965e
--- /dev/null
+++ b/riscv/insns/vaesdm_vv.h
@@ -0,0 +1,38 @@
+// vaesdm.vv vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vv_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},
+  {},  // No PRELOOP.
+  {
+    // For AES128, AES192, or AES256, state and key are 128b/16B values:
+    //  - vd contains the input state,
+    //  - vs2 contains the round key,
+    //  - vd does receive the output state.
+    //
+    // While the spec calls for handling the vector as made of EGU32x4
+    // element groups (i.e., 4 uint32_t), it is convenient to treat
+    // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+    // we extract the operands here instead of using the existing LOOP
+    // macro that defines/extracts the operand variables as EGU32x4.
+    EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+    const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg);
+
+    // InvShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions.
+    VAES_INV_SHIFT_ROWS(aes_state);
+    // InvSubBytes - Apply S-box to every byte in the state
+    VAES_INV_SUB_BYTES(aes_state);
+    // AddRoundKey (which is also InvAddRoundKey as it's xor)
+    EGU8x16_XOREQ(aes_state, round_key);
+    // InvMixColumns
+    VAES_INV_MIX_COLUMNS(aes_state);
+
+    // Update the destination register.
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    EGU8x16_COPY(vd, aes_state);
+  }
+);
diff --git a/riscv/insns/vaesef_vs.h b/riscv/insns/vaesef_vs.h
new file mode 100644
index 0000000000..2d32653345
--- /dev/null
+++ b/riscv/insns/vaesef_vs.h
@@ -0,0 +1,43 @@
+// vaesef.vs vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},
+  // This statement will be executed before the first execution
+  // of the loop, and only if the loop is going to be entered.
+  // We cannot use a block ( { ... } ) since we want the variables declared
+  // here to be visible in the loop block.
+  // We capture the "scalar", vs2's first element, by copy, even though
+  // the "no overlap" constraint means that vs2 should remain constant
+  // during the loop.
+  const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+  {
+    // For AES128, AES192, or AES256, state and key are 128b/16B values:
+    //  - vd contains the input state,
+    //  - vs2 contains the round key,
+    //  - vd receives the output state.
+    //
+    // While the spec calls for handling the vector as made of EGU32x4
+    // element groups (i.e., 4 uint32_t), it is convenient to treat
+    // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+    // we extract the operands here instead of using the existing LOOP
+    // macro that defines/extracts the operand variables as EGU32x4.
+    EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+
+    // SubBytes - Apply S-box to every byte in the state
+    VAES_SUB_BYTES(aes_state);
+    // ShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions.
+    VAES_SHIFT_ROWS(aes_state);
+    // MixColumns is not performed for the final round.
+    // AddRoundKey
+    EGU8x16_XOREQ(aes_state, scalar_key);
+
+    // Update the destination register.
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    EGU8x16_COPY(vd, aes_state);
+  }
+);
diff --git a/riscv/insns/vaesef_vv.h b/riscv/insns/vaesef_vv.h
new file mode 100644
index 0000000000..9b43a6d213
--- /dev/null
+++ b/riscv/insns/vaesef_vv.h
@@ -0,0 +1,37 @@
+// vaesef.vv vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vv_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},
+  {},  // No PRELOOP.
+  {
+    // For AES128, AES192, or AES256, state and key are 128b/16B values:
+    //  - vd contains the input state,
+    //  - vs2 contains the round key,
+    //  - vd receives the output state.
+    //
+    // While the spec calls for handling the vector as made of EGU32x4
+    // element groups (i.e., 4 uint32_t), it is convenient to treat
+    // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+    // we extract the operands here instead of using the existing LOOP
+    // macro that defines/extracts the operand variables as EGU32x4.
+    EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+    const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg);
+
+    // SubBytes - Apply S-box to every byte in the state
+    VAES_SUB_BYTES(aes_state);
+    // ShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions.
+    VAES_SHIFT_ROWS(aes_state);
+    // MixColumns is not performed for the final round.
+    // AddRoundKey
+    EGU8x16_XOREQ(aes_state, round_key);
+
+    // Update the destination register.
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    EGU8x16_COPY(vd, aes_state);
+  }
+);
diff --git a/riscv/insns/vaesem_vs.h b/riscv/insns/vaesem_vs.h
new file mode 100644
index 0000000000..348cd9f83f
--- /dev/null
+++ b/riscv/insns/vaesem_vs.h
@@ -0,0 +1,44 @@
+// vaesem.vs vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},
+  // This statement will be executed before the first execution
+  // of the loop, and only if the loop is going to be entered.
+  // We cannot use a block ( { ... } ) since we want the variables declared
+  // here to be visible in the loop block.
+  // We capture the "scalar", vs2's first element, by copy, even though
+  // the "no overlap" constraint means that vs2 should remain constant
+  // during the loop.
+  const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+  {
+    // For AES128, AES192, or AES256, state and key are 128b/16B values:
+    //  - vd contains the input state,
+    //  - vs2 contains the round key,
+    //  - vd receives the output state.
+    //
+    // While the spec calls for handling the vector as made of EGU32x4
+    // element groups (i.e., 4 uint32_t), it is convenient to treat
+    // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+    // we extract the operands here instead of using the existing LOOP
+    // macro that defines/extracts the operand variables as EGU32x4.
+    EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+
+    // SubBytes - Apply S-box to every byte in the state
+    VAES_SUB_BYTES(aes_state);
+    // ShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions.
+    VAES_SHIFT_ROWS(aes_state);
+    // MixColumns
+    VAES_MIX_COLUMNS(aes_state);
+    // AddRoundKey
+    EGU8x16_XOREQ(aes_state, scalar_key);
+
+    // Update the destination register.
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    EGU8x16_COPY(vd, aes_state);
+  }
+);
diff --git a/riscv/insns/vaesem_vv.h b/riscv/insns/vaesem_vv.h
new file mode 100644
index 0000000000..34f0056590
--- /dev/null
+++ b/riscv/insns/vaesem_vv.h
@@ -0,0 +1,38 @@
+// vaesem.vv vd, vs2
+
+#include "zvkned_ext_macros.h"
+#include "zvk_ext_macros.h"
+
+require_vaes_vv_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},
+  {},  // No PRELOOP.
+  {
+    // For AES128, AES192, or AES256, state and key are 128b/16B values:
+    //  - vd contains the input state,
+    //  - vs2 contains the round key,
+    //  - vd receives the output state.
+    //
+    // While the spec calls for handling the vector as made of EGU32x4
+    // element groups (i.e., 4 uint32_t), it is convenient to treat
+    // AES state and key as EGU8x16 (i.e., 16 uint8_t). This is why
+    // we extract the operands here instead of using the existing LOOP
+    // macro that defines/extracts the operand variables as EGU32x4.
+    EGU8x16_t aes_state = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg);
+    const EGU8x16_t round_key = P.VU.elt_group<EGU8x16_t>(vs2_num, idx_eg);
+
+    // SubBytes - Apply S-box to every byte in the state
+    VAES_SUB_BYTES(aes_state);
+    // ShiftRows - Rotate each row bytes by 0, 1, 2, 3 positions.
+    VAES_SHIFT_ROWS(aes_state);
+    // MixColumns
+    VAES_MIX_COLUMNS(aes_state);
+    // AddRoundKey
+    EGU8x16_XOREQ(aes_state, round_key);
+
+    // Update the destination register.
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    EGU8x16_COPY(vd, aes_state);
+  }
+);
diff --git a/riscv/insns/vaeskf1_vi.h b/riscv/insns/vaeskf1_vi.h
new file mode 100644
index 0000000000..28d03d03b1
--- /dev/null
+++ b/riscv/insns/vaeskf1_vi.h
@@ -0,0 +1,65 @@
+// vaeskf1.vi vd, vs2, rnd
+
+#include "zvk_ext_macros.h"
+#include "zvkned_ext_macros.h"
+
+require_vaeskf_vi_constraints;
+
+// There is one round constant for each round number
+// between 1 and 10. We index using 'round# -1'.
+static constexpr uint8_t kRoundConstants[10] = {
+  0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
+};
+
+// For AES128, AES192, or AES256, keys (and state) are handled as
+// 128b/16B values.
+//
+// The Zvkned spec calls for handling the vector as made of EGU32x4
+// element groups (i.e., 4 uint32_t), and FIPS-197 AES specification
+// describes the key expansion in terms of manipulations of 32 bit
+// words, so using the EGU32x4 is natural.
+//
+VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(
+  {},
+  // The following statements will be executed before the first execution
+  // of the loop, and only if the loop is going to be entered.
+  // We cannot use a block ( { ... } ) since we want the 'round' variable
+  // declared and defined here to be visible in the loop block.
+  // Only consider the bottom 4 bits of the immediate.
+  const reg_t zimm4 = zimm5 & 0xF;
+  // Normalize the round value to be in [1, 10] by toggling bit 3
+  // if outside the range (i.e., +8 or -8).
+  const reg_t round = ((1 <= zimm4) && (zimm4 <= 10)) ? zimm4 : (zimm4 ^ 0x8);
+  const uint32_t rcon = kRoundConstants[round - 1];,
+  // Per Element Group body.
+  {
+    // vaeskf1_vi produces key[i+1] in vd, it receives key[i] in vs2,
+    // i.e., 4x32b values (4 words).
+    //
+    // The logic is fairly similar between vaeskf1/vaeskf2, with the following
+    // differences:
+    // - in AES-128 (vaeskf1), we get both the 'temp' word and
+    //   the "previous words" w0..w3 from key[i]/vs2.
+    // - in AES-256 (vaeskf2), we get 'temp' from key[i]/vs2, and
+    //   the "previous words" w0..w3 from key[i-1]/vd.
+
+    // 'temp' is extracted from the last (most significant) word of key[i].
+    uint32_t temp = vs2[3];
+    temp = (temp >> 8) | (temp << 24);  // Rotate right by 8
+    temp = (((uint32_t)AES_ENC_SBOX[(temp >> 24) & 0xFF] << 24) |
+            ((uint32_t)AES_ENC_SBOX[(temp >> 16) & 0xFF] << 16) |
+            ((uint32_t)AES_ENC_SBOX[(temp >>  8) & 0xFF] <<  8) |
+            ((uint32_t)AES_ENC_SBOX[(temp >>  0) & 0xFF] <<  0));
+    temp = temp ^ rcon;
+
+    // "old" words are the w[i-Nk] of FIPS-197. They are extracted
+    // from vs2, which contains key[i] in AES-128 where Nk=4.
+    const uint32_t w0 = vs2[0] ^ temp;
+    const uint32_t w1 = vs2[1] ^ w0;
+    const uint32_t w2 = vs2[2] ^ w1;
+    const uint32_t w3 = vs2[3] ^ w2;
+
+    // Overwrite vd with k[i+1] from the new words.
+    SET_EGU32x4_LE(vd, w0, w1, w2, w3);
+  }
+);
diff --git a/riscv/insns/vaeskf2_vi.h b/riscv/insns/vaeskf2_vi.h
new file mode 100644
index 0000000000..49c2a2db02
--- /dev/null
+++ b/riscv/insns/vaeskf2_vi.h
@@ -0,0 +1,89 @@
+// vaeskf2.vi vd, vs2, rnd
+
+#include "zvk_ext_macros.h"
+#include "zvkned_ext_macros.h"
+
+require_vaeskf_vi_constraints;
+
+// Round Constants
+//
+// Only the even rounds need to be encoded, the odd ones can use 0
+// or skip the rcon handling. We can use '(round# / 2) - 1'
+// (or "(round# >> 1) - 1") to index into the array.
+//
+// Round#  Constant
+//  [ 2]  -> kRoundConstants[0]
+//  [ 3]  -> 0 / Nothing
+//  [ 4]  -> kRoundConstants[1]
+//  [ 5]  -> 0 / Nothing
+//  [ 6]  -> kRoundConstants[2]
+//  [ 7]  -> 0 / Nothing
+// ...
+//  [13]  -> 0 / Nothing
+//  [14]  -> kRoundConstants[6]
+static constexpr uint8_t kRoundConstants[7] = {
+  0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40,
+};
+
+// For AES128, AES192, or AES256, keys (and state) are handled as
+// 128b/16B values.
+//
+// The Zvkned spec calls for handling the vector as made of EGU32x4
+// element groups (i.e., 4 uint32_t), and FIPS-197 AES specification
+// describes the key expansion in terms of manipulations of 32 bit
+// words, so using the EGU32x4 is natural.
+//
+VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(
+  {},
+  // The following statements will be executed before the first execution
+  // of the loop, and only if the loop is going to be entered.
+  // We cannot use a block ( { ... } ) since we want the 'round' variable
+  // declared and defined here to be visible in the loop block.
+  // Only consider the bottom 4 bits of the immediate.
+  const reg_t zimm4 = zimm5 & 0xF;
+  // Normalize the round value to be in [2, 14] by toggling bit 3
+  // if outside the range (i.e., +8 or -8).
+  const reg_t round = ((2 <= zimm4) && (zimm4 <= 14)) ? zimm4 : (zimm4 ^ 0x8);,
+  // Per Element Group body.
+  {
+    // vaeskf2_vi produces key[i+1] in vd, it receives key[i] in vs2,
+    // i.e., 4x32b values (4 words).
+    //
+    // The logic is fairly similar between vaeskf1/vaeskf2, with the following
+    // differences:
+    // - in AES-128 (vaeskf1), we get both the 'temp' word and
+    //   the "previous words" w0..w3 from key[i]/vs2.
+    // - in AES-256 (vaeskf2), we get 'temp' from key[i]/vs2, and
+    //   the "previous words" w0..w3 from key[i-1]/vd.
+
+    // 'temp' is extracted from the last (most significant) word of key[i].
+    uint32_t temp = vs2[3];
+    // With AES-256, when we have an odd round number, we hit the
+    //       Nk > 6 and i mod Nk = 4
+    // condition in the FIPS-197 key expansion pseudo-code (Figure 11).
+    // In those cases we skip RotWord and the round constant is 0.
+    const bool is_even_round = (round & 0x1) == 0;
+    if (is_even_round) {
+      temp = (temp >> 8) | (temp << 24);  // Rotate right by 8
+    }
+    temp = (((uint32_t)AES_ENC_SBOX[(temp >> 24) & 0xFF] << 24) |
+            ((uint32_t)AES_ENC_SBOX[(temp >> 16) & 0xFF] << 16) |
+            ((uint32_t)AES_ENC_SBOX[(temp >>  8) & 0xFF] <<  8) |
+            ((uint32_t)AES_ENC_SBOX[(temp >>  0) & 0xFF] <<  0));
+
+    if (is_even_round) {
+      const uint32_t rcon = kRoundConstants[(round >> 1) - 1];
+      temp = temp ^ rcon;
+    }
+
+    // "old" words are the w[i-Nk] of FIPS-197. For AES-256, where Nk=8,
+    // they are extracted from vd which contains key[i-1].
+    const uint32_t w0 = vd[0] ^ temp;
+    const uint32_t w1 = vd[1] ^ w0;
+    const uint32_t w2 = vd[2] ^ w1;
+    const uint32_t w3 = vd[3] ^ w2;
+
+    // Overwrite vd with k[i+1] from the new words.
+    SET_EGU32x4_LE(vd, w0, w1, w2, w3);
+  }
+);
diff --git a/riscv/insns/vaesz_vs.h b/riscv/insns/vaesz_vs.h
new file mode 100644
index 0000000000..c3dc931c93
--- /dev/null
+++ b/riscv/insns/vaesz_vs.h
@@ -0,0 +1,24 @@
+// vaesz.vs vd, vs2
+
+#include "zvk_ext_macros.h"
+#include "zvkned_ext_macros.h"
+
+require_vaes_vs_constraints;
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},
+  // This statement will be executed before the first execution
+  // of the loop, and only if the loop is going to be entered.
+  // We cannot use a block ( { ... } ) since we want the variables declared
+  // here to be visible in the loop block.
+  // We capture the "scalar", vs2's first element, by copy, even though
+  // the "no overlap" constraint means that vs2 should remain constant
+  // during the loop.
+  const EGU8x16_t scalar_key = P.VU.elt_group<EGU8x16_t>(vs2_num, 0);,
+  // Per Element Group body.
+  {
+    EGU8x16_t &vd = P.VU.elt_group<EGU8x16_t>(vd_num, idx_eg, true);
+    // Produce vd = vd ^ "common key from vs2".
+    EGU8x16_XOR(vd, vd, scalar_key);
+  }
+);
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index 4ce088f35f..2d75662101 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1368,6 +1368,19 @@ riscv_insn_ext_zvkg= \
 	vghsh_vv \
 	vgmul_vv \
 
+riscv_insn_ext_zvkned = \
+	vaesdf_vs \
+	vaesdf_vv \
+	vaesdm_vs \
+	vaesdm_vv \
+	vaesef_vs \
+	vaesef_vv \
+	vaesem_vs \
+	vaesem_vv \
+	vaeskf1_vi \
+	vaeskf2_vi \
+	vaesz_vs \
+
 # Covers both Zvknha and Zvkhnb.
 riscv_insn_ext_zvknh = \
 	vsha2cl_vv \
@@ -1378,6 +1391,7 @@ riscv_insn_ext_zvk = \
 	$(riscv_insn_ext_zvbb) \
 	$(riscv_insn_ext_zvbc) \
 	$(riscv_insn_ext_zvkg) \
+	$(riscv_insn_ext_zvkned) \
 	$(riscv_insn_ext_zvknh) \
 
 riscv_insn_list = \
diff --git a/riscv/zvkned_ext_macros.h b/riscv/zvkned_ext_macros.h
new file mode 100644
index 0000000000..db705c71e5
--- /dev/null
+++ b/riscv/zvkned_ext_macros.h
@@ -0,0 +1,270 @@
+// Helper macros to help implement instructions defined as part of
+// the RISC-V Zvkned extension (vector AES single round).
+
+#include "insns/aes_common.h"
+
+#ifndef RISCV_ZVKNED_EXT_MACROS_H_
+#define RISCV_ZVKNED_EXT_MACROS_H_
+
+// vaes*.vs instruction constraints:
+//  - Zvkned is enabled and VSEW == 32
+//  - EGW (128) <= LMUL * VLEN
+//  - vd and vs2 cannot overlap
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
+#define require_vaes_vs_constraints \
+  do { \
+    require_zvkned; \
+    require(P.VU.vsew == 32); \
+    require_egw_fits(128); \
+    require(insn.rd() != insn.rs2()); \
+  } while (false)
+
+// vaes*.vv instruction constraints. Those are the same as the .vs ones,
+// except for the overlap constraint that is not present for .vv variants.
+//  - Zvkned is enabled and VSEW == 32
+//  - EGW (128) <= LMUL * VLEN
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
+#define require_vaes_vv_constraints \
+  do { \
+    require_zvkned; \
+    require(P.VU.vsew == 32); \
+    require_egw_fits(128); \
+  } while (false)
+
+// vaeskf*.vi instruction constraints. Those are the same as the .vv ones.
+#define require_vaeskf_vi_constraints \
+  do { \
+    require_zvkned; \
+    require(P.VU.vsew == 32); \
+    require_egw_fits(128); \
+  } while (false)
+
+#define VAES_XTIME(A) (((A) << 1) ^ (((A) & 0x80) ? 0x1b : 0))
+
+#define VAES_GFMUL(A, B) \
+  ((((B) & 0x1) ?                                  (A)  : 0) ^ \
+   (((B) & 0x2) ?                         VAES_XTIME(A) : 0) ^ \
+   (((B) & 0x4) ?             VAES_XTIME(VAES_XTIME(A)) : 0) ^ \
+   (((B) & 0x8) ? VAES_XTIME(VAES_XTIME(VAES_XTIME(A))) : 0))
+
+// Apply the S-box transform to every byte in the VAESState 'state'
+#define VAES_SUB_BYTES(STATE) \
+  do { \
+    static constexpr uint8_t kVAESXEncSBox[256]= { \
+      0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, \
+      0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, \
+      0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, \
+      0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, \
+      0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, \
+      0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, \
+      0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, \
+      0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, \
+      0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, \
+      0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, \
+      0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, \
+      0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, \
+      0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, \
+      0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, \
+      0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, \
+      0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, \
+      0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, \
+      0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, \
+      0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, \
+      0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, \
+      0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, \
+      0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, \
+      0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, \
+      0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, \
+      0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, \
+      0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, \
+      0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, \
+      0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, \
+      0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, \
+      0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, \
+      0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, \
+      0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16, \
+    }; \
+    for (uint8_t& byte : (STATE)) { \
+      byte = kVAESXEncSBox[byte]; \
+     } \
+  } while (0)
+
+// Applies the S-box inverse (decode) transform to every byte
+// in the VAESState 'state'.
+#define VAES_INV_SUB_BYTES(STATE) \
+  do { \
+    static constexpr uint8_t kVAESXDecSBox[256] = { \
+      0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, \
+      0xBF, 0x40, 0xA3, 0x9E, 0x81, 0xF3, 0xD7, 0xFB, \
+      0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, \
+      0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB, \
+      0x54, 0x7B, 0x94, 0x32, 0xA6, 0xC2, 0x23, 0x3D, \
+      0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E, \
+      0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, \
+      0x76, 0x5B, 0xA2, 0x49, 0x6D, 0x8B, 0xD1, 0x25, \
+      0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, \
+      0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92, \
+      0x6C, 0x70, 0x48, 0x50, 0xFD, 0xED, 0xB9, 0xDA, \
+      0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84, \
+      0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, \
+      0xF7, 0xE4, 0x58, 0x05, 0xB8, 0xB3, 0x45, 0x06, \
+      0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, \
+      0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B, \
+      0x3A, 0x91, 0x11, 0x41, 0x4F, 0x67, 0xDC, 0xEA, \
+      0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73, \
+      0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, \
+      0xE2, 0xF9, 0x37, 0xE8, 0x1C, 0x75, 0xDF, 0x6E, \
+      0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, \
+      0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B, \
+      0xFC, 0x56, 0x3E, 0x4B, 0xC6, 0xD2, 0x79, 0x20, \
+      0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4, \
+      0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, \
+      0xB1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xEC, 0x5F, \
+      0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, \
+      0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF, \
+      0xA0, 0xE0, 0x3B, 0x4D, 0xAE, 0x2A, 0xF5, 0xB0, \
+      0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61, \
+      0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, \
+      0xE1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0C, 0x7D, \
+    }; \
+    for (uint8_t &byte : (STATE)) { \
+      byte = kVAESXDecSBox[byte]; \
+    } \
+  } while (0)
+
+// Shift the state rows, as specified in ShiftRows.
+//  'STATE' is a VAESState value.
+#define VAES_SHIFT_ROWS(STATE) \
+  do { \
+    uint8_t temp; \
+    /* Row 0 (byte indices 0, 4, 8, 12) does not rotate. */ \
+    /* Row 1 (byte indices 1, 5, 9, 13) rotates left by 1 position. */ \
+    temp = (STATE)[1]; \
+    (STATE)[ 1] = (STATE)[ 5]; \
+    (STATE)[ 5] = (STATE)[ 9]; \
+    (STATE)[ 9] = (STATE)[13]; \
+    (STATE)[13] = temp; \
+    /* Row 2 (byte indices 2, 6, 10, 14) rotates by 2 positions. */ \
+    temp = (STATE)[2]; \
+    (STATE)[ 2] = (STATE)[10]; \
+    (STATE)[10] = temp; \
+    temp = (STATE)[6]; \
+    (STATE)[ 6] = (STATE)[14]; \
+    (STATE)[14] = temp; \
+    /* Row 3 (byte indices 3, 7, 11, 15) rotates left by 3 (or right by 1). */ \
+    temp = (STATE)[3]; \
+    (STATE)[ 3] = (STATE)[15]; \
+    (STATE)[15] = (STATE)[11]; \
+    (STATE)[11] = (STATE)[ 7]; \
+    (STATE)[ 7] = temp; \
+  } while (0)
+
+// Shifts the state rows, as specified in InvShiftRows.
+// 'STATE' is a VAESState value.
+#define VAES_INV_SHIFT_ROWS(STATE) \
+  do { \
+    uint8_t temp; \
+    /* Row 0 (byte indices 0, 4, 8, 12) does not rotate. */ \
+    /* Row 1 (byte indices 1, 5, 9, 13) rotates right by 1 position. */ \
+    temp = (STATE)[1]; \
+    (STATE)[ 1] = (STATE)[13]; \
+    (STATE)[13] = (STATE)[ 9]; \
+    (STATE)[ 9] = (STATE)[ 5]; \
+    (STATE)[ 5] = temp; \
+    /* Row 2 (byte indices 2, 6, 10, 14) rotates by 2 positions. */ \
+    temp = (STATE)[2]; \
+    (STATE)[ 2] = (STATE)[10]; \
+    (STATE)[10] = temp; \
+    temp = (STATE)[6]; \
+    (STATE)[ 6] = (STATE)[14]; \
+    (STATE)[14] = temp; \
+    /* Row 3 (byte indices 3, 7, 11, 15) rotates right by 3 (or left by 1). */ \
+    temp = (STATE)[3]; \
+    (STATE)[ 3] = (STATE)[ 7]; \
+    (STATE)[ 7] = (STATE)[11]; \
+    (STATE)[11] = (STATE)[15]; \
+    (STATE)[15] = temp; \
+  } while (0)
+
+// Implements the function producing one byte, one-fourth of the column
+// transformation MixColumns() specified in FIPS-197 5.1.3 .
+//
+// The arguments are all bytes (i.e., uint8_t). The function implemented
+// is
+//   F(A, B, C, D) = (2 . A) xor (3 . B) xor C xor D
+// where '.' denotes the Galois Field multiplication over 2**8.
+//
+#define VAES_MIX_COLUMN_BYTE(A, B, C, D) \
+  (VAES_GFMUL((A), 0x2) ^ VAES_GFMUL((B), 0x3) ^ (C) ^ (D))
+
+// Implements the function producing one byte, one-fourth of the column
+// transformation InvMixColumns() specified in FIPS-197 5.3.3 .
+//
+// The arguments are all bytes (i.e., uint8_t). The function implemented
+// is
+//   F(A, B, C, D) = (0xE . A) xor (0xB . B) xor (0xD . C) xor (0x9 . D)
+// where '.' denotes the Galois Field multiplication over 2**8.
+//
+#define VAES_INV_MIX_COLUMN_BYTE(A, B, C, D) \
+  (VAES_GFMUL((A), 0xE) ^ \
+   VAES_GFMUL((B), 0xB) ^ \
+   VAES_GFMUL((C), 0xD) ^ \
+   VAES_GFMUL((D), 0x9))
+
+// Mixes, in place, the 4 bytes of column COL_IDX of STATE, applying
+// VAES_MIX_COLUMN_BYTE to each rotation of the column's bytes.
+#define VAES_MIX_COLUMN(STATE, COL_IDX) \
+  do { \
+    uint8_t *column = &(STATE)[(COL_IDX) * 4]; \
+    /* Extract the bytes, before we start overwriting them */ \
+    const uint8_t b0 = column[0]; \
+    const uint8_t b1 = column[1]; \
+    const uint8_t b2 = column[2]; \
+    const uint8_t b3 = column[3]; \
+    /* Every iteration rotates the byte indices by 1 */ \
+    column[0] = VAES_MIX_COLUMN_BYTE(b0, b1, b2, b3); \
+    column[1] = VAES_MIX_COLUMN_BYTE(b1, b2, b3, b0); \
+    column[2] = VAES_MIX_COLUMN_BYTE(b2, b3, b0, b1); \
+    column[3] = VAES_MIX_COLUMN_BYTE(b3, b0, b1, b2); \
+  } while (0)
+
+// Inverse-mixes, in place, the 4 bytes of column COL_IDX of STATE, applying
+// VAES_INV_MIX_COLUMN_BYTE to each rotation of the column's bytes.
+#define VAES_INV_MIX_COLUMN(STATE, COL_IDX) \
+  do { \
+    uint8_t *column = &(STATE)[(COL_IDX) * 4]; \
+    /* Extract the bytes, before we start overwriting them */ \
+    const uint8_t b0 = column[0]; \
+    const uint8_t b1 = column[1]; \
+    const uint8_t b2 = column[2]; \
+    const uint8_t b3 = column[3]; \
+    /* Every iteration rotates the byte indices by 1 */ \
+    column[0] = VAES_INV_MIX_COLUMN_BYTE(b0, b1, b2, b3); \
+    column[1] = VAES_INV_MIX_COLUMN_BYTE(b1, b2, b3, b0); \
+    column[2] = VAES_INV_MIX_COLUMN_BYTE(b2, b3, b0, b1); \
+    column[3] = VAES_INV_MIX_COLUMN_BYTE(b3, b0, b1, b2); \
+  } while (0)
+
+// Implements MixColumns as defined in FIPS-197 5.1.3.
+#define VAES_MIX_COLUMNS(STATE) \
+  do { \
+    VAES_MIX_COLUMN((STATE), 0); \
+    VAES_MIX_COLUMN((STATE), 1); \
+    VAES_MIX_COLUMN((STATE), 2); \
+    VAES_MIX_COLUMN((STATE), 3); \
+  } while (0)
+
+// Implements InvMixColumns as defined in FIPS-197 5.3.3.
+#define VAES_INV_MIX_COLUMNS(STATE) \
+  do { \
+    VAES_INV_MIX_COLUMN((STATE), 0); \
+    VAES_INV_MIX_COLUMN((STATE), 1); \
+    VAES_INV_MIX_COLUMN((STATE), 2); \
+    VAES_INV_MIX_COLUMN((STATE), 3); \
+  } while (0)
+
+#endif  // RISCV_ZVKNED_EXT_MACROS_H_

From cbb2b1a224d8922c6d3146da56f5087a3858ced5 Mon Sep 17 00:00:00 2001
From: Eric Gouriou <ego@rivosinc.com>
Date: Thu, 1 Jun 2023 18:07:53 -0700
Subject: [PATCH 083/110] Zvk: Implement Zvksed, vector SM4 Block Cipher
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement the Zvksed sub-extension, "ShangMi Suite: SM4 Block Cipher":
 - vsm4k.vi, vector SM4 key expansion,
 - vsm4r.{vs,vv}, vector SM4 rounds.

This also introduces a header for common vector SM4 logic.

Co-authored-by: Raghav Gupta <rgupta@rivosinc.com>
Co-authored-by: Albert Jakieła <aja@semihalf.com>
Signed-off-by: Eric Gouriou <ego@rivosinc.com>
---
 riscv/insns/sm4_common.h  |  1 -
 riscv/insns/vsm4k_vi.h    | 52 +++++++++++++++++++++++++++++++++
 riscv/insns/vsm4r_vs.h    | 51 +++++++++++++++++++++++++++++++++
 riscv/insns/vsm4r_vv.h    | 37 ++++++++++++++++++++++++
 riscv/riscv.mk.in         |  6 ++++
 riscv/zvksed_ext_macros.h | 60 +++++++++++++++++++++++++++++++++++++++
 6 files changed, 206 insertions(+), 1 deletion(-)
 create mode 100644 riscv/insns/vsm4k_vi.h
 create mode 100644 riscv/insns/vsm4r_vs.h
 create mode 100644 riscv/insns/vsm4r_vv.h
 create mode 100644 riscv/zvksed_ext_macros.h

diff --git a/riscv/insns/sm4_common.h b/riscv/insns/sm4_common.h
index 17f129f0ad..24d6ce1d05 100644
--- a/riscv/insns/sm4_common.h
+++ b/riscv/insns/sm4_common.h
@@ -24,4 +24,3 @@ static const uint8_t sm4_sbox[256] = {
 	0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E,
 	0xD7, 0xCB, 0x39, 0x48
 };
-
diff --git a/riscv/insns/vsm4k_vi.h b/riscv/insns/vsm4k_vi.h
new file mode 100644
index 0000000000..8f52e68199
--- /dev/null
+++ b/riscv/insns/vsm4k_vi.h
@@ -0,0 +1,52 @@
+// vsm4k.vi vd, vs2, round#
+
+#include "zvksed_ext_macros.h"
+
+// SM4 Constant Key (CK) - section 7.3.2. of the IETF draft.
+static constexpr uint32_t zvksed_ck[32] = {
+  0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269,
+  0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9,
+  0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249,
+  0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9,
+  0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229,
+  0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299,
+  0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209,
+  0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+};
+
+require_vsm4_constraints;
+
+VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(
+  {},
+  // The following statements will be executed before the first execution
+  // of the loop, and only if the loop is going to be entered.
+  // We cannot use a block ( { ... } ) since we want the 'round' variable
+  // declared and defined here to be visible in the loop block.
+  // Only consider the bottom 3 bits of the immediate, ensuring that
+  // 'round' is in the valid range [0, 7].
+  const reg_t round = zimm5 & 0x7;,
+  // Per Element Group body.
+  {
+    // {rk0, rk1, rk2, rk3} <- vs2
+    EXTRACT_EGU32x4_WORDS_LE(vs2, rk0, rk1, rk2, rk3);
+
+    uint32_t B = rk1 ^ rk2 ^ rk3 ^ zvksed_ck[4 * round];
+    uint32_t S = ZVKSED_SUB_BYTES(B);
+    uint32_t rk4 = ZVKSED_ROUND_KEY(rk0, S);
+
+    B = rk2 ^ rk3 ^ rk4 ^ zvksed_ck[4 * round + 1];
+    S = ZVKSED_SUB_BYTES(B);
+    uint32_t rk5 = ZVKSED_ROUND_KEY(rk1, S);
+
+    B = rk3 ^ rk4 ^ rk5 ^ zvksed_ck[4 * round + 2];
+    S = ZVKSED_SUB_BYTES(B);
+    uint32_t rk6 = ZVKSED_ROUND_KEY(rk2, S);
+
+    B = rk4 ^ rk5 ^ rk6 ^ zvksed_ck[4 * round + 3];
+    S = ZVKSED_SUB_BYTES(B);
+    uint32_t rk7 = ZVKSED_ROUND_KEY(rk3, S);
+
+    // Update the destination register.
+    SET_EGU32x4_LE(vd, rk4, rk5, rk6, rk7);
+  }
+);
diff --git a/riscv/insns/vsm4r_vs.h b/riscv/insns/vsm4r_vs.h
new file mode 100644
index 0000000000..44011eb544
--- /dev/null
+++ b/riscv/insns/vsm4r_vs.h
@@ -0,0 +1,51 @@
+// vsm4r.vs vd, vs2
+
+#include "zvksed_ext_macros.h"
+
+require_vsm4_constraints;
+// No overlap of vd and vs2.
+require(insn.rd() != insn.rs2());
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},
+  // This statement will be executed before the first execution
+  // of the loop, and only if the loop is going to be entered.
+  // We cannot use a block ( { ... } ) since we want the variables declared
+  // here to be visible in the loop block.
+  // We capture the "scalar", vs2's first element, by copy, even though
+  // the "no overlap" constraint means that vs2 should remain constant
+  // during the loop.
+  const EGU32x4_t scalar_key = P.VU.elt_group<EGU32x4_t>(vs2_num, 0);
+  const uint32_t rk0 = scalar_key[0];
+  const uint32_t rk1 = scalar_key[1];
+  const uint32_t rk2 = scalar_key[2];
+  const uint32_t rk3 = scalar_key[3];,
+  {
+    EGU32x4_t &state = P.VU.elt_group<EGU32x4_t>(vd_num, idx_eg, true);
+
+    // {x0, x1,x2, x3} <- vd
+    EXTRACT_EGU32x4_WORDS_LE(state, x0, x1, x2, x3);
+
+    uint32_t B;
+    uint32_t S;
+
+    B = x1 ^ x2 ^ x3 ^ rk0;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x4 = ZVKSED_ROUND(x0, S);
+
+    B = x2 ^ x3 ^ x4 ^ rk1;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x5 = ZVKSED_ROUND(x1, S);
+
+    B = x3 ^ x4 ^ x5 ^ rk2;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x6 = ZVKSED_ROUND(x2, S);
+
+    B = x4 ^ x5 ^ x6 ^ rk3;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x7 = ZVKSED_ROUND(x3, S);
+
+    // Update the destination register.
+    SET_EGU32x4_LE(state, x4, x5, x6, x7);
+  }
+);
diff --git a/riscv/insns/vsm4r_vv.h b/riscv/insns/vsm4r_vv.h
new file mode 100644
index 0000000000..9a18cecee0
--- /dev/null
+++ b/riscv/insns/vsm4r_vv.h
@@ -0,0 +1,37 @@
+// vsm4r.vv vd, vs2
+
+#include "zvksed_ext_macros.h"
+
+require_vsm4_constraints;
+
+VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(
+  {},
+  {
+    // vd = {x0, x1,x2, x3} <- vd
+    EXTRACT_EGU32x4_WORDS_LE(vd, x0, x1, x2, x3);
+    // {rk0, rk1, rk2, rk3} <- vs2
+    EXTRACT_EGU32x4_WORDS_LE(vs2, rk0, rk1, rk2, rk3);
+
+    uint32_t B;
+    uint32_t S;
+
+    B = x1 ^ x2 ^ x3 ^ rk0;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x4 = ZVKSED_ROUND(x0, S);
+
+    B = x2 ^ x3 ^ x4 ^ rk1;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x5 = ZVKSED_ROUND(x1, S);
+
+    B = x3 ^ x4 ^ x5 ^ rk2;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x6 = ZVKSED_ROUND(x2, S);
+
+    B = x4 ^ x5 ^ x6 ^ rk3;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x7 = ZVKSED_ROUND(x3, S);
+
+    // Update the destination register.
+    SET_EGU32x4_LE(vd, x4, x5, x6, x7);
+  }
+);
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index 2d75662101..c774e1bf28 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1387,12 +1387,18 @@ riscv_insn_ext_zvknh = \
 	vsha2ch_vv \
 	vsha2ms_vv \
 
+riscv_insn_ext_zvksed = \
+	vsm4k_vi \
+	vsm4r_vs \
+	vsm4r_vv \
+
 riscv_insn_ext_zvk = \
 	$(riscv_insn_ext_zvbb) \
 	$(riscv_insn_ext_zvbc) \
 	$(riscv_insn_ext_zvkg) \
 	$(riscv_insn_ext_zvkned) \
 	$(riscv_insn_ext_zvknh) \
+	$(riscv_insn_ext_zvksed) \
 
 riscv_insn_list = \
 	$(if $(HAVE_INT128),$(riscv_insn_ext_v),) \
diff --git a/riscv/zvksed_ext_macros.h b/riscv/zvksed_ext_macros.h
new file mode 100644
index 0000000000..46e399b904
--- /dev/null
+++ b/riscv/zvksed_ext_macros.h
@@ -0,0 +1,60 @@
+// Helper macros and functions to help implement instructions defined as part of
+// the RISC-V Zvksed extension (vectorized SM4).
+
+#include "insns/sm4_common.h"
+#include "zvk_ext_macros.h"
+
+#ifndef RISCV_ZVKSED_MACROS_H_
+#define RISCV_ZVKSED_MACROS_H_
+
+// Constraints common to all vsm4* instructions:
+//  - Zvksed is enabled
+//  - VSEW == 32
+//  - EGW (128) <= LMUL * VLEN
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
+#define require_vsm4_constraints \
+  do { \
+    require_zvksed; \
+    require(P.VU.vsew == 32); \
+    require_egw_fits(128); \
+  } while (false)
+
+// Returns a uint32_t value constructed from the 4 bytes (uint8_t)
+// provided in "Little Endian" (LE) order, i.e., from least significant (B0)
+// to most significant (B3).
+#define ZVKSED_U32_FROM_U8_LE(B0, B1, B2, B3) \
+  (((uint32_t)(B0)) <<  0 | \
+   ((uint32_t)(B1)) <<  8 | \
+   ((uint32_t)(B2)) << 16 | \
+   ((uint32_t)(B3)) << 24)
+
+// Get byte BYTE of the SBox.
+#define ZVKSED_SBOX(BYTE)  (sm4_sbox[(BYTE)])
+
+// Given an unsigned integer 'X' and a byte index (0 = least significant),
+// returns a uint8_t value for the byte at the given index.
+#define ZVKSED_EXTRACT_U8(X, BYTE_IDX) ((uint8_t)((X) >> ((BYTE_IDX) * 8)))
+
+// Apply the nonlinear transformation tau to a 32 bit word B - section 6.2.1.
+// of the IETF draft.
+#define ZVKSED_SUB_BYTES(B) \
+  ZVKSED_U32_FROM_U8_LE(ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 0)), \
+                        ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 1)), \
+                        ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 2)), \
+                        ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 3)))
+
+// Perform the linear transformation L to a 32 bit word S and xor it with a 32
+// bit word X - section 6.2.2. of the IETF draft.
+#define ZVKSED_ROUND(X, S) \
+  ((X) ^ \
+   ((S) ^ ZVK_ROL32((S), 2) ^ ZVK_ROL32((S), 10) ^ \
+    ZVK_ROL32((S), 18) ^ ZVK_ROL32((S), 24)))
+
+// Perform the linear transformation L' to a 32 bit word S and xor it with a 32
+// bit word X - section 6.2.2. of the IETF draft.
+#define ZVKSED_ROUND_KEY(X, S) \
+  ((X) ^ ((S) ^ ZVK_ROL32((S), 13) ^ ZVK_ROL32((S), 23)))
+
+#endif // RISCV_ZVKSED_MACROS_H_

From a55f96ae9380d5cc9bef05e8b9e82e54d5d6ec35 Mon Sep 17 00:00:00 2001
From: Eric Gouriou <ego@rivosinc.com>
Date: Thu, 1 Jun 2023 18:09:07 -0700
Subject: [PATCH 084/110] Zvk: Implement Zvksh, vector SM3 Hash Function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement the Zvksh sub-extension, "ShangMi Suite: SM3 Hash
Function Instructions":
 - vsm3me.vv, message expansion,
 - vsm3c.vi, compression rounds.

This also introduces a SM3 specific header for common logic.

Co-authored-by: Raghav Gupta <rgupta@rivosinc.com>
Co-authored-by: Albert Jakieła <aja@semihalf.com>
Co-authored-by: Kornel Dulęba <mindal@semihalf.com>
Signed-off-by: Eric Gouriou <ego@rivosinc.com>
---
 riscv/insns/vsm3c_vi.h   | 60 ++++++++++++++++++++++++++++++++++++++++
 riscv/insns/vsm3me_vv.h  | 39 ++++++++++++++++++++++++++
 riscv/riscv.mk.in        |  5 ++++
 riscv/zvksh_ext_macros.h | 47 +++++++++++++++++++++++++++++++
 4 files changed, 151 insertions(+)
 create mode 100644 riscv/insns/vsm3c_vi.h
 create mode 100644 riscv/insns/vsm3me_vv.h
 create mode 100644 riscv/zvksh_ext_macros.h

diff --git a/riscv/insns/vsm3c_vi.h b/riscv/insns/vsm3c_vi.h
new file mode 100644
index 0000000000..b3e81216f8
--- /dev/null
+++ b/riscv/insns/vsm3c_vi.h
@@ -0,0 +1,60 @@
+// vsm3c.vi vd, vs2, rnd
+
+#include "zvksh_ext_macros.h"
+
+require_vsm3_constraints;
+
+VI_ZVK_VD_VS2_ZIMM5_EGU32x8_NOVM_LOOP(
+  {},
+  // No need to validate or normalize 'zimm5' here as this is a 5 bits value
+  // and all values in 0-31 are valid.
+  const reg_t round = zimm5;,
+  {
+    // {H, G, F, E, D, C, B, A} <- vd
+    EXTRACT_EGU32x8_WORDS_BE_BSWAP(vd, H, G, F, E, D, C, B, A);
+    // {_, _, w5, w4, _, _, w1, w0} <- vs2
+    EXTRACT_EGU32x8_WORDS_BE_BSWAP(vs2,
+                                   UNUSED _unused_w7, UNUSED _unused_w6, w5, w4,
+                                   UNUSED _unused_w3, UNUSED _unused_w2, w1, w0);
+    const uint32_t x0 = w0 ^ w4;  // W'[0] in spec documentation.
+    const uint32_t x1 = w1 ^ w5;  // W'[1]
+
+    // Two rounds of compression.
+    uint32_t ss1;
+    uint32_t ss2;
+    uint32_t tt1;
+    uint32_t tt2;
+    uint32_t j;
+
+    j = 2 * round;
+    ss1 = ZVK_ROL32(ZVK_ROL32(A, 12) + E + ZVK_ROL32(ZVKSH_T(j), j % 32), 7);
+    ss2 = ss1 ^ ZVK_ROL32(A, 12);
+    tt1 = ZVKSH_FF(A, B, C, j) + D + ss2 + x0;
+    tt2 = ZVKSH_GG(E, F, G, j) + H + ss1 + w0;
+    D = C;
+    const uint32_t C1 = ZVK_ROL32(B, 9);
+    B = A;
+    const uint32_t A1 = tt1;
+    H = G;
+    const uint32_t G1 = ZVK_ROL32(F, 19);
+    F = E;
+    const uint32_t E1 = ZVKSH_P0(tt2);
+
+    j = 2 * round + 1;
+    ss1 = ZVK_ROL32(ZVK_ROL32(A1, 12) + E1 + ZVK_ROL32(ZVKSH_T(j), j % 32), 7);
+    ss2 = ss1 ^ ZVK_ROL32(A1, 12);
+    tt1 = ZVKSH_FF(A1, B, C1, j) + D + ss2 + x1;
+    tt2 = ZVKSH_GG(E1, F, G1, j) + H + ss1 + w1;
+    D = C1;
+    const uint32_t C2 = ZVK_ROL32(B, 9);
+    B = A1;
+    const uint32_t A2 = tt1;
+    H = G1;
+    const uint32_t G2 = ZVK_ROL32(F, 19);
+    F = E1;
+    const uint32_t E2 = ZVKSH_P0(tt2);
+
+    // Update the destination register.
+    SET_EGU32x8_WORDS_BE_BSWAP(vd, G1, G2, E1, E2, C1, C2, A1, A2);
+  }
+);
diff --git a/riscv/insns/vsm3me_vv.h b/riscv/insns/vsm3me_vv.h
new file mode 100644
index 0000000000..dd6cb523f2
--- /dev/null
+++ b/riscv/insns/vsm3me_vv.h
@@ -0,0 +1,39 @@
+// vsm3me.vv vd, vs2, vs1
+
+#include "zvk_ext_macros.h"
+#include "zvksh_ext_macros.h"
+
+// Per the SM3 spec, the message expansion computes new words Wi as:
+//   W[i] = (    P_1( W[i-16] xor W[i-9] xor ( W[i-3] <<< 15 ) )
+//           xor ( W[i-13] <<< 7 )
+//           xor W[i-6] )
+// Using arguments M16 = W[i-16], M9 = W[i-9], etc.,
+// where Mk stands for "W[i Minus k]", we define the "W function":
+#define ZVKSH_W(M16, M9, M3, M13, M6) \
+  (ZVKSH_P1((M16) ^  (M9) ^ ZVK_ROL32((M3), 15)) ^ ZVK_ROL32((M13), 7) ^ (M6))
+
+require_vsm3_constraints;
+
+VI_ZVK_VD_VS1_VS2_EGU32x8_NOVM_LOOP(
+  {},
+  {
+    // {w7,  w6,  w5,  w4,  w3,  w2,  w1,  w0} <- vs1
+    EXTRACT_EGU32x8_WORDS_BE_BSWAP(vs1, w7, w6, w5, w4, w3, w2, w1, w0);
+    // {w15, w14, w13, w12, w11, w10, w9, w8} <- vs2
+    EXTRACT_EGU32x8_WORDS_BE_BSWAP(vs2, w15, w14, w13, w12, w11, w10, w9, w8);
+
+    // Arguments are W[i-16], W[i-9], W[i-3], W[i-13], W[i-6].
+    // Note that some of the newly computed words are used in later invocations.
+    const uint32_t w16 = ZVKSH_W(w0,  w7, w13,  w3, w10);
+    const uint32_t w17 = ZVKSH_W(w1,  w8, w14,  w4, w11);
+    const uint32_t w18 = ZVKSH_W(w2,  w9, w15,  w5, w12);
+    const uint32_t w19 = ZVKSH_W(w3, w10, w16,  w6, w13);
+    const uint32_t w20 = ZVKSH_W(w4, w11, w17,  w7, w14);
+    const uint32_t w21 = ZVKSH_W(w5, w12, w18,  w8, w15);
+    const uint32_t w22 = ZVKSH_W(w6, w13, w19,  w9, w16);
+    const uint32_t w23 = ZVKSH_W(w7, w14, w20, w10, w17);
+
+    // Update the destination register.
+    SET_EGU32x8_WORDS_BE_BSWAP(vd, w23, w22, w21, w20, w19, w18, w17, w16);
+  }
+);
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index c774e1bf28..a3e125f5d8 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -1392,6 +1392,10 @@ riscv_insn_ext_zvksed = \
 	vsm4r_vs \
 	vsm4r_vv \
 
+riscv_insn_ext_zvksh = \
+	vsm3c_vi \
+	vsm3me_vv \
+
 riscv_insn_ext_zvk = \
 	$(riscv_insn_ext_zvbb) \
 	$(riscv_insn_ext_zvbc) \
@@ -1399,6 +1403,7 @@ riscv_insn_ext_zvk = \
 	$(riscv_insn_ext_zvkned) \
 	$(riscv_insn_ext_zvknh) \
 	$(riscv_insn_ext_zvksed) \
+	$(riscv_insn_ext_zvksh) \
 
 riscv_insn_list = \
 	$(if $(HAVE_INT128),$(riscv_insn_ext_v),) \
diff --git a/riscv/zvksh_ext_macros.h b/riscv/zvksh_ext_macros.h
new file mode 100644
index 0000000000..71c5a09149
--- /dev/null
+++ b/riscv/zvksh_ext_macros.h
@@ -0,0 +1,47 @@
+// Helper macros and functions to help implement instructions defined as part of
+// the RISC-V Zvksh extension (vectorized SM3).
+
+#include "zvk_ext_macros.h"
+
+#ifndef RISCV_INSNS_ZVKSH_COMMON_H_
+#define RISCV_INSNS_ZVKSH_COMMON_H_
+
+// Constraints common to all vsm3* instructions:
+//  - Zvksh is enabled
+//  - VSEW == 32
+//  - EGW (256) <= LMUL * VLEN
+//  - No overlap of vd and vs2.
+//
+// The constraint that vstart and vl are both EGS (8) aligned
+// is checked in the VI_ZVK_..._EGU32x8_..._LOOP macros.
+#define require_vsm3_constraints \
+  do { \
+    require_zvksh; \
+    require(P.VU.vsew == 32); \
+    require_egw_fits(256); \
+    require(insn.rd() != insn.rs2()); \
+  } while (false)
+
+#define FF1(X, Y, Z) ((X) ^ (Y) ^ (Z))
+#define FF2(X, Y, Z) (((X) & (Y)) | ((X) & (Z)) | ((Y) & (Z)))
+
+// Boolean function FF_j - section 4.3. of the IETF draft.
+#define ZVKSH_FF(X, Y, Z, J) (((J) <= 15) ? FF1(X, Y, Z) : FF2(X, Y, Z))
+
+#define GG1(X, Y, Z) ((X) ^ (Y) ^ (Z))
+#define GG2(X, Y, Z) (((X) & (Y)) | ((~(X)) & (Z)))
+
+// Boolean function GG_j - section 4.3. of the IETF draft.
+#define ZVKSH_GG(X, Y, Z, J) (((J) <= 15) ? GG1(X, Y, Z) : GG2(X, Y, Z))
+
+#define T1 0x79CC4519
+#define T2 0x7A879D8A
+
+// T_j constant - section 4.2. of the IETF draft.
+#define ZVKSH_T(J) (((J) <= 15) ? (T1) : (T2))
+
+// Permutation functions P_0 and P_1 - section 4.4 of the IETF draft.
+#define ZVKSH_P0(X) ((X) ^ ZVK_ROL32((X),  9) ^ ZVK_ROL32((X), 17))
+#define ZVKSH_P1(X) ((X) ^ ZVK_ROL32((X), 15) ^ ZVK_ROL32((X), 23))
+
+#endif // RISCV_INSNS_ZVKSH_COMMON_H_

From e1101a13ae8ba0effea1f4647da52cb1c273a105 Mon Sep 17 00:00:00 2001
From: Eric Gouriou <ego@rivosinc.com>
Date: Tue, 20 Jun 2023 00:21:42 -0700
Subject: [PATCH 085/110] Zvk: disassembler support

Add disassembler support for all instructions in Zvk extensions:
 - Zvbb (bitmanip)
 - Zvbc (carryless multiplication)
 - Zvkg (GMAC)
 - Zvkned (AES)
 - Zvknha / Zvknhb (SHA-256, SHA-512)
 - Zvksed (SM4)
 - Zvksh (SM3)

Macros are used to limit code duplication, following the example
of the base V extension. Because the V extension undefines some
of its macros after their use, the Zvk support defines some
similar macros.

Co-authored-by: Gianluca Guida <gianluca@rivosinc.com>
Signed-off-by: Eric Gouriou <ego@rivosinc.com>
---
 disasm/disasm.cc | 98 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/disasm/disasm.cc b/disasm/disasm.cc
index 096c38f0f3..6f93d241ec 100644
--- a/disasm/disasm.cc
+++ b/disasm/disasm.cc
@@ -187,6 +187,12 @@ struct : public arg_t {
   }
 } zimm5;
 
+struct : public arg_t {
+  std::string to_string(insn_t insn) const {
+    return std::to_string(insn.v_zimm6());
+  }
+} v_zimm6;
+
 struct : public arg_t {
   std::string to_string(insn_t insn) const {
     int32_t target = insn.sb_imm();
@@ -678,6 +684,11 @@ static void NOINLINE add_vector_viu_insn(disassembler_t* d, const char* name, ui
   d->add_insn(new disasm_insn_t(name, match, mask, {&vd, &vs2, &zimm5, opt, &vm}));
 }
 
+static void NOINLINE add_vector_viu_z6_insn(disassembler_t* d, const char* name, uint32_t match, uint32_t mask)
+{
+  d->add_insn(new disasm_insn_t(name, match, mask, {&vd, &vs2, &v_zimm6, opt, &vm}));
+}
+
 static void NOINLINE add_vector_vvm_insn(disassembler_t* d, const char* name, uint32_t match, uint32_t mask)
 {
   d->add_insn(new disasm_insn_t(name, match, mask, {&vd, &vs2, &vs1, &v0}));
@@ -2222,6 +2233,93 @@ void disassembler_t::add_instructions(const isa_parser_t* isa)
     DEFINE_R1TYPE(sm3p1);
   }
 
+  if (isa->extension_enabled(EXT_ZVBB)) {
+#define DEFINE_VECTOR_VIU_ZIMM6(code) \
+  add_vector_viu_z6_insn(this, #code, match_##code, mask_##code)
+#define DISASM_VECTOR_VV_VX(name) \
+  DEFINE_VECTOR_VV(name##_vv); \
+  DEFINE_VECTOR_VX(name##_vx)
+#define DISASM_VECTOR_VV_VX_VIU(name) \
+  DEFINE_VECTOR_VV(name##_vv); \
+  DEFINE_VECTOR_VX(name##_vx); \
+  DEFINE_VECTOR_VIU(name##_vi)  // immediate form is <name>.vi, not a second <name>.vx
+#define DISASM_VECTOR_VV_VX_VIU_ZIMM6(name) \
+  DEFINE_VECTOR_VV(name##_vv); \
+  DEFINE_VECTOR_VX(name##_vx); \
+  DEFINE_VECTOR_VIU_ZIMM6(name##_vi)
+
+    DISASM_VECTOR_VV_VX(vandn);
+    DEFINE_VECTOR_V(vbrev_v);
+    DEFINE_VECTOR_V(vbrev8_v);
+    DEFINE_VECTOR_V(vrev8_v);
+    DEFINE_VECTOR_V(vclz_v);
+    DEFINE_VECTOR_V(vctz_v);
+    DEFINE_VECTOR_V(vcpop_v);
+    DISASM_VECTOR_VV_VX(vrol);
+    DISASM_VECTOR_VV_VX_VIU_ZIMM6(vror);
+    DISASM_VECTOR_VV_VX_VIU(vwsll);
+
+#undef DEFINE_VECTOR_VIU_ZIMM6
+#undef DISASM_VECTOR_VV_VX
+#undef DISASM_VECTOR_VV_VX_VIU
+#undef DISASM_VECTOR_VV_VX_VIU_ZIMM6
+    }
+
+  if (isa->extension_enabled(EXT_ZVBC)) {
+#define DISASM_VECTOR_VV_VX(name) \
+    DEFINE_VECTOR_VV(name##_vv); \
+    DEFINE_VECTOR_VX(name##_vx)
+
+    DISASM_VECTOR_VV_VX(vclmul);
+    DISASM_VECTOR_VV_VX(vclmulh);
+
+#undef DISASM_VECTOR_VV_VX
+  }
+
+  if (isa->extension_enabled(EXT_ZVKG)) {
+    // Despite its suffix, the vgmul.vv instruction
+    // is really ".v", with the form "vgmul.vv vd, vs2".
+    DEFINE_VECTOR_V(vgmul_vv);
+    DEFINE_VECTOR_VV(vghsh_vv);
+  }
+
+  if (isa->extension_enabled(EXT_ZVKNED)) {
+    // Despite their suffixes, the vaes*.{vv,vs} instructions
+    // are really ".v", with the form "<op>.{vv,vs} vd, vs2".
+#define DISASM_VECTOR_VV_VS(name) \
+    DEFINE_VECTOR_V(name##_vv); \
+    DEFINE_VECTOR_V(name##_vs)
+
+    DISASM_VECTOR_VV_VS(vaesdm);
+    DISASM_VECTOR_VV_VS(vaesdf);
+    DISASM_VECTOR_VV_VS(vaesem);
+    DISASM_VECTOR_VV_VS(vaesef);
+
+    DEFINE_VECTOR_V(vaesz_vs);
+    DEFINE_VECTOR_VIU(vaeskf1_vi);
+    DEFINE_VECTOR_VIU(vaeskf2_vi);
+#undef DISASM_VECTOR_VV_VS
+  }
+
+  if (isa->extension_enabled(EXT_ZVKNHA) ||
+      isa->extension_enabled(EXT_ZVKNHB)) {
+    DEFINE_VECTOR_VV(vsha2ms_vv);
+    DEFINE_VECTOR_VV(vsha2ch_vv);
+    DEFINE_VECTOR_VV(vsha2cl_vv);
+  }
+
+  if (isa->extension_enabled(EXT_ZVKSED)) {
+    DEFINE_VECTOR_VIU(vsm4k_vi);
+    // Despite their suffixes, the vsm4r.{vv,vs} instructions
+    // are really ".v", with the form "vsm4r.{vv,vs} vd, vs2".
+    DEFINE_VECTOR_V(vsm4r_vv);
+    DEFINE_VECTOR_V(vsm4r_vs);
+  }
+
+  if (isa->extension_enabled(EXT_ZVKSH)) {
+    DEFINE_VECTOR_VIU(vsm3c_vi);
+    DEFINE_VECTOR_VV(vsm3me_vv);
+  }
 }
 
 disassembler_t::disassembler_t(const isa_parser_t *isa)

From 59e8b9fab6d96acf74f78f6a7db8cc2005d4fa70 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Tue, 6 Jun 2023 13:03:22 -0700
Subject: [PATCH 086/110] device_t: Add missing overrides to derived
 abstract_device_t classes

---
 riscv/devices.h | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/riscv/devices.h b/riscv/devices.h
index 02d9e98068..b9f639d45b 100644
--- a/riscv/devices.h
+++ b/riscv/devices.h
@@ -17,8 +17,8 @@ class simif_t;
 
 class bus_t : public abstract_device_t {
  public:
-  bool load(reg_t addr, size_t len, uint8_t* bytes);
-  bool store(reg_t addr, size_t len, const uint8_t* bytes);
+  bool load(reg_t addr, size_t len, uint8_t* bytes) override;
+  bool store(reg_t addr, size_t len, const uint8_t* bytes) override;
   void add_device(reg_t addr, abstract_device_t* dev);
 
   std::pair<reg_t, abstract_device_t*> find_device(reg_t addr);
@@ -30,8 +30,8 @@ class bus_t : public abstract_device_t {
 class rom_device_t : public abstract_device_t {
  public:
   rom_device_t(std::vector<char> data);
-  bool load(reg_t addr, size_t len, uint8_t* bytes);
-  bool store(reg_t addr, size_t len, const uint8_t* bytes);
+  bool load(reg_t addr, size_t len, uint8_t* bytes) override;
+  bool store(reg_t addr, size_t len, const uint8_t* bytes) override;
   const std::vector<char>& contents() { return data; }
  private:
   std::vector<char> data;
@@ -43,8 +43,8 @@ class mem_t : public abstract_device_t {
   mem_t(const mem_t& that) = delete;
   ~mem_t();
 
-  bool load(reg_t addr, size_t len, uint8_t* bytes) { return load_store(addr, len, bytes, false); }
-  bool store(reg_t addr, size_t len, const uint8_t* bytes) { return load_store(addr, len, const_cast<uint8_t*>(bytes), true); }
+  bool load(reg_t addr, size_t len, uint8_t* bytes) override { return load_store(addr, len, bytes, false); }
+  bool store(reg_t addr, size_t len, const uint8_t* bytes) override { return load_store(addr, len, const_cast<uint8_t*>(bytes), true); }
   char* contents(reg_t addr);
   reg_t size() { return sz; }
   void dump(std::ostream& o);
@@ -59,8 +59,8 @@ class mem_t : public abstract_device_t {
 class clint_t : public abstract_device_t {
  public:
   clint_t(simif_t*, uint64_t freq_hz, bool real_time);
-  bool load(reg_t addr, size_t len, uint8_t* bytes);
-  bool store(reg_t addr, size_t len, const uint8_t* bytes);
+  bool load(reg_t addr, size_t len, uint8_t* bytes) override;
+  bool store(reg_t addr, size_t len, const uint8_t* bytes) override;
   size_t size() { return CLINT_SIZE; }
   void increment(reg_t inc);
   uint64_t get_mtimecmp(reg_t hartid) { return mtimecmp[hartid]; }
@@ -98,9 +98,9 @@ struct plic_context_t {
 class plic_t : public abstract_device_t, public abstract_interrupt_controller_t {
  public:
   plic_t(simif_t*, uint32_t ndev);
-  bool load(reg_t addr, size_t len, uint8_t* bytes);
-  bool store(reg_t addr, size_t len, const uint8_t* bytes);
-  void set_interrupt_level(uint32_t id, int lvl);
+  bool load(reg_t addr, size_t len, uint8_t* bytes) override;
+  bool store(reg_t addr, size_t len, const uint8_t* bytes) override;
+  void set_interrupt_level(uint32_t id, int lvl) override;
   size_t size() { return PLIC_SIZE; }
  private:
   std::vector<plic_context_t> contexts;
@@ -129,8 +129,8 @@ class ns16550_t : public abstract_device_t {
  public:
   ns16550_t(class bus_t *bus, abstract_interrupt_controller_t *intctrl,
             uint32_t interrupt_id, uint32_t reg_shift, uint32_t reg_io_width);
-  bool load(reg_t addr, size_t len, uint8_t* bytes);
-  bool store(reg_t addr, size_t len, const uint8_t* bytes);
+  bool load(reg_t addr, size_t len, uint8_t* bytes) override;
+  bool store(reg_t addr, size_t len, const uint8_t* bytes) override;
   void tick(void);
   size_t size() { return NS16550_SIZE; }
  private:

From 803d85bac7b1dc5e491b735080cbcd7af7eac03e Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Thu, 1 Jun 2023 19:29:57 -0700
Subject: [PATCH 087/110] sim_t: change devices to shared_ptrs

---
 riscv/sim.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/riscv/sim.h b/riscv/sim.h
index 3109173f19..e5a73030be 100644
--- a/riscv/sim.h
+++ b/riscv/sim.h
@@ -70,10 +70,10 @@ class sim_t : public htif_t, public simif_t
   std::string dts;
   std::string dtb;
   bool dtb_enabled;
-  std::unique_ptr<rom_device_t> boot_rom;
-  std::unique_ptr<clint_t> clint;
-  std::unique_ptr<plic_t> plic;
-  std::unique_ptr<ns16550_t> ns16550;
+  std::shared_ptr<rom_device_t> boot_rom;
+  std::shared_ptr<clint_t> clint;
+  std::shared_ptr<plic_t> plic;
+  std::shared_ptr<ns16550_t> ns16550;
   bus_t bus;
   log_file_t log_file;
 

From 20793b36b79db337187964da3df397c1da576c23 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Thu, 1 Jun 2023 19:35:55 -0700
Subject: [PATCH 088/110] sim_t: Add list of ptrs to devices to sim_t

---
 riscv/sim.cc | 4 ++++
 riscv/sim.h  | 1 +
 2 files changed, 5 insertions(+)

diff --git a/riscv/sim.cc b/riscv/sim.cc
index dcbd469d32..82619e7b93 100644
--- a/riscv/sim.cc
+++ b/riscv/sim.cc
@@ -123,6 +123,7 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
   if (fdt_parse_clint(fdt, &clint_base, "riscv,clint0") == 0) {
     clint.reset(new clint_t(this, CPU_HZ / INSNS_PER_RTC_TICK, cfg->real_time_clint()));
     bus.add_device(clint_base, clint.get());
+    devices.push_back(clint);
   }
 
   // pointer to wired interrupt controller
@@ -134,6 +135,7 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
   if (fdt_parse_plic(fdt, &plic_base, &plic_ndev, "riscv,plic0") == 0) {
     plic.reset(new plic_t(this, plic_ndev));
     bus.add_device(plic_base, plic.get());
+    devices.push_back(plic);
     intctrl = plic.get();
   }
 
@@ -146,6 +148,7 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
     ns16550.reset(new ns16550_t(&bus, intctrl, NS16550_INTERRUPT_ID,
                                 ns16550_shift, ns16550_io_width));
     bus.add_device(ns16550_base, ns16550.get());
+    devices.push_back(ns16550);
   }
 
   //per core attribute
@@ -376,6 +379,7 @@ void sim_t::set_rom()
 
   boot_rom.reset(new rom_device_t(rom));
   bus.add_device(DEFAULT_RSTVEC, boot_rom.get());
+  devices.push_back(boot_rom);
 }
 
 char* sim_t::addr_to_mem(reg_t paddr) {
diff --git a/riscv/sim.h b/riscv/sim.h
index e5a73030be..3afeeddc21 100644
--- a/riscv/sim.h
+++ b/riscv/sim.h
@@ -70,6 +70,7 @@ class sim_t : public htif_t, public simif_t
   std::string dts;
   std::string dtb;
   bool dtb_enabled;
+  std::vector<std::shared_ptr<abstract_device_t>> devices;
   std::shared_ptr<rom_device_t> boot_rom;
   std::shared_ptr<clint_t> clint;
   std::shared_ptr<plic_t> plic;

From e47fc7075110fd97ee3fc96a4e67acb4a3b9c5fa Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Fri, 2 Jun 2023 09:15:03 -0700
Subject: [PATCH 089/110] clint: Change clint_t::increment to override
 abstract_device_t::tick(rtc_ticks)

---
 riscv/abstract_device.h |  2 ++
 riscv/clint.cc          | 10 +++++-----
 riscv/devices.h         |  4 ++--
 riscv/ns16550.cc        |  2 +-
 riscv/sim.cc            |  5 +++--
 5 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/riscv/abstract_device.h b/riscv/abstract_device.h
index 559c64f6d6..f4ccebe230 100644
--- a/riscv/abstract_device.h
+++ b/riscv/abstract_device.h
@@ -2,6 +2,7 @@
 #define _RISCV_ABSTRACT_DEVICE_H
 
 #include "decode.h"
+#include "common.h"
 #include <cstdint>
 #include <cstddef>
 
@@ -10,6 +11,7 @@ class abstract_device_t {
   virtual bool load(reg_t addr, size_t len, uint8_t* bytes) = 0;
   virtual bool store(reg_t addr, size_t len, const uint8_t* bytes) = 0;
   virtual ~abstract_device_t() {}
+  virtual void tick(reg_t UNUSED rtc_ticks) {}
 };
 
 #endif
diff --git a/riscv/clint.cc b/riscv/clint.cc
index f27f02c342..485c997843 100644
--- a/riscv/clint.cc
+++ b/riscv/clint.cc
@@ -12,7 +12,7 @@ clint_t::clint_t(simif_t* sim, uint64_t freq_hz, bool real_time)
 
   real_time_ref_secs = base.tv_sec;
   real_time_ref_usecs = base.tv_usec;
-  increment(0);
+  tick(0);
 }
 
 /* 0000 msip hart 0
@@ -34,7 +34,7 @@ bool clint_t::load(reg_t addr, size_t len, uint8_t* bytes)
   if (len > 8)
     return false;
 
-  increment(0);
+  tick(0);
 
   if (addr >= MSIP_BASE && addr < MTIMECMP_BASE) {
     if (len == 8) {
@@ -90,11 +90,11 @@ bool clint_t::store(reg_t addr, size_t len, const uint8_t* bytes)
   } else {
     return false;
   }
-  increment(0);
+  tick(0);
   return true;
 }
 
-void clint_t::increment(reg_t inc)
+void clint_t::tick(reg_t rtc_ticks)
 {
   if (real_time) {
    struct timeval now;
@@ -104,7 +104,7 @@ void clint_t::increment(reg_t inc)
    diff_usecs = ((now.tv_sec - real_time_ref_secs) * 1000000) + (now.tv_usec - real_time_ref_usecs);
    mtime = diff_usecs * freq_hz / 1000000;
   } else {
-    mtime += inc;
+    mtime += rtc_ticks;
   }
 
   for (const auto& [hart_id, hart] : sim->get_harts()) {
diff --git a/riscv/devices.h b/riscv/devices.h
index b9f639d45b..a62509ae5b 100644
--- a/riscv/devices.h
+++ b/riscv/devices.h
@@ -62,7 +62,7 @@ class clint_t : public abstract_device_t {
   bool load(reg_t addr, size_t len, uint8_t* bytes) override;
   bool store(reg_t addr, size_t len, const uint8_t* bytes) override;
   size_t size() { return CLINT_SIZE; }
-  void increment(reg_t inc);
+  void tick(reg_t rtc_ticks) override;
   uint64_t get_mtimecmp(reg_t hartid) { return mtimecmp[hartid]; }
   uint64_t get_mtime() { return mtime; }
  private:
@@ -131,7 +131,7 @@ class ns16550_t : public abstract_device_t {
             uint32_t interrupt_id, uint32_t reg_shift, uint32_t reg_io_width);
   bool load(reg_t addr, size_t len, uint8_t* bytes) override;
   bool store(reg_t addr, size_t len, const uint8_t* bytes) override;
-  void tick(void);
+  void tick(reg_t rtc_ticks) override;
   size_t size() { return NS16550_SIZE; }
  private:
   class bus_t *bus;
diff --git a/riscv/ns16550.cc b/riscv/ns16550.cc
index 8d7e4de2d7..d21983be70 100644
--- a/riscv/ns16550.cc
+++ b/riscv/ns16550.cc
@@ -292,7 +292,7 @@ bool ns16550_t::store(reg_t addr, size_t len, const uint8_t* bytes)
   return ret;
 }
 
-void ns16550_t::tick(void)
+void ns16550_t::tick(reg_t UNUSED rtc_ticks)
 {
   if (!(fcr & UART_FCR_ENABLE_FIFO) ||
       (mcr & UART_MCR_LOOP) ||
diff --git a/riscv/sim.cc b/riscv/sim.cc
index 82619e7b93..77ed4c7a09 100644
--- a/riscv/sim.cc
+++ b/riscv/sim.cc
@@ -245,8 +245,9 @@ void sim_t::step(size_t n)
       procs[current_proc]->get_mmu()->yield_load_reservation();
       if (++current_proc == procs.size()) {
         current_proc = 0;
-        if (clint) clint->increment(INTERLEAVE / INSNS_PER_RTC_TICK);
-        if (ns16550) ns16550->tick();
+        reg_t rtc_ticks = INTERLEAVE / INSNS_PER_RTC_TICK;
+        if (clint) clint->tick(rtc_ticks);
+        if (ns16550) ns16550->tick(rtc_ticks);
       }
     }
   }

From e733a70d0565bcee9aeba27b654df1a52dff08fe Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Fri, 2 Jun 2023 09:18:11 -0700
Subject: [PATCH 090/110] sim_t: Tick all devices, not just clint and ns16550

---
 riscv/sim.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/riscv/sim.cc b/riscv/sim.cc
index 77ed4c7a09..9d0bfb82e9 100644
--- a/riscv/sim.cc
+++ b/riscv/sim.cc
@@ -246,8 +246,7 @@ void sim_t::step(size_t n)
       if (++current_proc == procs.size()) {
         current_proc = 0;
         reg_t rtc_ticks = INTERLEAVE / INSNS_PER_RTC_TICK;
-        if (clint) clint->tick(rtc_ticks);
-        if (ns16550) ns16550->tick(rtc_ticks);
+        for (auto &dev : devices) dev->tick(rtc_ticks);
       }
     }
   }

From 6456b5ad25a2b7efb6c4f9ccd28e00a5408eb743 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Fri, 2 Jun 2023 09:27:09 -0700
Subject: [PATCH 091/110] sim_t: Remove boot_rom/ns16550 members of sim_t

These are redundant with sim_t::devices
---
 riscv/sim.cc | 6 +++---
 riscv/sim.h  | 2 --
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/riscv/sim.cc b/riscv/sim.cc
index 9d0bfb82e9..4fa49b2cf5 100644
--- a/riscv/sim.cc
+++ b/riscv/sim.cc
@@ -145,8 +145,8 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
   if (fdt_parse_ns16550(fdt, &ns16550_base,
                         &ns16550_shift, &ns16550_io_width, "ns16550a") == 0) {
     assert(intctrl);
-    ns16550.reset(new ns16550_t(&bus, intctrl, NS16550_INTERRUPT_ID,
-                                ns16550_shift, ns16550_io_width));
+    std::shared_ptr<ns16550_t> ns16550(new ns16550_t(&bus, intctrl, NS16550_INTERRUPT_ID,
+                                                     ns16550_shift, ns16550_io_width));
     bus.add_device(ns16550_base, ns16550.get());
     devices.push_back(ns16550);
   }
@@ -377,7 +377,7 @@ void sim_t::set_rom()
   const int align = 0x1000;
   rom.resize((rom.size() + align - 1) / align * align);
 
-  boot_rom.reset(new rom_device_t(rom));
+  std::shared_ptr<rom_device_t> boot_rom(new rom_device_t(rom));
   bus.add_device(DEFAULT_RSTVEC, boot_rom.get());
   devices.push_back(boot_rom);
 }
diff --git a/riscv/sim.h b/riscv/sim.h
index 3afeeddc21..a851643125 100644
--- a/riscv/sim.h
+++ b/riscv/sim.h
@@ -71,10 +71,8 @@ class sim_t : public htif_t, public simif_t
   std::string dtb;
   bool dtb_enabled;
   std::vector<std::shared_ptr<abstract_device_t>> devices;
-  std::shared_ptr<rom_device_t> boot_rom;
   std::shared_ptr<clint_t> clint;
   std::shared_ptr<plic_t> plic;
-  std::shared_ptr<ns16550_t> ns16550;
   bus_t bus;
   log_file_t log_file;
 

From 426a33e77438f956d0890391af7bb7ed9b7a20fc Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Fri, 2 Jun 2023 12:07:22 -0700
Subject: [PATCH 092/110] sim_t: change plugin_devices to a vec of shared_ptrs

---
 ci-tests/testlib.c  | 2 +-
 riscv/sim.cc        | 4 ++--
 riscv/sim.h         | 4 ++--
 spike_main/spike.cc | 7 ++-----
 4 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/ci-tests/testlib.c b/ci-tests/testlib.c
index 3d5438b586..33eaede2e5 100644
--- a/ci-tests/testlib.c
+++ b/ci-tests/testlib.c
@@ -28,7 +28,7 @@ int main()
             hartids,
             false,
             4);
-  std::vector<std::pair<reg_t, abstract_device_t*>> plugin_devices;
+  std::vector<std::pair<reg_t, std::shared_ptr<abstract_device_t>>> plugin_devices;
   std::vector<std::string> htif_args {"pk", "hello"};
   debug_module_config_t dm_config = {
     .progbufsize = 2,
diff --git a/riscv/sim.cc b/riscv/sim.cc
index 4fa49b2cf5..43c91f6106 100644
--- a/riscv/sim.cc
+++ b/riscv/sim.cc
@@ -34,7 +34,7 @@ const size_t sim_t::INTERLEAVE;
 
 sim_t::sim_t(const cfg_t *cfg, bool halted,
              std::vector<std::pair<reg_t, mem_t*>> mems,
-             std::vector<std::pair<reg_t, abstract_device_t*>> plugin_devices,
+             std::vector<std::pair<reg_t, std::shared_ptr<abstract_device_t>>> plugin_devices,
              const std::vector<std::string>& args,
              const debug_module_config_t &dm_config,
              const char *log_path,
@@ -67,7 +67,7 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
     bus.add_device(x.first, x.second);
 
   for (auto& x : plugin_devices)
-    bus.add_device(x.first, x.second);
+    bus.add_device(x.first, x.second.get());
 
   debug_module.add_device(&bus);
 
diff --git a/riscv/sim.h b/riscv/sim.h
index a851643125..ba956661ec 100644
--- a/riscv/sim.h
+++ b/riscv/sim.h
@@ -27,7 +27,7 @@ class sim_t : public htif_t, public simif_t
 public:
   sim_t(const cfg_t *cfg, bool halted,
         std::vector<std::pair<reg_t, mem_t*>> mems,
-        std::vector<std::pair<reg_t, abstract_device_t*>> plugin_devices,
+        std::vector<std::pair<reg_t, std::shared_ptr<abstract_device_t>>> plugin_devices,
         const std::vector<std::string>& args,
         const debug_module_config_t &dm_config, const char *log_path,
         bool dtb_enabled, const char *dtb_file,
@@ -63,7 +63,7 @@ class sim_t : public htif_t, public simif_t
   isa_parser_t isa;
   const cfg_t * const cfg;
   std::vector<std::pair<reg_t, mem_t*>> mems;
-  std::vector<std::pair<reg_t, abstract_device_t*>> plugin_devices;
+  std::vector<std::pair<reg_t, std::shared_ptr<abstract_device_t>>> plugin_devices;
   std::vector<processor_t*> procs;
   std::map<size_t, processor_t*> harts;
   std::pair<reg_t, reg_t> initrd_range;
diff --git a/spike_main/spike.cc b/spike_main/spike.cc
index 7290f38bbd..f257582ffb 100644
--- a/spike_main/spike.cc
+++ b/spike_main/spike.cc
@@ -336,7 +336,7 @@ int main(int argc, char** argv)
   bool dtb_enabled = true;
   const char* kernel = NULL;
   reg_t kernel_offset, kernel_size;
-  std::vector<std::pair<reg_t, abstract_device_t*>> plugin_devices;
+  std::vector<std::pair<reg_t, std::shared_ptr<abstract_device_t>>> plugin_devices;
   std::unique_ptr<icache_sim_t> ic;
   std::unique_ptr<dcache_sim_t> dc;
   std::unique_ptr<cache_sim_t> l2;
@@ -416,7 +416,7 @@ int main(int argc, char** argv)
     std::string args(avail, '\0');
     stream.readsome(&args[0], avail);
 
-    plugin_devices.emplace_back(base, new mmio_plugin_device_t(name, args));
+    plugin_devices.emplace_back(base, std::make_shared<mmio_plugin_device_t>(name, args));
   };
 
   option_parser_t parser;
@@ -602,8 +602,5 @@ int main(int argc, char** argv)
   for (auto& mem : mems)
     delete mem.second;
 
-  for (auto& plugin_device : plugin_devices)
-    delete plugin_device.second;
-
   return return_code;
 }

From 1bd44c71a13f3d8d25de112fb5346589c03e332d Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Fri, 2 Jun 2023 12:10:38 -0700
Subject: [PATCH 093/110] sim_t: Merge sim_t::plugin_devices with
 sim_t::devices

---
 riscv/sim.cc | 5 +++--
 riscv/sim.h  | 1 -
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/riscv/sim.cc b/riscv/sim.cc
index 43c91f6106..877d5c2586 100644
--- a/riscv/sim.cc
+++ b/riscv/sim.cc
@@ -45,7 +45,6 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
     isa(cfg->isa(), cfg->priv()),
     cfg(cfg),
     mems(mems),
-    plugin_devices(plugin_devices),
     procs(std::max(cfg->nprocs(), size_t(1))),
     dtb_enabled(dtb_enabled),
     log_file(log_path),
@@ -66,8 +65,10 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
   for (auto& x : mems)
     bus.add_device(x.first, x.second);
 
-  for (auto& x : plugin_devices)
+  for (auto& x : plugin_devices) {
     bus.add_device(x.first, x.second.get());
+    devices.push_back(x.second);
+  }
 
   debug_module.add_device(&bus);
 
diff --git a/riscv/sim.h b/riscv/sim.h
index ba956661ec..6e6907891d 100644
--- a/riscv/sim.h
+++ b/riscv/sim.h
@@ -63,7 +63,6 @@ class sim_t : public htif_t, public simif_t
   isa_parser_t isa;
   const cfg_t * const cfg;
   std::vector<std::pair<reg_t, mem_t*>> mems;
-  std::vector<std::pair<reg_t, std::shared_ptr<abstract_device_t>>> plugin_devices;
   std::vector<processor_t*> procs;
   std::map<size_t, processor_t*> harts;
   std::pair<reg_t, reg_t> initrd_range;

From cd0bd1bda701f5004f9667d0c87a8b65f54d30e3 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Fri, 2 Jun 2023 16:39:45 -0700
Subject: [PATCH 094/110] sim_t: Make static consts public members

---
 riscv/sim.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/riscv/sim.h b/riscv/sim.h
index 6e6907891d..7689d54e74 100644
--- a/riscv/sim.h
+++ b/riscv/sim.h
@@ -59,6 +59,10 @@ class sim_t : public htif_t, public simif_t
   // Callback for processors to let the simulation know they were reset.
   virtual void proc_reset(unsigned id) override;
 
+  static const size_t INTERLEAVE = 5000;
+  static const size_t INSNS_PER_RTC_TICK = 100; // 10 MHz clock for 1 BIPS core
+  static const size_t CPU_HZ = 1000000000; // 1GHz CPU
+
 private:
   isa_parser_t isa;
   const cfg_t * const cfg;
@@ -82,9 +86,6 @@ class sim_t : public htif_t, public simif_t
 
   processor_t* get_core(const std::string& i);
   void step(size_t n); // step through simulation
-  static const size_t INTERLEAVE = 5000;
-  static const size_t INSNS_PER_RTC_TICK = 100; // 10 MHz clock for 1 BIPS core
-  static const size_t CPU_HZ = 1000000000; // 1GHz CPU
   size_t current_step;
   size_t current_proc;
   bool debug;

From 81218a2e0201e3ec3d7520fbe2f3466609bd613f Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Fri, 2 Jun 2023 20:56:00 -0700
Subject: [PATCH 095/110] ns16550_t: remove unused bus_t member

---
 riscv/devices.h  | 3 +--
 riscv/ns16550.cc | 4 ++--
 riscv/sim.cc     | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/riscv/devices.h b/riscv/devices.h
index a62509ae5b..fe19f74a73 100644
--- a/riscv/devices.h
+++ b/riscv/devices.h
@@ -127,14 +127,13 @@ class plic_t : public abstract_device_t, public abstract_interrupt_controller_t
 
 class ns16550_t : public abstract_device_t {
  public:
-  ns16550_t(class bus_t *bus, abstract_interrupt_controller_t *intctrl,
+  ns16550_t(abstract_interrupt_controller_t *intctrl,
             uint32_t interrupt_id, uint32_t reg_shift, uint32_t reg_io_width);
   bool load(reg_t addr, size_t len, uint8_t* bytes) override;
   bool store(reg_t addr, size_t len, const uint8_t* bytes) override;
   void tick(reg_t rtc_ticks) override;
   size_t size() { return NS16550_SIZE; }
  private:
-  class bus_t *bus;
   abstract_interrupt_controller_t *intctrl;
   uint32_t interrupt_id;
   uint32_t reg_shift;
diff --git a/riscv/ns16550.cc b/riscv/ns16550.cc
index d21983be70..475d5ec134 100644
--- a/riscv/ns16550.cc
+++ b/riscv/ns16550.cc
@@ -69,9 +69,9 @@
 
 #define UART_SCR                7 /* I/O: Scratch Register */
 
-ns16550_t::ns16550_t(class bus_t *bus, abstract_interrupt_controller_t *intctrl,
+ns16550_t::ns16550_t(abstract_interrupt_controller_t *intctrl,
                      uint32_t interrupt_id, uint32_t reg_shift, uint32_t reg_io_width)
-  : bus(bus), intctrl(intctrl), interrupt_id(interrupt_id), reg_shift(reg_shift), reg_io_width(reg_io_width), backoff_counter(0)
+  : intctrl(intctrl), interrupt_id(interrupt_id), reg_shift(reg_shift), reg_io_width(reg_io_width), backoff_counter(0)
 {
   ier = 0;
   iir = UART_IIR_NO_INT;
diff --git a/riscv/sim.cc b/riscv/sim.cc
index 877d5c2586..858ace3f96 100644
--- a/riscv/sim.cc
+++ b/riscv/sim.cc
@@ -146,7 +146,7 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
   if (fdt_parse_ns16550(fdt, &ns16550_base,
                         &ns16550_shift, &ns16550_io_width, "ns16550a") == 0) {
     assert(intctrl);
-    std::shared_ptr<ns16550_t> ns16550(new ns16550_t(&bus, intctrl, NS16550_INTERRUPT_ID,
+    std::shared_ptr<ns16550_t> ns16550(new ns16550_t(intctrl, NS16550_INTERRUPT_ID,
                                                      ns16550_shift, ns16550_io_width));
     bus.add_device(ns16550_base, ns16550.get());
     devices.push_back(ns16550);

From 5b39c69bfadd507f22e794baae3f4ef342303462 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Fri, 2 Jun 2023 21:26:52 -0700
Subject: [PATCH 096/110] devices: Pass const pointers to sim_t to clint/plic

---
 riscv/clint.cc  | 2 +-
 riscv/devices.h | 6 +++---
 riscv/plic.cc   | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/riscv/clint.cc b/riscv/clint.cc
index 485c997843..25e45fbb79 100644
--- a/riscv/clint.cc
+++ b/riscv/clint.cc
@@ -3,7 +3,7 @@
 #include "processor.h"
 #include "simif.h"
 
-clint_t::clint_t(simif_t* sim, uint64_t freq_hz, bool real_time)
+clint_t::clint_t(const simif_t* sim, uint64_t freq_hz, bool real_time)
   : sim(sim), freq_hz(freq_hz), real_time(real_time), mtime(0)
 {
   struct timeval base;
diff --git a/riscv/devices.h b/riscv/devices.h
index fe19f74a73..11cc3479cb 100644
--- a/riscv/devices.h
+++ b/riscv/devices.h
@@ -58,7 +58,7 @@ class mem_t : public abstract_device_t {
 
 class clint_t : public abstract_device_t {
  public:
-  clint_t(simif_t*, uint64_t freq_hz, bool real_time);
+  clint_t(const simif_t*, uint64_t freq_hz, bool real_time);
   bool load(reg_t addr, size_t len, uint8_t* bytes) override;
   bool store(reg_t addr, size_t len, const uint8_t* bytes) override;
   size_t size() { return CLINT_SIZE; }
@@ -69,7 +69,7 @@ class clint_t : public abstract_device_t {
   typedef uint64_t mtime_t;
   typedef uint64_t mtimecmp_t;
   typedef uint32_t msip_t;
-  simif_t* sim;
+  const simif_t* sim;
   uint64_t freq_hz;
   bool real_time;
   uint64_t real_time_ref_secs;
@@ -97,7 +97,7 @@ struct plic_context_t {
 
 class plic_t : public abstract_device_t, public abstract_interrupt_controller_t {
  public:
-  plic_t(simif_t*, uint32_t ndev);
+  plic_t(const simif_t*, uint32_t ndev);
   bool load(reg_t addr, size_t len, uint8_t* bytes) override;
   bool store(reg_t addr, size_t len, const uint8_t* bytes) override;
   void set_interrupt_level(uint32_t id, int lvl) override;
diff --git a/riscv/plic.cc b/riscv/plic.cc
index 37a5f53ba0..e2685a9c28 100644
--- a/riscv/plic.cc
+++ b/riscv/plic.cc
@@ -70,7 +70,7 @@
 
 #define REG_SIZE                0x1000000
 
-plic_t::plic_t(simif_t* sim, uint32_t ndev)
+plic_t::plic_t(const simif_t* sim, uint32_t ndev)
   : num_ids(ndev + 1), num_ids_word(((ndev + 1) + (32 - 1)) / 32),
   max_prio((1UL << PLIC_PRIO_BITS) - 1), priority{}, level{}
 {

From fa27eeb3b77cdeeefabdbf225cd17edad21a5e97 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Fri, 2 Jun 2023 21:30:15 -0700
Subject: [PATCH 097/110] dts: void* fdt arg to parse_fdt should be const

---
 riscv/dts.cc | 22 +++++++++++-----------
 riscv/dts.h  | 18 +++++++++---------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/riscv/dts.cc b/riscv/dts.cc
index 200288ef2c..bd1f4fe1f2 100644
--- a/riscv/dts.cc
+++ b/riscv/dts.cc
@@ -215,7 +215,7 @@ std::string dts_compile(const std::string& dts)
   return dtb.str();
 }
 
-static int fdt_get_node_addr_size(void *fdt, int node, reg_t *addr,
+static int fdt_get_node_addr_size(const void *fdt, int node, reg_t *addr,
                                   unsigned long *size, const char *field)
 {
   int parent, len, i;
@@ -259,7 +259,7 @@ static int fdt_get_node_addr_size(void *fdt, int node, reg_t *addr,
   return 0;
 }
 
-static int check_cpu_node(void *fdt, int cpu_offset)
+static int check_cpu_node(const void *fdt, int cpu_offset)
 {
   int len;
   const void *prop;
@@ -276,22 +276,22 @@ static int check_cpu_node(void *fdt, int cpu_offset)
   return 0;
 }
 
-int fdt_get_offset(void *fdt, const char *field)
+int fdt_get_offset(const void *fdt, const char *field)
 {
   return fdt_path_offset(fdt, field);
 }
 
-int fdt_get_first_subnode(void *fdt, int node)
+int fdt_get_first_subnode(const void *fdt, int node)
 {
   return fdt_first_subnode(fdt, node);
 }
 
-int fdt_get_next_subnode(void *fdt, int node)
+int fdt_get_next_subnode(const void *fdt, int node)
 {
   return fdt_next_subnode(fdt, node);
 }
 
-int fdt_parse_clint(void *fdt, reg_t *clint_addr,
+int fdt_parse_clint(const void *fdt, reg_t *clint_addr,
                     const char *compatible)
 {
   int nodeoffset, rc;
@@ -307,7 +307,7 @@ int fdt_parse_clint(void *fdt, reg_t *clint_addr,
   return 0;
 }
 
-int fdt_parse_plic(void *fdt, reg_t *plic_addr, uint32_t *ndev,
+int fdt_parse_plic(const void *fdt, reg_t *plic_addr, uint32_t *ndev,
                    const char *compatible)
 {
   int nodeoffset, len, rc;
@@ -329,7 +329,7 @@ int fdt_parse_plic(void *fdt, reg_t *plic_addr, uint32_t *ndev,
   return 0;
 }
 
-int fdt_parse_ns16550(void *fdt, reg_t *ns16550_addr,
+int fdt_parse_ns16550(const void *fdt, reg_t *ns16550_addr,
                       uint32_t *reg_shift, uint32_t *reg_io_width,
                       const char *compatible)
 {
@@ -365,7 +365,7 @@ int fdt_parse_ns16550(void *fdt, reg_t *ns16550_addr,
   return 0;
 }
 
-int fdt_parse_pmp_num(void *fdt, int cpu_offset, reg_t *pmp_num)
+int fdt_parse_pmp_num(const void *fdt, int cpu_offset, reg_t *pmp_num)
 {
   int rc;
 
@@ -380,7 +380,7 @@ int fdt_parse_pmp_num(void *fdt, int cpu_offset, reg_t *pmp_num)
   return 0;
 }
 
-int fdt_parse_pmp_alignment(void *fdt, int cpu_offset, reg_t *pmp_align)
+int fdt_parse_pmp_alignment(const void *fdt, int cpu_offset, reg_t *pmp_align)
 {
   int rc;
 
@@ -395,7 +395,7 @@ int fdt_parse_pmp_alignment(void *fdt, int cpu_offset, reg_t *pmp_align)
   return 0;
 }
 
-int fdt_parse_mmu_type(void *fdt, int cpu_offset, const char **mmu_type)
+int fdt_parse_mmu_type(const void *fdt, int cpu_offset, const char **mmu_type)
 {
   assert(mmu_type);
 
diff --git a/riscv/dts.h b/riscv/dts.h
index 7a64d7bc63..2b7404e457 100644
--- a/riscv/dts.h
+++ b/riscv/dts.h
@@ -16,18 +16,18 @@ std::string make_dts(size_t insns_per_rtc_tick, size_t cpu_hz,
 
 std::string dts_compile(const std::string& dts);
 
-int fdt_get_offset(void *fdt, const char *field);
-int fdt_get_first_subnode(void *fdt, int node);
-int fdt_get_next_subnode(void *fdt, int node);
+int fdt_get_offset(const void *fdt, const char *field);
+int fdt_get_first_subnode(const void *fdt, int node);
+int fdt_get_next_subnode(const void *fdt, int node);
 
-int fdt_parse_clint(void *fdt, reg_t *clint_addr,
+int fdt_parse_clint(const void *fdt, reg_t *clint_addr,
                     const char *compatible);
-int fdt_parse_plic(void *fdt, reg_t *plic_addr, uint32_t *ndev,
+int fdt_parse_plic(const void *fdt, reg_t *plic_addr, uint32_t *ndev,
                    const char *compatible);
-int fdt_parse_ns16550(void *fdt, reg_t *ns16550_addr,
+int fdt_parse_ns16550(const void *fdt, reg_t *ns16550_addr,
                       uint32_t *reg_shift, uint32_t *reg_io_width,
                       const char *compatible);
-int fdt_parse_pmp_num(void *fdt, int cpu_offset, reg_t *pmp_num);
-int fdt_parse_pmp_alignment(void *fdt, int cpu_offset, reg_t *pmp_align);
-int fdt_parse_mmu_type(void *fdt, int cpu_offset, const char **mmu_type);
+int fdt_parse_pmp_num(const void *fdt, int cpu_offset, reg_t *pmp_num);
+int fdt_parse_pmp_alignment(const void *fdt, int cpu_offset, reg_t *pmp_align);
+int fdt_parse_mmu_type(const void *fdt, int cpu_offset, const char **mmu_type);
 #endif

From b9275b7ce23937b6ba0345eb30ebb9c82dbd3c3f Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Sun, 4 Jun 2023 21:37:04 -0700
Subject: [PATCH 098/110] Inline make_dtb into sim_t constructor

make_dtb is only called here; inlining it simplifies later work
towards refactoring device DTS node generation.
---
 riscv/sim.cc | 64 ++++++++++++++++++++++++----------------------------
 riscv/sim.h  |  1 -
 2 files changed, 29 insertions(+), 36 deletions(-)

diff --git a/riscv/sim.cc b/riscv/sim.cc
index 858ace3f96..10e86be2c7 100644
--- a/riscv/sim.cc
+++ b/riscv/sim.cc
@@ -109,7 +109,35 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
   if (!dtb_enabled) return;
 
   // Load dtb_file if provided, otherwise self-generate a dts/dtb
-  make_dtb(dtb_file);
+  if (dtb_file) {
+    std::ifstream fin(dtb_file, std::ios::binary);
+    if (!fin.good()) {
+      std::cerr << "can't find dtb file: " << dtb_file << std::endl;
+      exit(-1);
+    }
+    std::stringstream strstream;
+    strstream << fin.rdbuf();
+
+    dtb = strstream.str();
+  } else {
+    std::pair<reg_t, reg_t> initrd_bounds = cfg->initrd_bounds();
+    dts = make_dts(INSNS_PER_RTC_TICK, CPU_HZ,
+                   initrd_bounds.first, initrd_bounds.second,
+                   cfg->bootargs(), cfg->pmpregions, procs, mems);
+    dtb = dts_compile(dts);
+  }
+
+  int fdt_code = fdt_check_header(dtb.c_str());
+  if (fdt_code) {
+    std::cerr << "Failed to read DTB from ";
+    if (!dtb_file) {
+      std::cerr << "auto-generated DTS string";
+    } else {
+      std::cerr << "`" << dtb_file << "'";
+    }
+    std::cerr << ": " << fdt_strerror(fdt_code) << ".\n";
+    exit(-1);
+  }
 
   void *fdt = (void *)dtb.c_str();
 
@@ -303,40 +331,6 @@ bool sim_t::mmio_store(reg_t paddr, size_t len, const uint8_t* bytes)
   return bus.store(paddr, len, bytes);
 }
 
-void sim_t::make_dtb(const char* dtb_file)
-{
-  if (dtb_file) {
-    std::ifstream fin(dtb_file, std::ios::binary);
-    if (!fin.good()) {
-      std::cerr << "can't find dtb file: " << dtb_file << std::endl;
-      exit(-1);
-    }
-
-    std::stringstream strstream;
-    strstream << fin.rdbuf();
-
-    dtb = strstream.str();
-  } else {
-    std::pair<reg_t, reg_t> initrd_bounds = cfg->initrd_bounds();
-    dts = make_dts(INSNS_PER_RTC_TICK, CPU_HZ,
-                   initrd_bounds.first, initrd_bounds.second,
-                   cfg->bootargs(), cfg->pmpregions, procs, mems);
-    dtb = dts_compile(dts);
-  }
-
-  int fdt_code = fdt_check_header(dtb.c_str());
-  if (fdt_code) {
-    std::cerr << "Failed to read DTB from ";
-    if (!dtb_file) {
-      std::cerr << "auto-generated DTS string";
-    } else {
-      std::cerr << "`" << dtb_file << "'";
-    }
-    std::cerr << ": " << fdt_strerror(fdt_code) << ".\n";
-    exit(-1);
-  }
-}
-
 void sim_t::set_rom()
 {
   const int reset_vec_size = 8;
diff --git a/riscv/sim.h b/riscv/sim.h
index 7689d54e74..7f08ec191b 100644
--- a/riscv/sim.h
+++ b/riscv/sim.h
@@ -98,7 +98,6 @@ class sim_t : public htif_t, public simif_t
   virtual char* addr_to_mem(reg_t paddr) override;
   virtual bool mmio_load(reg_t paddr, size_t len, uint8_t* bytes) override;
   virtual bool mmio_store(reg_t paddr, size_t len, const uint8_t* bytes) override;
-  void make_dtb(const char* dtb_file);
   void set_rom();
 
   virtual const char* get_symbol(uint64_t paddr) override;

From b2ab751ce317929a179fdad06ad12bdcc2be42a7 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Mon, 5 Jun 2023 10:25:57 -0700
Subject: [PATCH 099/110] sim_t: Add sim_t::get_intctrl

---
 riscv/sim.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/riscv/sim.h b/riscv/sim.h
index 7f08ec191b..1cb0658088 100644
--- a/riscv/sim.h
+++ b/riscv/sim.h
@@ -52,6 +52,7 @@ class sim_t : public htif_t, public simif_t
   }
   const char* get_dts() { return dts.c_str(); }
   processor_t* get_core(size_t i) { return procs.at(i); }
+  abstract_interrupt_controller_t* get_intctrl() const { assert(plic.get()); return plic.get(); }
   virtual const cfg_t &get_cfg() const override { return *cfg; }
 
   virtual const std::map<size_t, processor_t*>& get_harts() const override { return harts; }

From 0beed2cc22b20093b5ee0fdab343c525503b8a16 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Mon, 5 Jun 2023 10:28:20 -0700
Subject: [PATCH 100/110] device_t: Add device_factory_t

This class should implement conditional FDT-based device instantiation,
as well as the addition of device nodes to the DTS.
---
 riscv/abstract_device.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/riscv/abstract_device.h b/riscv/abstract_device.h
index f4ccebe230..7239b667af 100644
--- a/riscv/abstract_device.h
+++ b/riscv/abstract_device.h
@@ -5,6 +5,9 @@
 #include "common.h"
 #include <cstdint>
 #include <cstddef>
+#include <string>
+
+class sim_t;
 
 class abstract_device_t {
  public:
@@ -14,4 +17,20 @@ class abstract_device_t {
   virtual void tick(reg_t UNUSED rtc_ticks) {}
 };
 
+// factory for devices which should show up in the DTS, and can be
+// parameterized by parsing the DTS
+class device_factory_t {
+public:
+  virtual abstract_device_t* parse_from_fdt(const void* fdt, const sim_t* sim, reg_t* base) = 0;
+  virtual std::string generate_dts(const sim_t* sim) = 0;
+  virtual ~device_factory_t() {}
+};
+
+#define REGISTER_DEVICE(name, parse, generate) \
+  class name##_factory_t : public device_factory_t { \
+  public: \
+  name##_t* parse_from_fdt(const void* fdt, const sim_t* sim, reg_t* base) override { return parse(fdt, sim, base); } \
+  std::string generate_dts(const sim_t* sim) override { return generate(sim); } \
+  }; device_factory_t *name##_factory = new name##_factory_t();
+
 #endif

From e5a61098733b34f8906e26827f94afdc9df277df Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Mon, 5 Jun 2023 10:31:44 -0700
Subject: [PATCH 101/110] device_t: Add device_factory_t's for
 ns16550/clint/plic

---
 riscv/clint.cc   | 31 +++++++++++++++++++++++++++++++
 riscv/ns16550.cc | 37 +++++++++++++++++++++++++++++++++++++
 riscv/plic.cc    | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 105 insertions(+)

diff --git a/riscv/clint.cc b/riscv/clint.cc
index 25e45fbb79..908ccb606a 100644
--- a/riscv/clint.cc
+++ b/riscv/clint.cc
@@ -1,7 +1,10 @@
 #include <sys/time.h>
+#include <sstream>
 #include "devices.h"
 #include "processor.h"
 #include "simif.h"
+#include "sim.h"
+#include "dts.h"
 
 clint_t::clint_t(const simif_t* sim, uint64_t freq_hz, bool real_time)
   : sim(sim), freq_hz(freq_hz), real_time(real_time), mtime(0)
@@ -112,3 +115,31 @@ void clint_t::tick(reg_t rtc_ticks)
     hart->state.mip->backdoor_write_with_mask(MIP_MTIP, mtime >= mtimecmp[hart_id] ? MIP_MTIP : 0);
   }
 }
+
+clint_t* clint_parse_from_fdt(const void* fdt, const sim_t* sim, reg_t* base) {
+  if (fdt_parse_clint(fdt, base, "riscv,clint0") == 0)
+    return new clint_t(sim,
+                       sim->CPU_HZ / sim->INSNS_PER_RTC_TICK,
+                       sim->get_cfg().real_time_clint());
+  else
+    return nullptr;
+}
+
+std::string clint_generate_dts(const sim_t* sim) {
+  std::stringstream s;
+  s << std::hex
+    << "    clint@" << CLINT_BASE << " {\n"
+       "      compatible = \"riscv,clint0\";\n"
+       "      interrupts-extended = <" << std::dec;
+  for (size_t i = 0; i < sim->get_cfg().nprocs(); i++)
+    s << "&CPU" << i << "_intc 3 &CPU" << i << "_intc 7 ";
+  reg_t clintbs = CLINT_BASE;
+  reg_t clintsz = CLINT_SIZE;
+  s << std::hex << ">;\n"
+    "      reg = <0x" << (clintbs >> 32) << " 0x" << (clintbs & (uint32_t)-1) <<
+    " 0x" << (clintsz >> 32) << " 0x" << (clintsz & (uint32_t)-1) << ">;\n"
+    "    };\n";
+  return s.str();
+}
+
+REGISTER_DEVICE(clint, clint_parse_from_fdt, clint_generate_dts)
diff --git a/riscv/ns16550.cc b/riscv/ns16550.cc
index 475d5ec134..a4bd204fde 100644
--- a/riscv/ns16550.cc
+++ b/riscv/ns16550.cc
@@ -1,7 +1,10 @@
 #include <sys/time.h>
+#include <sstream>
 #include "devices.h"
 #include "processor.h"
 #include "term.h"
+#include "sim.h"
+#include "dts.h"
 
 #define UART_QUEUE_SIZE         64
 
@@ -317,3 +320,37 @@ void ns16550_t::tick(reg_t UNUSED rtc_ticks)
   lsr |= UART_LSR_DR;
   update_interrupt();
 }
+
+std::string ns16550_generate_dts(const sim_t* sim)
+{
+  std::stringstream s;
+  s << std::hex
+    << "    SERIAL0: ns16550@" << NS16550_BASE << " {\n"
+       "      compatible = \"ns16550a\";\n"
+       "      clock-frequency = <" << std::dec << (sim->CPU_HZ/sim->INSNS_PER_RTC_TICK) << ">;\n"
+       "      interrupt-parent = <&PLIC>;\n"
+       "      interrupts = <" << std::dec << NS16550_INTERRUPT_ID;
+  reg_t ns16550bs = NS16550_BASE;
+  reg_t ns16550sz = NS16550_SIZE;
+  s << std::hex << ">;\n"
+       "      reg = <0x" << (ns16550bs >> 32) << " 0x" << (ns16550bs & (uint32_t)-1) <<
+                   " 0x" << (ns16550sz >> 32) << " 0x" << (ns16550sz & (uint32_t)-1) << ">;\n"
+       "      reg-shift = <0x" << NS16550_REG_SHIFT << ">;\n"
+       "      reg-io-width = <0x" << NS16550_REG_IO_WIDTH << ">;\n"
+       "    };\n";
+  return s.str();
+}
+
+ns16550_t* ns16550_parse_from_fdt(const void* fdt, const sim_t* sim, reg_t* base)
+{
+  uint32_t ns16550_shift, ns16550_io_width;
+  if (fdt_parse_ns16550(fdt, base,
+                        &ns16550_shift, &ns16550_io_width, "ns16550a") == 0) {
+    abstract_interrupt_controller_t* intctrl = sim->get_intctrl();
+    return new ns16550_t(intctrl, NS16550_INTERRUPT_ID, ns16550_shift, ns16550_io_width);
+  } else {
+    return nullptr;
+  }
+}
+
+REGISTER_DEVICE(ns16550, ns16550_parse_from_fdt, ns16550_generate_dts)
diff --git a/riscv/plic.cc b/riscv/plic.cc
index e2685a9c28..1aa5852cb1 100644
--- a/riscv/plic.cc
+++ b/riscv/plic.cc
@@ -1,7 +1,10 @@
 #include <sys/time.h>
+#include <sstream>
 #include "devices.h"
 #include "processor.h"
 #include "simif.h"
+#include "sim.h"
+#include "dts.h"
 
 #define PLIC_MAX_CONTEXTS 15872
 
@@ -388,3 +391,37 @@ bool plic_t::store(reg_t addr, size_t len, const uint8_t* bytes)
 
   return ret;
 }
+
+std::string plic_generate_dts(const sim_t* sim)
+{
+  std::stringstream s;
+  s << std::hex
+    << "    PLIC: plic@" << PLIC_BASE << " {\n"
+       "      compatible = \"riscv,plic0\";\n"
+       "      #address-cells = <2>;\n"
+       "      interrupts-extended = <" << std::dec;
+  for (size_t i = 0; i < sim->get_cfg().nprocs(); i++)
+    s << "&CPU" << i << "_intc 11 &CPU" << i << "_intc 9 ";
+  reg_t plicbs = PLIC_BASE;
+  reg_t plicsz = PLIC_SIZE;
+  s << std::hex << ">;\n"
+      "      reg = <0x" << (plicbs >> 32) << " 0x" << (plicbs & (uint32_t)-1) <<
+      " 0x" << (plicsz >> 32) << " 0x" << (plicsz & (uint32_t)-1) << ">;\n"
+      "      riscv,ndev = <0x" << PLIC_NDEV << ">;\n"
+      "      riscv,max-priority = <0x" << ((1U << PLIC_PRIO_BITS) - 1) << ">;\n"
+      "      #interrupt-cells = <1>;\n"
+      "      interrupt-controller;\n"
+      "    };\n";
+  return s.str();
+}
+
+plic_t* plic_parse_from_fdt(const void* fdt, const sim_t* sim, reg_t* base)
+{
+  uint32_t plic_ndev;
+  if (fdt_parse_plic(fdt, base, &plic_ndev, "riscv,plic0") == 0)
+    return new plic_t(sim, plic_ndev);
+  else
+    return nullptr;
+}
+
+REGISTER_DEVICE(plic, plic_parse_from_fdt, plic_generate_dts)

From 6ae3783b08b86303836babcc7a8b473cf37b7c64 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Mon, 5 Jun 2023 10:36:54 -0700
Subject: [PATCH 102/110] sim_t: Move dts device node construction/parsing to
 device_factories

---
 riscv/dts.cc | 46 +++--------------------------
 riscv/dts.h  |  3 +-
 riscv/sim.cc | 82 +++++++++++++++++++++++++---------------------------
 3 files changed, 46 insertions(+), 85 deletions(-)

diff --git a/riscv/dts.cc b/riscv/dts.cc
index bd1f4fe1f2..4b8de9ff92 100644
--- a/riscv/dts.cc
+++ b/riscv/dts.cc
@@ -17,7 +17,8 @@ std::string make_dts(size_t insns_per_rtc_tick, size_t cpu_hz,
                      const char* bootargs,
                      size_t pmpregions,
                      std::vector<processor_t*> procs,
-                     std::vector<std::pair<reg_t, mem_t*>> mems)
+                     std::vector<std::pair<reg_t, mem_t*>> mems,
+                     std::string device_nodes)
 {
   std::stringstream s;
   s << std::dec <<
@@ -85,47 +86,8 @@ std::string make_dts(size_t insns_per_rtc_tick, size_t cpu_hz,
          "    #size-cells = <2>;\n"
          "    compatible = \"ucbbar,spike-bare-soc\", \"simple-bus\";\n"
          "    ranges;\n"
-         "    clint@" << CLINT_BASE << " {\n"
-         "      compatible = \"riscv,clint0\";\n"
-         "      interrupts-extended = <" << std::dec;
-  for (size_t i = 0; i < procs.size(); i++)
-    s << "&CPU" << i << "_intc 3 &CPU" << i << "_intc 7 ";
-  reg_t clintbs = CLINT_BASE;
-  reg_t clintsz = CLINT_SIZE;
-  s << std::hex << ">;\n"
-         "      reg = <0x" << (clintbs >> 32) << " 0x" << (clintbs & (uint32_t)-1) <<
-                     " 0x" << (clintsz >> 32) << " 0x" << (clintsz & (uint32_t)-1) << ">;\n"
-         "    };\n"
-         "    PLIC: plic@" << PLIC_BASE << " {\n"
-         "      compatible = \"riscv,plic0\";\n"
-         "      #address-cells = <2>;\n"
-         "      interrupts-extended = <" << std::dec;
-  for (size_t i = 0; i < procs.size(); i++)
-    s << "&CPU" << i << "_intc 11 &CPU" << i << "_intc 9 ";
-  reg_t plicbs = PLIC_BASE;
-  reg_t plicsz = PLIC_SIZE;
-  s << std::hex << ">;\n"
-         "      reg = <0x" << (plicbs >> 32) << " 0x" << (plicbs & (uint32_t)-1) <<
-                     " 0x" << (plicsz >> 32) << " 0x" << (plicsz & (uint32_t)-1) << ">;\n"
-         "      riscv,ndev = <0x" << PLIC_NDEV << ">;\n"
-         "      riscv,max-priority = <0x" << ((1U << PLIC_PRIO_BITS) - 1) << ">;\n"
-         "      #interrupt-cells = <1>;\n"
-         "      interrupt-controller;\n"
-         "    };\n"
-         "    SERIAL0: ns16550@" << NS16550_BASE << " {\n"
-         "      compatible = \"ns16550a\";\n"
-         "      clock-frequency = <" << std::dec << (cpu_hz/insns_per_rtc_tick) << ">;\n"
-         "      interrupt-parent = <&PLIC>;\n"
-         "      interrupts = <" << std::dec << NS16550_INTERRUPT_ID;
-  reg_t ns16550bs = NS16550_BASE;
-  reg_t ns16550sz = NS16550_SIZE;
-  s << std::hex << ">;\n"
-         "      reg = <0x" << (ns16550bs >> 32) << " 0x" << (ns16550bs & (uint32_t)-1) <<
-                     " 0x" << (ns16550sz >> 32) << " 0x" << (ns16550sz & (uint32_t)-1) << ">;\n"
-         "      reg-shift = <0x" << NS16550_REG_SHIFT << ">;\n"
-         "      reg-io-width = <0x" << NS16550_REG_IO_WIDTH << ">;\n"
-         "    };\n"
-         "  };\n"
+    << device_nodes
+    <<   "  };\n"
          "  htif {\n"
          "    compatible = \"ucb,htif0\";\n"
          "  };\n"
diff --git a/riscv/dts.h b/riscv/dts.h
index 2b7404e457..d3655025f4 100644
--- a/riscv/dts.h
+++ b/riscv/dts.h
@@ -12,7 +12,8 @@ std::string make_dts(size_t insns_per_rtc_tick, size_t cpu_hz,
                      const char* bootargs,
                      size_t pmpregions,
                      std::vector<processor_t*> procs,
-                     std::vector<std::pair<reg_t, mem_t*>> mems);
+                     std::vector<std::pair<reg_t, mem_t*>> mems,
+                     std::string device_nodes);
 
 std::string dts_compile(const std::string& dts);
 
diff --git a/riscv/sim.cc b/riscv/sim.cc
index 10e86be2c7..3283e5c2f9 100644
--- a/riscv/sim.cc
+++ b/riscv/sim.cc
@@ -32,6 +32,10 @@ static void handle_signal(int sig)
 
 const size_t sim_t::INTERLEAVE;
 
+extern device_factory_t* clint_factory;
+extern device_factory_t* plic_factory;
+extern device_factory_t* ns16550_factory;
+
 sim_t::sim_t(const cfg_t *cfg, bool halted,
              std::vector<std::pair<reg_t, mem_t*>> mems,
              std::vector<std::pair<reg_t, std::shared_ptr<abstract_device_t>>> plugin_devices,
@@ -90,9 +94,9 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
 #ifndef RISCV_ENABLE_DUAL_ENDIAN
   if (cfg->endianness != endianness_little) {
     fputs("Big-endian support has not been prroperly enabled; "
-	  "please rebuild the riscv-isa-sim project using "
-	  "\"configure --enable-dual-endian\".\n",
-	  stderr);
+          "please rebuild the riscv-isa-sim project using "
+          "\"configure --enable-dual-endian\".\n",
+          stderr);
     abort();
   }
 #endif
@@ -108,6 +112,19 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
   // When running without using a dtb, skip the fdt-based configuration steps
   if (!dtb_enabled) return;
 
+  // Only make a CLINT (Core-Local INTerrupt controller) and PLIC (Platform-
+  // Level-Interrupt-Controller) if they are specified in the device tree
+  // configuration.
+  //
+  // This isn't *quite* as general as we could get (because you might have one
+  // that's not bus-accessible), but it should handle the normal use cases. In
+  // particular, the default device tree configuration that you get without
+  // setting the dtb_file argument has one.
+  std::vector<device_factory_t*> device_factories = {
+    clint_factory, // clint must be element 0
+    plic_factory, // plic must be element 1
+    ns16550_factory};
+
   // Load dtb_file if provided, otherwise self-generate a dts/dtb
   if (dtb_file) {
     std::ifstream fin(dtb_file, std::ios::binary);
@@ -117,13 +134,16 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
     }
     std::stringstream strstream;
     strstream << fin.rdbuf();
-
     dtb = strstream.str();
   } else {
     std::pair<reg_t, reg_t> initrd_bounds = cfg->initrd_bounds();
+    std::string device_nodes;
+    for (device_factory_t *factory : device_factories)
+      device_nodes.append(factory->generate_dts(this));
     dts = make_dts(INSNS_PER_RTC_TICK, CPU_HZ,
                    initrd_bounds.first, initrd_bounds.second,
-                   cfg->bootargs(), cfg->pmpregions, procs, mems);
+                   cfg->bootargs(), cfg->pmpregions, procs, mems,
+                   device_nodes);
     dtb = dts_compile(dts);
   }
 
@@ -141,43 +161,21 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
 
   void *fdt = (void *)dtb.c_str();
 
-  // Only make a CLINT (Core-Local INTerrupt controller) if one is specified in
-  // the device tree configuration.
-  //
-  // This isn't *quite* as general as we could get (because you might have one
-  // that's not bus-accessible), but it should handle the normal use cases. In
-  // particular, the default device tree configuration that you get without
-  // setting the dtb_file argument has one.
-  reg_t clint_base;
-  if (fdt_parse_clint(fdt, &clint_base, "riscv,clint0") == 0) {
-    clint.reset(new clint_t(this, CPU_HZ / INSNS_PER_RTC_TICK, cfg->real_time_clint()));
-    bus.add_device(clint_base, clint.get());
-    devices.push_back(clint);
-  }
-
-  // pointer to wired interrupt controller
-  abstract_interrupt_controller_t *intctrl = NULL;
-
-  // create plic
-  reg_t plic_base;
-  uint32_t plic_ndev;
-  if (fdt_parse_plic(fdt, &plic_base, &plic_ndev, "riscv,plic0") == 0) {
-    plic.reset(new plic_t(this, plic_ndev));
-    bus.add_device(plic_base, plic.get());
-    devices.push_back(plic);
-    intctrl = plic.get();
-  }
-
-  // create ns16550
-  reg_t ns16550_base;
-  uint32_t ns16550_shift, ns16550_io_width;
-  if (fdt_parse_ns16550(fdt, &ns16550_base,
-                        &ns16550_shift, &ns16550_io_width, "ns16550a") == 0) {
-    assert(intctrl);
-    std::shared_ptr<ns16550_t> ns16550(new ns16550_t(intctrl, NS16550_INTERRUPT_ID,
-                                                     ns16550_shift, ns16550_io_width));
-    bus.add_device(ns16550_base, ns16550.get());
-    devices.push_back(ns16550);
+  for (size_t i = 0; i < device_factories.size(); i++) {
+    device_factory_t *factory = device_factories[i];
+    reg_t device_base = 0;
+    abstract_device_t* device = factory->parse_from_fdt(fdt, this, &device_base);
+    if (device) {
+      assert(device_base);
+      bus.add_device(device_base, device);
+      std::shared_ptr<abstract_device_t> dev_ptr(device);
+      devices.push_back(dev_ptr);
+
+      if (i == 0) // clint_factory
+        clint = std::static_pointer_cast<clint_t>(dev_ptr);
+      else if (i == 1) // plic_factory
+        plic = std::static_pointer_cast<plic_t>(dev_ptr);
+    }
   }
 
   //per core attribute

From 3ab4107b81e7b6c42bf60010d1fa598e9058a7c9 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Mon, 5 Jun 2023 16:32:01 -0700
Subject: [PATCH 103/110] device_t: device_factories should be const

---
 riscv/abstract_device.h | 10 +++++-----
 riscv/sim.cc            |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/riscv/abstract_device.h b/riscv/abstract_device.h
index 7239b667af..90e2b24cbc 100644
--- a/riscv/abstract_device.h
+++ b/riscv/abstract_device.h
@@ -21,16 +21,16 @@ class abstract_device_t {
 // parameterized by parsing the DTS
 class device_factory_t {
 public:
-  virtual abstract_device_t* parse_from_fdt(const void* fdt, const sim_t* sim, reg_t* base) = 0;
-  virtual std::string generate_dts(const sim_t* sim) = 0;
+  virtual abstract_device_t* parse_from_fdt(const void* fdt, const sim_t* sim, reg_t* base) const = 0;
+  virtual std::string generate_dts(const sim_t* sim) const = 0;
   virtual ~device_factory_t() {}
 };
 
 #define REGISTER_DEVICE(name, parse, generate) \
   class name##_factory_t : public device_factory_t { \
   public: \
-  name##_t* parse_from_fdt(const void* fdt, const sim_t* sim, reg_t* base) override { return parse(fdt, sim, base); } \
-  std::string generate_dts(const sim_t* sim) override { return generate(sim); } \
-  }; device_factory_t *name##_factory = new name##_factory_t();
+  name##_t* parse_from_fdt(const void* fdt, const sim_t* sim, reg_t* base) const override { return parse(fdt, sim, base); } \
+  std::string generate_dts(const sim_t* sim) const override { return generate(sim); } \
+  }; const device_factory_t *name##_factory = new name##_factory_t();
 
 #endif
diff --git a/riscv/sim.cc b/riscv/sim.cc
index 3283e5c2f9..50dc4f68bd 100644
--- a/riscv/sim.cc
+++ b/riscv/sim.cc
@@ -120,7 +120,7 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
   // that's not bus-accessible), but it should handle the normal use cases. In
   // particular, the default device tree configuration that you get without
   // setting the dtb_file argument has one.
-  std::vector<device_factory_t*> device_factories = {
+  std::vector<const device_factory_t*> device_factories = {
     clint_factory, // clint must be element 0
     plic_factory, // plic must be element 1
     ns16550_factory};
@@ -138,7 +138,7 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
   } else {
     std::pair<reg_t, reg_t> initrd_bounds = cfg->initrd_bounds();
     std::string device_nodes;
-    for (device_factory_t *factory : device_factories)
+    for (const device_factory_t *factory : device_factories)
       device_nodes.append(factory->generate_dts(this));
     dts = make_dts(INSNS_PER_RTC_TICK, CPU_HZ,
                    initrd_bounds.first, initrd_bounds.second,
@@ -162,7 +162,7 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
   void *fdt = (void *)dtb.c_str();
 
   for (size_t i = 0; i < device_factories.size(); i++) {
-    device_factory_t *factory = device_factories[i];
+    const device_factory_t *factory = device_factories[i];
     reg_t device_base = 0;
     abstract_device_t* device = factory->parse_from_fdt(fdt, this, &device_base);
     if (device) {

From 16be75973aae3331bf97ba9452797e72909312c1 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Tue, 6 Jun 2023 10:04:50 -0700
Subject: [PATCH 104/110] libfdt: Install libfdt and libfdt.h

---
 fdt/fdt.mk.in | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fdt/fdt.mk.in b/fdt/fdt.mk.in
index 8c8dbe53c7..32c6d49295 100644
--- a/fdt/fdt.mk.in
+++ b/fdt/fdt.mk.in
@@ -1,5 +1,10 @@
 fdt_subproject_deps = \
 
+fdt_install_shared_lib = yes
+
+fdt_install_hdrs = \
+	libfdt.h \
+
 fdt_c_srcs = \
 	fdt.c \
 	fdt_ro.c \

From bb2754c2017f1062071ab820af36d3852cb9859d Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Tue, 6 Jun 2023 11:33:36 -0700
Subject: [PATCH 105/110] dts_t: Add dts.h to list of installed headers

---
 riscv/dts.h       | 1 -
 riscv/riscv.mk.in | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/riscv/dts.h b/riscv/dts.h
index d3655025f4..b6bb5b2792 100644
--- a/riscv/dts.h
+++ b/riscv/dts.h
@@ -4,7 +4,6 @@
 
 #include "devices.h"
 #include "processor.h"
-#include "mmu.h"
 #include <string>
 
 std::string make_dts(size_t insns_per_rtc_tick, size_t cpu_hz,
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index a3e125f5d8..d82df45e12 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -28,6 +28,7 @@ riscv_install_hdrs = \
 	decode.h \
 	devices.h \
 	disasm.h \
+	dts.h \
 	encoding.h \
 	entropy_source.h \
 	extension.h \

From 37e50ad49914725f6ba0364255e2ae809c60f0f2 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Tue, 6 Jun 2023 11:42:13 -0700
Subject: [PATCH 106/110] dts: Expose fdt_get_node_addr_size function in header

---
 riscv/dts.cc | 4 ++--
 riscv/dts.h  | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/riscv/dts.cc b/riscv/dts.cc
index 4b8de9ff92..8c1ceb46a3 100644
--- a/riscv/dts.cc
+++ b/riscv/dts.cc
@@ -177,8 +177,8 @@ std::string dts_compile(const std::string& dts)
   return dtb.str();
 }
 
-static int fdt_get_node_addr_size(const void *fdt, int node, reg_t *addr,
-                                  unsigned long *size, const char *field)
+int fdt_get_node_addr_size(const void *fdt, int node, reg_t *addr,
+                           unsigned long *size, const char *field)
 {
   int parent, len, i;
   int cell_addr, cell_size;
diff --git a/riscv/dts.h b/riscv/dts.h
index b6bb5b2792..10d3cc10bf 100644
--- a/riscv/dts.h
+++ b/riscv/dts.h
@@ -16,6 +16,8 @@ std::string make_dts(size_t insns_per_rtc_tick, size_t cpu_hz,
 
 std::string dts_compile(const std::string& dts);
 
+int fdt_get_node_addr_size(const void *fdt, int node, reg_t *addr,
+                           unsigned long *size, const char *field);
 int fdt_get_offset(const void *fdt, const char *field);
 int fdt_get_first_subnode(const void *fdt, int node);
 int fdt_get_next_subnode(const void *fdt, int node);

From 701029d28b0e73f98a36869ef9317c49f0dc2949 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Tue, 6 Jun 2023 11:21:21 -0700
Subject: [PATCH 107/110] ns16550_t: ns16550 should parse interrupt id from the
 fdt

---
 riscv/dts.cc     | 10 ++++++++++
 riscv/dts.h      |  2 +-
 riscv/ns16550.cc |  7 ++++---
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/riscv/dts.cc b/riscv/dts.cc
index 8c1ceb46a3..cc65e3ce5e 100644
--- a/riscv/dts.cc
+++ b/riscv/dts.cc
@@ -293,6 +293,7 @@ int fdt_parse_plic(const void *fdt, reg_t *plic_addr, uint32_t *ndev,
 
 int fdt_parse_ns16550(const void *fdt, reg_t *ns16550_addr,
                       uint32_t *reg_shift, uint32_t *reg_io_width,
+                      uint32_t* reg_int_id,
                       const char *compatible)
 {
   int nodeoffset, len, rc;
@@ -324,6 +325,15 @@ int fdt_parse_ns16550(const void *fdt, reg_t *ns16550_addr,
     }
   }
 
+  reg_p = (fdt32_t *)fdt_getprop(fdt, nodeoffset, "interrupts", &len);
+  if (reg_int_id) {
+    if (reg_p) {
+      *reg_int_id = fdt32_to_cpu(*reg_p);
+    } else {
+      *reg_int_id = NS16550_INTERRUPT_ID;
+    }
+  }
+
   return 0;
 }
 
diff --git a/riscv/dts.h b/riscv/dts.h
index 10d3cc10bf..7ec1ceb692 100644
--- a/riscv/dts.h
+++ b/riscv/dts.h
@@ -27,7 +27,7 @@ int fdt_parse_clint(const void *fdt, reg_t *clint_addr,
 int fdt_parse_plic(const void *fdt, reg_t *plic_addr, uint32_t *ndev,
                    const char *compatible);
 int fdt_parse_ns16550(const void *fdt, reg_t *ns16550_addr,
-                      uint32_t *reg_shift, uint32_t *reg_io_width,
+                      uint32_t *reg_shift, uint32_t *reg_io_width, uint32_t* reg_int_id,
                       const char *compatible);
 int fdt_parse_pmp_num(const void *fdt, int cpu_offset, reg_t *pmp_num);
 int fdt_parse_pmp_alignment(const void *fdt, int cpu_offset, reg_t *pmp_align);
diff --git a/riscv/ns16550.cc b/riscv/ns16550.cc
index a4bd204fde..dabe3a9b09 100644
--- a/riscv/ns16550.cc
+++ b/riscv/ns16550.cc
@@ -343,11 +343,12 @@ std::string ns16550_generate_dts(const sim_t* sim)
 
 ns16550_t* ns16550_parse_from_fdt(const void* fdt, const sim_t* sim, reg_t* base)
 {
-  uint32_t ns16550_shift, ns16550_io_width;
+  uint32_t ns16550_shift, ns16550_io_width, ns16550_int_id;
   if (fdt_parse_ns16550(fdt, base,
-                        &ns16550_shift, &ns16550_io_width, "ns16550a") == 0) {
+                        &ns16550_shift, &ns16550_io_width, &ns16550_int_id,
+                        "ns16550a") == 0) {
     abstract_interrupt_controller_t* intctrl = sim->get_intctrl();
-    return new ns16550_t(intctrl, NS16550_INTERRUPT_ID, ns16550_shift, ns16550_io_width);
+    return new ns16550_t(intctrl, ns16550_int_id, ns16550_shift, ns16550_io_width);
   } else {
     return nullptr;
   }

From 186c619fb38f02d0b18514a2f8399cd8248e1dcc Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Tue, 6 Jun 2023 11:53:02 -0700
Subject: [PATCH 108/110] devices: Switch plugin device interface to use
 device_factory_t

Plugins should now implement and register a device_factory_t, which
defines how the device is parsed from an FDT and, optionally, supplies
a default DTS string for it.

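This drops support for command-line flag-based device configuration:
--device=<P,B,A> becomes --device=<name>, and the base address and
plugin arguments are no longer passed on the command line.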
---
 ci-tests/testlib.c      |  2 +-
 riscv/abstract_device.h | 11 +++++
 riscv/devices.cc        | 46 +++------------------
 riscv/devices.h         | 14 -------
 riscv/mmio_plugin.h     | 91 -----------------------------------------
 riscv/riscv.mk.in       |  1 -
 riscv/sim.cc            | 10 ++---
 riscv/sim.h             |  2 +-
 spike_main/spike.cc     | 58 ++++----------------------
 9 files changed, 32 insertions(+), 203 deletions(-)
 delete mode 100644 riscv/mmio_plugin.h

diff --git a/ci-tests/testlib.c b/ci-tests/testlib.c
index 33eaede2e5..6342f9d06d 100644
--- a/ci-tests/testlib.c
+++ b/ci-tests/testlib.c
@@ -28,7 +28,7 @@ int main()
             hartids,
             false,
             4);
-  std::vector<std::pair<reg_t, std::shared_ptr<abstract_device_t>>> plugin_devices;
+  std::vector<const device_factory_t*> plugin_devices;
   std::vector<std::string> htif_args {"pk", "hello"};
   debug_module_config_t dm_config = {
     .progbufsize = 2,
diff --git a/riscv/abstract_device.h b/riscv/abstract_device.h
index 90e2b24cbc..c5c64157ea 100644
--- a/riscv/abstract_device.h
+++ b/riscv/abstract_device.h
@@ -6,6 +6,8 @@
 #include <cstdint>
 #include <cstddef>
 #include <string>
+#include <map>
+#include <stdexcept>
 
 class sim_t;
 
@@ -26,9 +28,18 @@ class device_factory_t {
   virtual ~device_factory_t() {}
 };
 
+// Type for holding all registered MMIO plugins by name.
+using mmio_device_map_t = std::map<std::string, const device_factory_t*>;
+
+mmio_device_map_t& mmio_device_map();
+
 #define REGISTER_DEVICE(name, parse, generate) \
   class name##_factory_t : public device_factory_t { \
   public: \
+  name##_factory_t() { \
+    std::string str(#name); \
+    if (!mmio_device_map().emplace(str, this).second) throw std::runtime_error("Plugin \"" + str + "\" already registered"); \
+  }; \
   name##_t* parse_from_fdt(const void* fdt, const sim_t* sim, reg_t* base) const override { return parse(fdt, sim, base); } \
   std::string generate_dts(const sim_t* sim) const override { return generate(sim); } \
   }; const device_factory_t *name##_factory = new name##_factory_t();
diff --git a/riscv/devices.cc b/riscv/devices.cc
index 81b232d120..2c06f78feb 100644
--- a/riscv/devices.cc
+++ b/riscv/devices.cc
@@ -2,6 +2,12 @@
 #include "mmu.h"
 #include <stdexcept>
 
+mmio_device_map_t& mmio_device_map()
+{
+  static mmio_device_map_t device_map;
+  return device_map;
+}
+
 void bus_t::add_device(reg_t addr, abstract_device_t* dev)
 {
   // Searching devices via lower_bound/upper_bound
@@ -51,46 +57,6 @@ std::pair<reg_t, abstract_device_t*> bus_t::find_device(reg_t addr)
   return std::make_pair(it->first, it->second);
 }
 
-// Type for holding all registered MMIO plugins by name.
-using mmio_plugin_map_t = std::map<std::string, mmio_plugin_t>;
-
-// Simple singleton instance of an mmio_plugin_map_t.
-static mmio_plugin_map_t& mmio_plugin_map()
-{
-  static mmio_plugin_map_t instance;
-  return instance;
-}
-
-void register_mmio_plugin(const char* name_cstr,
-                          const mmio_plugin_t* mmio_plugin)
-{
-  std::string name(name_cstr);
-  if (!mmio_plugin_map().emplace(name, *mmio_plugin).second) {
-    throw std::runtime_error("Plugin \"" + name + "\" already registered!");
-  }
-}
-
-mmio_plugin_device_t::mmio_plugin_device_t(const std::string& name,
-                                           const std::string& args)
-  : plugin(mmio_plugin_map().at(name)), user_data((*plugin.alloc)(args.c_str()))
-{
-}
-
-mmio_plugin_device_t::~mmio_plugin_device_t()
-{
-  (*plugin.dealloc)(user_data);
-}
-
-bool mmio_plugin_device_t::load(reg_t addr, size_t len, uint8_t* bytes)
-{
-  return (*plugin.load)(user_data, addr, len, bytes);
-}
-
-bool mmio_plugin_device_t::store(reg_t addr, size_t len, const uint8_t* bytes)
-{
-  return (*plugin.store)(user_data, addr, len, bytes);
-}
-
 mem_t::mem_t(reg_t size)
   : sz(size)
 {
diff --git a/riscv/devices.h b/riscv/devices.h
index 11cc3479cb..b752a21ecb 100644
--- a/riscv/devices.h
+++ b/riscv/devices.h
@@ -2,7 +2,6 @@
 #define _RISCV_DEVICES_H
 
 #include "decode.h"
-#include "mmio_plugin.h"
 #include "abstract_device.h"
 #include "abstract_interrupt_controller.h"
 #include "platform.h"
@@ -157,19 +156,6 @@ class ns16550_t : public abstract_device_t {
   static const int MAX_BACKOFF = 16;
 };
 
-class mmio_plugin_device_t : public abstract_device_t {
- public:
-  mmio_plugin_device_t(const std::string& name, const std::string& args);
-  virtual ~mmio_plugin_device_t() override;
-
-  virtual bool load(reg_t addr, size_t len, uint8_t* bytes) override;
-  virtual bool store(reg_t addr, size_t len, const uint8_t* bytes) override;
-
- private:
-  mmio_plugin_t plugin;
-  void* user_data;
-};
-
 template<typename T>
 void write_little_endian_reg(T* word, reg_t addr, size_t len, const uint8_t* bytes)
 {
diff --git a/riscv/mmio_plugin.h b/riscv/mmio_plugin.h
deleted file mode 100644
index f14470bf38..0000000000
--- a/riscv/mmio_plugin.h
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef _RISCV_MMIO_PLUGIN_H
-#define _RISCV_MMIO_PLUGIN_H
-
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-typedef uint64_t reg_t;
-
-typedef struct {
-  // Allocate user data for an instance of the plugin. The parameter is a simple
-  // c-string containing arguments used to construct the plugin. It returns a
-  // void* to the allocated data.
-  void* (*alloc)(const char*);
-
-  // Load a memory address of the MMIO plugin. The parameters are the user_data
-  // (void*), memory offset (reg_t), number of bytes to load (size_t), and the
-  // buffer into which the loaded data should be written (uint8_t*). Return true
-  // if the load is successful and false otherwise.
-  bool (*load)(void*, reg_t, size_t, uint8_t*);
-
-  // Store some bytes to a memory address of the MMIO plugin. The parameters are
-  // the user_data (void*), memory offset (reg_t), number of bytes to store
-  // (size_t), and the buffer containing the data to be stored (const uint8_t*).
-  // Return true if the store is successful and false otherwise.
-  bool (*store)(void*, reg_t, size_t, const uint8_t*);
-
-  // Deallocate the data allocated during the call to alloc. The parameter is a
-  // pointer to the user data allocated during the call to alloc.
-  void (*dealloc)(void*);
-} mmio_plugin_t;
-
-// Register an mmio plugin with the application. This should be called by
-// plugins as part of their loading process.
-extern void register_mmio_plugin(const char* name_cstr,
-                                 const mmio_plugin_t* mmio_plugin);
-
-#ifdef __cplusplus
-}
-
-#include <string>
-
-// Wrapper around the C plugin API that makes registering a C++ class with
-// correctly formed constructor, load, and store functions easier. The template
-// type should be the type that implements the MMIO plugin interface. Simply
-// make a global mmio_plugin_registration_t and your plugin should register
-// itself with the application when it is loaded because the
-// mmio_plugin_registration_t constructor will be called.
-template <typename T>
-struct mmio_plugin_registration_t
-{
-  static void* alloc(const char* args)
-  {
-    return reinterpret_cast<void*>(new T(std::string(args)));
-  }
-
-  static bool load(void* self, reg_t addr, size_t len, uint8_t* bytes)
-  {
-    return reinterpret_cast<T*>(self)->load(addr, len, bytes);
-  }
-
-  static bool store(void* self, reg_t addr, size_t len, const uint8_t* bytes)
-  {
-    return reinterpret_cast<T*>(self)->store(addr, len, bytes);
-  }
-
-  static void dealloc(void* self)
-  {
-    delete reinterpret_cast<T*>(self);
-  }
-
-  mmio_plugin_registration_t(const std::string& name)
-  {
-    mmio_plugin_t plugin = {
-      mmio_plugin_registration_t<T>::alloc,
-      mmio_plugin_registration_t<T>::load,
-      mmio_plugin_registration_t<T>::store,
-      mmio_plugin_registration_t<T>::dealloc,
-    };
-
-    register_mmio_plugin(name.c_str(), &plugin);
-  }
-};
-#endif // __cplusplus
-
-#endif
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index d82df45e12..1ad8b23b2e 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -35,7 +35,6 @@ riscv_install_hdrs = \
 	isa_parser.h \
 	log_file.h \
 	memtracer.h \
-	mmio_plugin.h \
 	mmu.h \
 	platform.h \
 	processor.h \
diff --git a/riscv/sim.cc b/riscv/sim.cc
index 50dc4f68bd..0c5a7fb299 100644
--- a/riscv/sim.cc
+++ b/riscv/sim.cc
@@ -38,7 +38,7 @@ extern device_factory_t* ns16550_factory;
 
 sim_t::sim_t(const cfg_t *cfg, bool halted,
              std::vector<std::pair<reg_t, mem_t*>> mems,
-             std::vector<std::pair<reg_t, std::shared_ptr<abstract_device_t>>> plugin_devices,
+             std::vector<const device_factory_t*> plugin_device_factories,
              const std::vector<std::string>& args,
              const debug_module_config_t &dm_config,
              const char *log_path,
@@ -69,11 +69,6 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
   for (auto& x : mems)
     bus.add_device(x.first, x.second);
 
-  for (auto& x : plugin_devices) {
-    bus.add_device(x.first, x.second.get());
-    devices.push_back(x.second);
-  }
-
   debug_module.add_device(&bus);
 
   socketif = NULL;
@@ -124,6 +119,9 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
     clint_factory, // clint must be element 0
     plic_factory, // plic must be element 1
     ns16550_factory};
+  device_factories.insert(device_factories.end(),
+                          plugin_device_factories.begin(),
+                          plugin_device_factories.end());
 
   // Load dtb_file if provided, otherwise self-generate a dts/dtb
   if (dtb_file) {
diff --git a/riscv/sim.h b/riscv/sim.h
index 1cb0658088..a3445db257 100644
--- a/riscv/sim.h
+++ b/riscv/sim.h
@@ -27,7 +27,7 @@ class sim_t : public htif_t, public simif_t
 public:
   sim_t(const cfg_t *cfg, bool halted,
         std::vector<std::pair<reg_t, mem_t*>> mems,
-        std::vector<std::pair<reg_t, std::shared_ptr<abstract_device_t>>> plugin_devices,
+        std::vector<const device_factory_t*> plugin_device_factories,
         const std::vector<std::string>& args,
         const debug_module_config_t &dm_config, const char *log_path,
         bool dtb_enabled, const char *dtb_file,
diff --git a/spike_main/spike.cc b/spike_main/spike.cc
index f257582ffb..4766f6dad0 100644
--- a/spike_main/spike.cc
+++ b/spike_main/spike.cc
@@ -50,12 +50,7 @@ static void help(int exit_code = 1)
   fprintf(stderr, "  --l2=<S>:<W>:<B>        B both powers of 2).\n");
   fprintf(stderr, "  --big-endian          Use a big-endian memory system.\n");
   fprintf(stderr, "  --misaligned          Support misaligned memory accesses\n");
-  fprintf(stderr, "  --device=<P,B,A>      Attach MMIO plugin device from an --extlib library\n");
-  fprintf(stderr, "                          P -- Name of the MMIO plugin\n");
-  fprintf(stderr, "                          B -- Base memory address of the device\n");
-  fprintf(stderr, "                          A -- String arguments to pass to the plugin\n");
-  fprintf(stderr, "                          This flag can be used multiple times.\n");
-  fprintf(stderr, "                          The extlib flag for the library must come first.\n");
+  fprintf(stderr, "  --device=<name>       Attach MMIO plugin device from an --extlib library\n");
   fprintf(stderr, "  --log-cache-miss      Generate a log of cache miss\n");
   fprintf(stderr, "  --log-commits         Generate a log of commits info\n");
   fprintf(stderr, "  --extension=<name>    Specify RoCC Extension\n");
@@ -336,7 +331,7 @@ int main(int argc, char** argv)
   bool dtb_enabled = true;
   const char* kernel = NULL;
   reg_t kernel_offset, kernel_size;
-  std::vector<std::pair<reg_t, std::shared_ptr<abstract_device_t>>> plugin_devices;
+  std::vector<const device_factory_t*> plugin_device_factories;
   std::unique_ptr<icache_sim_t> ic;
   std::unique_ptr<dcache_sim_t> dc;
   std::unique_ptr<cache_sim_t> l2;
@@ -376,47 +371,12 @@ int main(int argc, char** argv)
             /*default_real_time_clint=*/false,
             /*default_trigger_count=*/4);
 
-  auto const device_parser = [&plugin_devices](const char *s) {
-    const std::string str(s);
-    std::istringstream stream(str);
-
-    // We are parsing a string like name,base,args.
-
-    // Parse the name, which is simply all of the characters leading up to the
-    // first comma. The validity of the plugin name will be checked later.
-    std::string name;
-    std::getline(stream, name, ',');
-    if (name.empty()) {
-      throw std::runtime_error("Plugin name is empty.");
-    }
-
-    // Parse the base address. First, get all of the characters up to the next
-    // comma (or up to the end of the string if there is no comma). Then try to
-    // parse that string as an integer according to the rules of strtoull. It
-    // could be in decimal, hex, or octal. Fail if we were able to parse a
-    // number but there were garbage characters after the valid number. We must
-    // consume the entire string between the commas.
-    std::string base_str;
-    std::getline(stream, base_str, ',');
-    if (base_str.empty()) {
-      throw std::runtime_error("Device base address is empty.");
-    }
-    char* end;
-    reg_t base = static_cast<reg_t>(strtoull(base_str.c_str(), &end, 0));
-    if (end != &*base_str.cend()) {
-      throw std::runtime_error("Error parsing device base address.");
-    }
-
-    // The remainder of the string is the arguments. We could use getline, but
-    // that could ignore newline characters in the arguments. That should be
-    // rare and discouraged, but handle it here anyway with this weird in_avail
-    // technique. The arguments are optional, so if there were no arguments
-    // specified we could end up with an empty string here. That's okay.
-    auto avail = stream.rdbuf()->in_avail();
-    std::string args(avail, '\0');
-    stream.readsome(&args[0], avail);
-
-    plugin_devices.emplace_back(base, std::make_shared<mmio_plugin_device_t>(name, args));
+  auto const device_parser = [&plugin_device_factories](const char *s) {
+    const std::string name(s);
+    if (name.empty()) throw std::runtime_error("Plugin name is empty.");
+    auto it = mmio_device_map().find(name);
+    if (it == mmio_device_map().end()) throw std::runtime_error("Plugin \"" + name + "\" not found in loaded extlibs.");
+    plugin_device_factories.push_back(it->second);
   };
 
   option_parser_t parser;
@@ -564,7 +524,7 @@ int main(int argc, char** argv)
   }
 
   sim_t s(&cfg, halted,
-      mems, plugin_devices, htif_args, dm_config, log_path, dtb_enabled, dtb_file,
+      mems, plugin_device_factories, htif_args, dm_config, log_path, dtb_enabled, dtb_file,
       socket,
       cmd_file);
   std::unique_ptr<remote_bitbang_t> remote_bitbang((remote_bitbang_t *) NULL);

From b87c6e64d0dba9edf7dc070f9e4a29016641c3a1 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Wed, 7 Jun 2023 10:33:38 -0700
Subject: [PATCH 109/110] debug: Remove debug_module_t::add_device, it's
 redundant

---
 riscv/debug_module.cc | 4 ----
 riscv/debug_module.h  | 2 --
 riscv/sim.cc          | 2 +-
 3 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/riscv/debug_module.cc b/riscv/debug_module.cc
index 27dbe66ecb..0f75c5e3df 100644
--- a/riscv/debug_module.cc
+++ b/riscv/debug_module.cc
@@ -118,10 +118,6 @@ void debug_module_t::reset()
   challenge = random();
 }
 
-void debug_module_t::add_device(bus_t *bus) {
-  bus->add_device(DEBUG_START, this);
-}
-
 bool debug_module_t::load(reg_t addr, size_t len, uint8_t* bytes)
 {
   addr = DEBUG_START + addr;
diff --git a/riscv/debug_module.h b/riscv/debug_module.h
index 0a62d77585..518f119df6 100644
--- a/riscv/debug_module.h
+++ b/riscv/debug_module.h
@@ -113,8 +113,6 @@ class debug_module_t : public abstract_device_t
     debug_module_t(simif_t *sim, const debug_module_config_t &config);
     ~debug_module_t();
 
-    void add_device(bus_t *bus);
-
     bool load(reg_t addr, size_t len, uint8_t* bytes);
     bool store(reg_t addr, size_t len, const uint8_t* bytes);
 
diff --git a/riscv/sim.cc b/riscv/sim.cc
index 0c5a7fb299..0779b954e4 100644
--- a/riscv/sim.cc
+++ b/riscv/sim.cc
@@ -69,7 +69,7 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
   for (auto& x : mems)
     bus.add_device(x.first, x.second);
 
-  debug_module.add_device(&bus);
+  bus.add_device(DEBUG_START, &debug_module);
 
   socketif = NULL;
 #ifdef HAVE_BOOST_ASIO

From 7ac808ee1b9c04c73899de89f550744d5963e18b Mon Sep 17 00:00:00 2001
From: "demin.han" <demin.han@starfivetech.com>
Date: Wed, 21 Jun 2023 11:05:41 +0800
Subject: [PATCH 110/110] Remove duplicate compile options

---
 Makefile.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.in b/Makefile.in
index 01d7baca41..b5013bde79 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -98,7 +98,7 @@ default-CXXFLAGS := $(default-CFLAGS) -std=c++17
 
 mcppbs-CPPFLAGS := @CPPFLAGS@
 mcppbs-CFLAGS   := $(default-CFLAGS) @CFLAGS@
-mcppbs-CXXFLAGS := $(mcppbs-CFLAGS) $(default-CXXFLAGS) @CXXFLAGS@
+mcppbs-CXXFLAGS := $(default-CXXFLAGS) @CXXFLAGS@
 
 CC            := @CC@
 CXX           := @CXX@