[PMP] add support for NA4 and NAPOT modes (#566)
stnolting authored Apr 1, 2023
2 parents a3eb585 + a8e54c8 commit 3214f30
Showing 11 changed files with 354 additions and 391 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -31,6 +31,7 @@ mimpid = 0x01040312 => Version 01.04.03.12 => v1.4.3.12

| Date (*dd.mm.yyyy*) | Version | Comment |
|:-------------------:|:-------:|:--------|
| 01.04.2023 | 1.8.3.1 | :sparkles: add full `NA4` and `NAPOT` support to the (now) RISC-V-compatible **physical memory protection (PMP)**; [#566](https://github.com/stnolting/neorv32/pull/566) |
| 31.03.2023 | [**:rocket:1.8.3**](https://github.com/stnolting/neorv32/releases/tag/v1.8.3) | **New release** |
| 29.03.2023 | 1.8.2.9 | :warning: remove `CPU_EXTENSION_RISCV_Zicsr` generic - `Zicsr` ISA extension is always enabled; optimize bus switch; VHDL code cleanups; [#562](https://github.com/stnolting/neorv32/pull/562) |
| 25.03.2023 | 1.8.2.8 | :test_tube: add configurable data cache (**dCACHE**); [#560](https://github.com/stnolting/neorv32/pull/560) |
13 changes: 4 additions & 9 deletions docs/datasheet/cpu.adoc
@@ -35,11 +35,6 @@ instruction exception (-> <<_full_virtualization>>).

**Incompatibility Issues and Limitations**

.Physical Memory Protection (PMP)
[WARNING]
The RISC-V-compatible NEORV32 <<_machine_physical_memory_protection_csrs>> only implements the **TOR**
(top of region) mode and only up to 16 PMP regions.

.No Hardware Support of Misaligned Memory Accesses
[IMPORTANT]
The CPU does not support resolving unaligned memory access by the hardware (this is not a
@@ -561,14 +556,14 @@ to the RISC-V Privileged Architecture Specifications. In general, the PMP can **
which by default has none, and can **revoke permissions from M-mode**, which by default has full permissions.
The PMP is configured via the <<_machine_physical_memory_protection_csrs>>.
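For reference, PMP entries are programmed by machine-mode software via the `pmpcfg*` and `pmpaddr*` CSRs. The following minimal C sketch configures a single NAPOT region; the bit-field encodings follow the RISC-V Privileged Architecture Specification, while the helper itself (its name, and the example base address) is purely illustrative and not part of the NEORV32 software framework:

    #include <stdint.h>

    #define PMP_R     (1u << 0) /* read permission */
    #define PMP_W     (1u << 1) /* write permission */
    #define PMP_X     (1u << 2) /* execute permission */
    #define PMP_NAPOT (3u << 3) /* A field: naturally-aligned power-of-two region */
    #define PMP_L     (1u << 7) /* lock: rule is also enforced for M-mode */

    /* illustrative helper (not a NEORV32 library function): configure PMP
     * entry 0 as a NAPOT region; base must be aligned to size, size must
     * be a power of two >= 8 bytes */
    static void pmp_cfg_region0(uint32_t base, uint32_t size, uint32_t perm) {
      uint32_t pmpaddr = (base >> 2) | ((size - 1) >> 3); /* trailing 1s encode size */
      __asm__ volatile ("csrw pmpaddr0, %0" : : "r" (pmpaddr));
      __asm__ volatile ("csrw pmpcfg0, %0"  : : "r" (PMP_NAPOT | perm));
    }

    /* example (hypothetical base address): grant read/write to a 4 KiB block */
    /* pmp_cfg_region0(0x80000000u, 4096u, PMP_R | PMP_W); */

For a 4 KiB region, `(size - 1) >> 3` yields nine trailing one bits in `pmpaddr`, which the hardware decodes as a 2^12-byte region (see the mask logic in `neorv32_cpu_bus.vhd` below).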

[IMPORTANT]
The NEORV32 PMP only supports **TOR** (top of region) mode, which basically is a "base-and-bound" concept, and only
up to 16 PMP regions.

.PMP Rules when in Debug Mode
[NOTE]
When in debug mode, all PMP rules are ignored, giving the debugger maximum access rights.

[IMPORTANT]
Instruction fetches are still issued even when they are denied by a PMP rule. However, the fetched instruction(s)
will not be executed and will not change CPU core state, so memory access protection is preserved.


==== `Sdext` ISA Extension

169 changes: 83 additions & 86 deletions docs/datasheet/cpu_csr.adoc

Large diffs are not rendered by default.

204 changes: 104 additions & 100 deletions rtl/core/neorv32_cpu_bus.vhd
@@ -92,8 +92,9 @@ architecture neorv32_cpu_bus_rtl of neorv32_cpu_bus is
constant pmp_cfg_ah_c : natural := 4; -- mode bit high
constant pmp_cfg_l_c : natural := 7; -- locked entry

-- PMP minimal granularity --
constant pmp_lsb_c : natural := index_size_f(PMP_MIN_GRANULARITY); -- min = 2
-- PMP helpers --
constant pmp_lsb_c : natural := index_size_f(PMP_MIN_GRANULARITY); -- min = 2
constant pmp_zero_c : std_ulogic_vector(XLEN-1 downto pmp_lsb_c) := (others => '0');

-- misc --
signal data_sign : std_ulogic; -- signed load
@@ -110,21 +111,28 @@ architecture neorv32_cpu_bus_rtl of neorv32_cpu_bus is
signal arbiter : bus_arbiter_t;

-- physical memory protection --
type pmp_mask_t is array (0 to PMP_NUM_REGIONS-1) of std_ulogic_vector(XLEN-1 downto pmp_lsb_c);
type pmp_t is record
i_cmp_mm : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
i_cmp_ge : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
i_cmp_lt : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
d_cmp_mm : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
d_cmp_ge : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
d_cmp_lt : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
i_match : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
d_match : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
perm_ex : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
perm_rd : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
perm_wr : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
fail_ex : std_ulogic_vector(PMP_NUM_REGIONS downto 0);
fail_rd : std_ulogic_vector(PMP_NUM_REGIONS downto 0);
fail_wr : std_ulogic_vector(PMP_NUM_REGIONS downto 0);
if_fault : std_ulogic;
ld_fault : std_ulogic;
st_fault : std_ulogic;
end record;
signal pmp : pmp_t;
signal pmp_mask : pmp_mask_t;
signal pmp : pmp_t;

-- pmp faults --
signal if_pmp_fault : std_ulogic; -- pmp instruction access fault
@@ -380,8 +388,7 @@ begin
arbiter.pmp_r_err <= '0';
arbiter.pmp_w_err <= '0';
elsif rising_edge(clk_i) then
arbiter.pmp_r_err <= ld_pmp_fault;
arbiter.pmp_w_err <= st_pmp_fault;
-- arbiter --
if (arbiter.pend = '0') then -- idle
if (ctrl_i.bus_req = '1') then -- start bus access
arbiter.pend <= '1';
@@ -399,6 +406,11 @@
arbiter.pend <= '0';
end if;
end if;
-- PMP error --
if (ctrl_i.bus_mo_we = '1') then -- sample PMP errors only once
arbiter.pmp_r_err <= ld_pmp_fault;
arbiter.pmp_w_err <= st_pmp_fault;
end if;
end if;
end process data_access_arbiter;

@@ -421,111 +433,103 @@ begin
-- RISC-V Physical Memory Protection (PMP) ------------------------------------------------
-- -------------------------------------------------------------------------------------------

-- check address --
pmp_check_address: process(fetch_pc_i, addr_i, pmp_addr_i)
begin
for r in 0 to PMP_NUM_REGIONS-1 loop
if (r = 0) then -- first entry: use ZERO as base and current entry as bound
pmp.i_cmp_ge(r) <= '1'; -- address is always greater than or equal to zero
pmp.i_cmp_lt(r) <= '0'; -- unused
pmp.d_cmp_ge(r) <= '1'; -- address is always greater than or equal to zero
pmp.d_cmp_lt(r) <= '0'; -- unused
else -- use previous entry as base and current entry as bound
pmp.i_cmp_ge(r) <= bool_to_ulogic_f(unsigned(fetch_pc_i(XLEN-1 downto pmp_lsb_c)) >= unsigned(pmp_addr_i(r-1)(XLEN-1 downto pmp_lsb_c)));
pmp.i_cmp_lt(r) <= bool_to_ulogic_f(unsigned(fetch_pc_i(XLEN-1 downto pmp_lsb_c)) < unsigned(pmp_addr_i(r-0)(XLEN-1 downto pmp_lsb_c)));
pmp.d_cmp_ge(r) <= bool_to_ulogic_f(unsigned( addr_i(XLEN-1 downto pmp_lsb_c)) >= unsigned(pmp_addr_i(r-1)(XLEN-1 downto pmp_lsb_c)));
pmp.d_cmp_lt(r) <= bool_to_ulogic_f(unsigned( addr_i(XLEN-1 downto pmp_lsb_c)) < unsigned(pmp_addr_i(r-0)(XLEN-1 downto pmp_lsb_c)));
end if;
end loop; -- r
end process pmp_check_address;


-- check mode --
pmp_check_mode: process(pmp_ctrl_i, pmp)
begin
for r in 0 to PMP_NUM_REGIONS-1 loop
if (pmp_ctrl_i(r)(pmp_cfg_ah_c downto pmp_cfg_al_c) = pmp_mode_tor_c) then -- TOR mode
if (r < (PMP_NUM_REGIONS-1)) then
-- this saves a LOT of comparators --
pmp.i_match(r) <= pmp.i_cmp_ge(r) and (not pmp.i_cmp_ge(r+1));
pmp.d_match(r) <= pmp.d_cmp_ge(r) and (not pmp.d_cmp_ge(r+1));
else -- very last entry
pmp.i_match(r) <= pmp.i_cmp_ge(r) and pmp.i_cmp_lt(r);
pmp.d_match(r) <= pmp.d_cmp_ge(r) and pmp.d_cmp_lt(r);
-- compute address masks for NAPOT modes (iterative!) --
pmp_masking_gen:
for r in 0 to PMP_NUM_REGIONS-1 generate
pmp_masking: process(rstn_i, clk_i)
begin
if (rstn_i = '0') then
pmp_mask(r) <= (others => '0');
elsif rising_edge(clk_i) then -- address mask computation has a latency of max 32 cycles
if (pmp_ctrl_i(r)(pmp_cfg_al_c) = '1') then -- NAPOT (or TOR, but that's irrelevant here)
pmp_mask(r)(pmp_lsb_c) <= '0';
for i in pmp_lsb_c+1 to XLEN-1 loop
pmp_mask(r)(i) <= pmp_mask(r)(i-1) or (not pmp_addr_i(r)(i-1)); -- skip address byte offset
end loop; -- i
else -- NA4
pmp_mask(r) <= (others => '1');
end if;
else -- entry disabled
pmp.i_match(r) <= '0';
pmp.d_match(r) <= '0';
end if;
end loop; -- r
end process pmp_check_mode;
end process pmp_masking;
end generate;
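
The mask register ripples one bit per clock cycle toward the MSB, so the value settles after at most 32 cycles (as the comment above notes). The settled value can be modeled in software; the following C sketch is an illustrative model only (not part of the design), working in pmpaddr bit space:

    #include <stdint.h>

    /* C model of the settled pmp_mask value: mask bits stay 0 across the
     * trailing ones of pmpaddr (the region offset) and become 1 from the
     * first 0 bit upwards (the region number) */
    uint32_t napot_mask(uint32_t pmpaddr) {
      uint32_t mask = 0; /* lowest bit is always 0 (cf. pmp_lsb_c) */
      for (int i = 1; i < 32; i++) {
        uint32_t below = ((mask >> (i - 1)) | (~(pmpaddr >> (i - 1)))) & 1u;
        mask |= below << i;
      }
      return mask;
    }

    /* example: a pmpaddr with 9 trailing ones (e.g. 0x200001FF) returns
     * 0xFFFFFC00 -> 10 masked word-address bits + 2 byte-offset bits
     * = 2^12 = 4 KiB region; the NA4/NAPOT match below then reduces to
     * ((addr >> 2) & mask) == (pmpaddr & mask) */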


-- check permission --
pmp_check_permission: process(ctrl_i, pmp_ctrl_i)
begin
for r in 0 to PMP_NUM_REGIONS-1 loop
-- check address --
pmp_check_address:
for r in 0 to PMP_NUM_REGIONS-1 generate
-- NA4 and NAPOT --
pmp.i_cmp_mm(r) <= '1' when ((fetch_pc_i(XLEN-1 downto pmp_lsb_c) and pmp_mask(r)) = (pmp_addr_i(r)(XLEN-1 downto pmp_lsb_c) and pmp_mask(r))) else '0';
pmp.d_cmp_mm(r) <= '1' when (( addr_i(XLEN-1 downto pmp_lsb_c) and pmp_mask(r)) = (pmp_addr_i(r)(XLEN-1 downto pmp_lsb_c) and pmp_mask(r))) else '0';
-- TOR region 0 --
pmp_check_address_r0:
if (r = 0) generate -- first entry: use ZERO as base and current entry as bound
pmp.i_cmp_ge(r) <= '1'; -- address is always greater than or equal to zero
pmp.i_cmp_lt(r) <= '0'; -- unused
pmp.d_cmp_ge(r) <= '1'; -- address is always greater than or equal to zero
pmp.d_cmp_lt(r) <= '0'; -- unused
end generate;
-- TOR region any --
pmp_check_address_rany:
if (r > 0) generate -- use previous entry as base and current entry as bound
pmp.i_cmp_ge(r) <= '1' when (unsigned(fetch_pc_i(XLEN-1 downto pmp_lsb_c)) >= unsigned(pmp_addr_i(r-1)(XLEN-1 downto pmp_lsb_c))) else '0';
pmp.i_cmp_lt(r) <= '1' when (unsigned(fetch_pc_i(XLEN-1 downto pmp_lsb_c)) < unsigned(pmp_addr_i(r )(XLEN-1 downto pmp_lsb_c))) else '0';
pmp.d_cmp_ge(r) <= '1' when (unsigned( addr_i(XLEN-1 downto pmp_lsb_c)) >= unsigned(pmp_addr_i(r-1)(XLEN-1 downto pmp_lsb_c))) else '0';
pmp.d_cmp_lt(r) <= '1' when (unsigned( addr_i(XLEN-1 downto pmp_lsb_c)) < unsigned(pmp_addr_i(r )(XLEN-1 downto pmp_lsb_c))) else '0';
end generate;
end generate;

-- instruction fetch access --
if (ctrl_i.cpu_priv = priv_mode_m_c) then -- M mode: always allow if lock bit not set, otherwise check permission
pmp.perm_ex(r) <= (not pmp_ctrl_i(r)(pmp_cfg_l_c)) or pmp_ctrl_i(r)(pmp_cfg_x_c);
else -- U mode: always check permission
pmp.perm_ex(r) <= pmp_ctrl_i(r)(pmp_cfg_x_c);
end if;

-- load/store accesses from M mode (can also use U mode's permissions if MSTATUS.MPRV is set) --
if (ctrl_i.bus_priv = priv_mode_m_c) then -- M mode: always allow if lock bit not set, otherwise check permission
pmp.perm_rd(r) <= (not pmp_ctrl_i(r)(pmp_cfg_l_c)) or pmp_ctrl_i(r)(pmp_cfg_r_c);
pmp.perm_wr(r) <= (not pmp_ctrl_i(r)(pmp_cfg_l_c)) or pmp_ctrl_i(r)(pmp_cfg_w_c);
else -- U mode: always check permission
pmp.perm_rd(r) <= pmp_ctrl_i(r)(pmp_cfg_r_c);
pmp.perm_wr(r) <= pmp_ctrl_i(r)(pmp_cfg_w_c);
end if;
-- check mode --
pmp_check_mode_gen:
for r in 0 to PMP_NUM_REGIONS-1 generate
pmp_check_mode: process(pmp_ctrl_i, pmp)
begin
case pmp_ctrl_i(r)(pmp_cfg_ah_c downto pmp_cfg_al_c) is
when pmp_mode_off_c => -- entry disabled
pmp.i_match(r) <= '0';
pmp.d_match(r) <= '0';
when pmp_mode_tor_c => -- top of region
if (r = (PMP_NUM_REGIONS-1)) then -- very last entry
pmp.i_match(r) <= pmp.i_cmp_ge(r) and pmp.i_cmp_lt(r);
pmp.d_match(r) <= pmp.d_cmp_ge(r) and pmp.d_cmp_lt(r);
else -- this saves a LOT of comparators
pmp.i_match(r) <= pmp.i_cmp_ge(r) and (not pmp.i_cmp_ge(r+1));
pmp.d_match(r) <= pmp.d_cmp_ge(r) and (not pmp.d_cmp_ge(r+1));
end if;
when others => -- naturally-aligned region
pmp.i_match(r) <= pmp.i_cmp_mm(r);
pmp.d_match(r) <= pmp.d_cmp_mm(r);
end case;
end process pmp_check_mode;
end generate;
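
Note that for all but the last TOR entry the RTL substitutes `not cmp_ge(r+1)` for `cmp_lt(r)`: entry r's upper bound is exactly the base used in entry r+1's greater-or-equal comparison, so one comparator result serves two entries. A compact C model of the per-entry match decode (sketch only):

    /* illustrative C model of the match decode for one PMP entry */
    enum { A_OFF = 0, A_TOR = 1, A_NA4 = 2, A_NAPOT = 3 }; /* pmpcfg.A field */

    int pmp_match(int mode, int cmp_ge, int cmp_lt, int cmp_mm) {
      switch (mode) {
        case A_OFF: return 0;                /* entry disabled */
        case A_TOR: return cmp_ge && cmp_lt; /* base <= addr < bound */
        default:    return cmp_mm;           /* NA4/NAPOT: masked compare */
      }
    }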

end loop; -- r
end process pmp_check_permission;

-- check permission --
-- M mode: always allow if lock bit not set, otherwise check permission
pmp_check_permission:
for r in 0 to PMP_NUM_REGIONS-1 generate
pmp.perm_ex(r) <= pmp_ctrl_i(r)(pmp_cfg_x_c) or (not pmp_ctrl_i(r)(pmp_cfg_l_c)) when (ctrl_i.cpu_priv = priv_mode_m_c) else pmp_ctrl_i(r)(pmp_cfg_x_c);
pmp.perm_rd(r) <= pmp_ctrl_i(r)(pmp_cfg_r_c) or (not pmp_ctrl_i(r)(pmp_cfg_l_c)) when (ctrl_i.bus_priv = priv_mode_m_c) else pmp_ctrl_i(r)(pmp_cfg_r_c);
pmp.perm_wr(r) <= pmp_ctrl_i(r)(pmp_cfg_w_c) or (not pmp_ctrl_i(r)(pmp_cfg_l_c)) when (ctrl_i.bus_priv = priv_mode_m_c) else pmp_ctrl_i(r)(pmp_cfg_w_c);
end generate;
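
The permission rule reads as: an M-mode access always passes unless the entry is locked, in which case the entry's permission bits apply to M-mode as well; U-mode accesses always check the permission bits. As a one-line C model (sketch only):

    /* illustrative C model: locked entries also constrain M-mode */
    int pmp_allowed(int perm_bit, int locked, int is_m_mode) {
      return is_m_mode ? (perm_bit || !locked) : perm_bit;
    }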


-- check for access fault (using static prioritization) --
pmp_check_fault: process(ctrl_i, pmp)
variable tmp_if_v, tmp_ld_v, tmp_st_v : std_ulogic_vector(PMP_NUM_REGIONS downto 0);
begin
-- > This is a *structural* description of a prioritization logic (a multiplexer chain).
-- > I prefer this style as I do not like using a loop with 'exit' - and I also think this style might be smaller
-- > and faster (could use the carry chain?!) as the synthesizer has less freedom doing what *I* want. ;)
tmp_if_v(PMP_NUM_REGIONS) := bool_to_ulogic_f(ctrl_i.cpu_priv /= priv_mode_m_c); -- default: fault if U mode
tmp_ld_v(PMP_NUM_REGIONS) := bool_to_ulogic_f(ctrl_i.bus_priv /= priv_mode_m_c); -- default: fault if U mode
tmp_st_v(PMP_NUM_REGIONS) := bool_to_ulogic_f(ctrl_i.bus_priv /= priv_mode_m_c); -- default: fault if U mode

for r in PMP_NUM_REGIONS-1 downto 0 loop -- start with lowest priority
-- instruction fetch access --
if (pmp.i_match(r) = '1') then -- address matches region r
tmp_if_v(r) := not pmp.perm_ex(r); -- fault if no execute permission
else
tmp_if_v(r) := tmp_if_v(r+1);
end if;
-- data load/store access --
if (pmp.d_match(r) = '1') then -- address matches region r
tmp_ld_v(r) := not pmp.perm_rd(r); -- fault if no read permission
tmp_st_v(r) := not pmp.perm_wr(r); -- fault if no write permission
else
tmp_ld_v(r) := tmp_ld_v(r+1);
tmp_st_v(r) := tmp_st_v(r+1);
end if;
end loop; -- r
pmp.if_fault <= tmp_if_v(0);
pmp.ld_fault <= tmp_ld_v(0);
pmp.st_fault <= tmp_st_v(0);

-- > this is the behavioral version of the code above (instruction fetch access)
-- pmp.if_fault <= bool_to_ulogic_f(ctrl_i.cpu_priv /= priv_mode_m_c); -- default: fault if U mode
-- for r in 0 to PMP_NUM_REGIONS-1 loop
-- if (pmp.i_match(r) = '1') then
-- pmp.if_fault <= not pmp.perm_ex(r); -- fault if no execute permission
-- exit;
-- end if;
-- end loop; -- r
end process pmp_check_fault;
-- default: fault if not M-mode --
pmp.fail_ex(PMP_NUM_REGIONS) <= '1' when (ctrl_i.cpu_priv /= priv_mode_m_c) else '0';
pmp.fail_rd(PMP_NUM_REGIONS) <= '1' when (ctrl_i.bus_priv /= priv_mode_m_c) else '0';
pmp.fail_wr(PMP_NUM_REGIONS) <= '1' when (ctrl_i.bus_priv /= priv_mode_m_c) else '0';
-- This is a *structural* description of a prioritization logic implemented as a multiplexer chain. --
pmp_check_fault:
for r in PMP_NUM_REGIONS-1 downto 0 generate -- start with lowest priority
pmp.fail_ex(r) <= not pmp.perm_ex(r) when (pmp.i_match(r) = '1') else pmp.fail_ex(r+1);
pmp.fail_rd(r) <= not pmp.perm_rd(r) when (pmp.d_match(r) = '1') else pmp.fail_rd(r+1);
pmp.fail_wr(r) <= not pmp.perm_wr(r) when (pmp.d_match(r) = '1') else pmp.fail_wr(r+1);
end generate;
pmp.if_fault <= pmp.fail_ex(0);
pmp.ld_fault <= pmp.fail_rd(0);
pmp.st_fault <= pmp.fail_wr(0);
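
Behaviorally, the multiplexer chain implements "first match wins": the lowest-numbered matching entry determines the outcome, and if no entry matches, only M-mode accesses succeed. A C model of the execute-fault path (sketch only, mirroring the commented-out behavioral VHDL carried in the old code):

    /* behavioral C model of the static prioritization (execute path) */
    int pmp_exec_fault(int n_regions, const int match[], const int perm_ex[],
                       int is_m_mode) {
      for (int r = 0; r < n_regions; r++) {
        if (match[r]) {
          return !perm_ex[r]; /* lowest-numbered matching entry decides */
        }
      }
      return !is_m_mode; /* no match: fault unless M-mode */
    }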


-- final PMP access fault signals (ignored when in debug mode) --
if_pmp_fault <= '1' when (pmp.if_fault = '1') and (PMP_NUM_REGIONS > 0) and (ctrl_i.cpu_debug = '0') else '0';
