diff --git a/doc/project_mgt_hw.md b/doc/project_mgt_hw.md index 3ef5f11..8004060 100644 --- a/doc/project_mgt_hw.md +++ b/doc/project_mgt_hw.md @@ -17,8 +17,9 @@ Mesure et amélioration des performances - [X] Print et save des registres CSRs pour chaque test, garde la trace des performances dans Git -- [X] IP point de mesure des différents bus en bandwidth -- [X] CPI measure in benchmark +- [X] Put in place perf. profiling + - [X] IP point de mesure des différents bus en bandwidth + - [X] CPI measure in benchmark - [X] Augmenter le nombre d’OR max de dCache - [X] Prefetch read request - [X] Optimize write pusher to save a cycle @@ -28,7 +29,7 @@ Mesure et amélioration des performances - [X] Enhance completion in OoO - [X] Save a cycle on RD write in Memfy - [X] Pending flag to deassert on completion if or=1 -- [ ] Detect address collision in memfy for better performance +- [-] Detect address collision in memfy for better performance - [ ] Preload jal even if processing is busy - [ ] Branch prediction - [ ] https://en.wikipedia.org/wiki/Branch_target_predictor @@ -66,36 +67,33 @@ Misc. - [ ] Random peripheral - [ ] OoO read: miss could be stacked and served later waiting for cache fill and continue reading the next address - [ ] Support cache disable in testbench -- [ ] Put in place profiling - [ ] Print des tests qui ne marchent pas dans le bash et svut_h.sv pour verilator - [ ] 64 bits support - [ ] Atomic operations - [ ] Zicond - [ ] RVV for machine learning - [ ] Support privileged instructions, supervisor mode & user mode - - voir les CSRs dans la privileged mode, implementer les compteurs par mode - - https://danielmangum.com/posts/risc-v-bytes-privilege-levels/ - - https://mobile.twitter.com/hasheddan/status/1514581031092899843?s=12&t=MMNTY_iRC48CjykLQBdTkQ - - https://man7.org/linux/man-pages/man2/syscall.2.html - - https://www.youtube.com/watch?app=desktop&v=1-8oYzL_Thk - - https://jborza.com/emulation/2021/04/22/ecalls-and-syscalls.html + - Smcntrpmf extension + - voir les CSRs dans la privileged mode, implementer les compteurs par mode + - https://danielmangum.com/posts/risc-v-bytes-privilege-levels/ + - https://mobile.twitter.com/hasheddan/status/1514581031092899843?s=12&t=MMNTY_iRC48CjykLQBdTkQ + - https://man7.org/linux/man-pages/man2/syscall.2.html + - https://www.youtube.com/watch?app=desktop&v=1-8oYzL_Thk + - https://jborza.com/emulation/2021/04/22/ecalls-and-syscalls.html - [ ] Support MMU extension - [ ] Debug Support / JTAG interface / GDB Usage / OpenOCD - - https://tomverbeure.github.io/2021/07/18/VexRiscv-OpenOCD-and-Traps.html - - https://tomverbeure.github.io/2022/02/20/GDBWave-Post-Simulation-RISCV-SW-Debugging.html - - https://github.com/BLangOS/VexRiscV_with_HW-GDB_Server + - https://tomverbeure.github.io/2021/07/18/VexRiscv-OpenOCD-and-Traps.html + - https://tomverbeure.github.io/2022/02/20/GDBWave-Post-Simulation-RISCV-SW-Debugging.html + - https://github.com/BLangOS/VexRiscV_with_HW-GDB_Server - [ ] Support PLIC (only for multi-core) - [ ] Support CLIC controller - [ ] UART: Support 9/10 bits & parity - [ ] Removed the 2 LSBs in instruction cache while always 2'b11 (6.25% saving) -- [ ] Out-of-order execution - [ ] Create app per benchmark - [ ] Revoir tous les paramètres de chaque instance et les documenter -- [ ] Drop lxt2 waveform - [ ] Manage exceptions on r/w response - [ ] SV Testbench: Assert flush without ARVALID - [ ] New cache associativity (2 / 4 / 8 ways configurable) -- [ ] Super scaler arch - [ ] Move LUI into processing to prepare future extension support - [ ] Read ASM to be sure its used for processing and not control - [ ] Benchmark waveform doesn’t reveal high usage @@ -141,6 +139,7 @@ https://www.youtube.com/channel/UCPSsA8oxlSBjidJsSPdpjsQ/videos Verification/Validation: +- [ ] Drop lxt2 waveform - [ ] Update synthesis flow - [ ] Standard cells library for Yosys - [ ] https://github.com/dpretet/ascend-freepdk45/tree/master/lib @@ -168,6 +167,7 @@ Hardware Test: # Ideas / Applications - [ ] Next CPU architecture: + - Out-of-order execution - super scalar architecture - SIMD architecture - Vector architecture diff --git a/rtl/friscv_cache_blocks.sv b/rtl/friscv_cache_blocks.sv index 4e083d6..af49a3a 100644 --- a/rtl/friscv_cache_blocks.sv +++ b/rtl/friscv_cache_blocks.sv @@ -22,7 +22,7 @@ module friscv_cache_blocks #( // Name used for tracer file name parameter NAME = "blocks", - // data length, ILEN or XLEN if is a i$ or d$ + // data length, ILEN for i$ or XLEN for d$ parameter WLEN = 32, // Address bus width parameter ADDR_W = 32, @@ -61,7 +61,7 @@ module friscv_cache_blocks ////////////////////////////////////////////////////////////////////////// // Offset part into address to index a DWORD or QWORD - localparam OFFSET_IX = (WLEN==32) ? 2 : 3; + localparam OFFSET_IX = (WLEN==32) ? 2 : 3; // 32 or 64 bits support localparam OFFSET_W = $clog2(CACHE_BLOCK_W/WLEN); // Index part into address value, to parse the cache blocks @@ -83,40 +83,36 @@ module friscv_cache_blocks // Logic declaration ////////////////////////////////////////////////////////////////////////// - // signals used to parse the RAM/regfiles - logic wen; - logic [INDEX_W -1:0] windex; - logic [CACHE_BLOCK_W -1:0] wdata; - logic [CACHE_BLOCK_W/8 -1:0] wstrb; - logic [TAG_W -1:0] wtag; - logic [INDEX_W -1:0] rindex; - // extracted from the write interface logic [INDEX_W -1:0] p1_windex; logic [TAG_W -1:0] p1_wtag; - logic [FULL_BLOCK_W -1:0] p1_wblock; + logic [INDEX_W -1:0] p2_windex; logic [TAG_W -1:0] p2_wtag; - logic [FULL_BLOCK_W -1:0] p2_wblock; + + logic [INDEX_W -1:0] p1_index; + logic [INDEX_W -1:0] p2_index; // extracted from the read interface logic [INDEX_W -1:0] p1_rindex; logic [TAG_W -1:0] p1_rtag; - logic [CACHE_BLOCK_W -1:0] p1_rblock; - logic p1_rset; logic [OFFSET_W -1:0] p1_roffset; + logic [INDEX_W -1:0] p2_rindex; logic [TAG_W -1:0] p2_rtag; - logic [CACHE_BLOCK_W -1:0] p2_rblock; - logic p2_rset; logic [OFFSET_W -1:0] p2_roffset; + // extracted from the blocks - logic [TAG_W -1:0] rblock_tag; - logic [CACHE_BLOCK_W -1:0] rblock_data; - logic rblock_set; + logic [TAG_W -1:0] p1_rblock_tag; + logic [CACHE_BLOCK_W -1:0] p1_rblock_data; + logic p1_rblock_set; + + logic [TAG_W -1:0] p2_rblock_tag; + logic [CACHE_BLOCK_W -1:0] p2_rblock_data; + logic p2_rblock_set; + genvar i; - integer j; genvar k; // Tracer setup @@ -133,19 +129,37 @@ module friscv_cache_blocks `define trace_read(PNUM) \ $fwrite(f, "@ %0t: Port PNUM read @ 0x%x\n", $realtime, p``PNUM``_raddr);\ $fwrite(f, " Block:\n");\ - $fwrite(f, " - set 0x%x\n", rblock_set);\ - $fwrite(f, " - tag 0x%x\n", rblock_tag);\ - $fwrite(f, " - data 0x%x\n", rblock_data);\ + $fwrite(f, " - set 0x%x\n", p``PNUM``_rblock_set);\ + $fwrite(f, " - tag 0x%x\n", p``PNUM``_rblock_tag);\ + $fwrite(f, " - data 0x%x\n", p``PNUM``_rblock_data);\ $fwrite(f, " Request:\n");\ - if (rblock_set && p``PNUM``_rtag==rblock_tag)\ + if (p``PNUM``_rblock_set && p``PNUM``_rtag==p``PNUM``_rblock_tag)\ $fwrite(f, " - hit\n");\ else\ $fwrite(f, " - miss\n");\ $fwrite(f, " - index 0x%x\n", p``PNUM``_rindex);\ $fwrite(f, " - tag 0x%x\n", p``PNUM``_rtag);\ $fwrite(f, " - offset 0x%x\n", p``PNUM``_roffset);\ - $fwrite(f, " - data 0x%x\n", rblock_data[p``PNUM``_roffset*WLEN+:WLEN]); + $fwrite(f, " - data 0x%x\n", p``PNUM``_rblock_data[p``PNUM``_roffset*WLEN+:WLEN]); + `ifdef TRACE_BLOCKS + always @ (posedge aclk) begin + if (p1_wen) begin + $fwrite(f, "@ %0t: Port 1 write @ 0x%x\n", $realtime, p1_waddr); + $fwrite(f, " - index 0x%x\n", p1_windex); + $fwrite(f, " - tag 0x%x\n", p1_wtag); + $fwrite(f, " - data 0x%x\n", p1_wdata); + $fwrite(f, " - strb 0x%x\n", p1_wstrb); + end + if (p2_wen) begin + $fwrite(f, "@ %0t: Port 2 write @ 0x%x\n", $realtime, p2_waddr); + $fwrite(f, " - index 0x%x\n", p2_windex); + $fwrite(f, " - tag 0x%x\n", p2_wtag); + $fwrite(f, " - data 0x%x\n", p2_wdata); + $fwrite(f, " - strb 0x%x\n", p2_wstrb); + end + end + `endif ////////////////////////////////////////////////////////////////////////// // Address parsing and cache line construction @@ -159,31 +173,14 @@ module friscv_cache_blocks assign p1_wtag = p1_waddr[TAG_IX+:TAG_W]; assign p2_wtag = p2_waddr[TAG_IX+:TAG_W]; - assign wen = p1_wen | p2_wen; - assign windex = (p1_wen) ? p1_windex : p2_windex; - assign wtag = (p1_wen) ? p1_wtag : p2_wtag; - assign wstrb = (p1_wen) ? p1_wstrb : p2_wstrb; - assign wdata = (p1_wen) ? p1_wdata : p2_wdata; - - `ifdef TRACE_BLOCKS - always @ (posedge aclk) begin - if (wen) begin - if (p1_wen) $fwrite(f, "@ %0t: Port 1 write @ 0x%x\n", $realtime, p1_waddr); - else $fwrite(f, "@ %0t: Port 2 write @ 0x%x\n", $realtime, p2_waddr); - $fwrite(f, " - index 0x%x\n", windex); - $fwrite(f, " - tag 0x%x\n", wtag); - $fwrite(f, " - data 0x%x\n", wdata); - $fwrite(f, " - strb 0x%x\n", wstrb); - end - end - `endif - - ////////////////////////////////////////////////////////////////////////// // Two RAM / regfiles to store the cache blocks ////////////////////////////////////////////////////////////////////////// - friscv_rambe + assign p1_index = (p1_wen) ? p1_windex : p1_rindex; + assign p2_index = (p2_wen) ? p2_windex : p2_rindex; + + friscv_dprambe #( `ifdef CACHE_SIM_ENV .INIT (1), @@ -194,16 +191,21 @@ module friscv_cache_blocks ) data_ram ( - .aclk (aclk), - .wr_en (wen), - .wr_be (wstrb), - .addr_in (windex), - .data_in (wdata), - .addr_out (rindex), - .data_out (rblock_data) + .aclk (aclk), + .p1_wren (p1_wen), + .p1_wbe (p1_wstrb), + .p1_addr (p1_index), + .p1_data_in (p1_wdata), + .p1_data_out (p1_rblock_data), + .p2_wren (p2_wen), + .p2_wbe (p2_wstrb), + .p2_addr (p2_index), + .p2_data_in (p2_wdata), + .p2_data_out (p2_rblock_data) ); - friscv_ram + + friscv_dpram #( `ifdef CACHE_SIM_ENV .INIT (1), @@ -214,15 +216,17 @@ module friscv_cache_blocks ) metadata_ram ( - .aclk (aclk), - .wr_en (wen), - .addr_in (windex), - .data_in ({~flush, wtag}), - .addr_out (rindex), - .data_out ({rblock_set, rblock_tag}) + .aclk (aclk), + .p1_wren (p1_wen), + .p1_addr (p1_index), + .p1_data_in ({~flush, p1_wtag}), + .p1_data_out ({p1_rblock_set, p1_rblock_tag}), + .p2_wren (p2_wen), + .p2_addr (p2_index), + .p2_data_in ({~flush, p2_wtag}), + .p2_data_out ({p2_rblock_set, p2_rblock_tag}) ); - ////////////////////////////////////////////////////////////////////////// // Follow the block fetch, the data selection and the hit/miss generation. // raddr is decomposed in three parts: @@ -237,24 +241,28 @@ module friscv_cache_blocks // // The MSB of a cache block is the set bit, set to 1 once the block // has been written and so valid. This bit is set back to 0 during flush. + // + // Generate hit/miss flags based on read_tag == block_tag: + // - hit indicates the cache line store the expected instruction + // - miss indicates the cache is not initialized or doesn't contain + // the expected address ////////////////////////////////////////////////////////////////////////// // offset is used to select the correct instruction across the cache line assign p1_roffset = p1_raddr[OFFSET_IX+:OFFSET_W]; assign p2_roffset = p1_raddr[OFFSET_IX+:OFFSET_W]; + // address's MSB to identify the memory address source assign p1_rtag = p1_raddr[TAG_IX+:TAG_W]; assign p2_rtag = p2_raddr[TAG_IX+:TAG_W]; + // index is used to parse the cache blocks assign p1_rindex = p1_raddr[INDEX_IX+:INDEX_W]; assign p2_rindex = p2_raddr[INDEX_IX+:INDEX_W]; - // read the corresponding cache line - assign rindex = (p1_ren) ? p1_rindex : p2_rindex; - always @ (posedge aclk or negedge aresetn) begin - if (~aresetn) begin + if (!aresetn) begin p1_hit <= 1'b0; p1_miss <= 1'b0; p1_rdata <= {WLEN{1'b0}}; @@ -269,26 +277,27 @@ module friscv_cache_blocks p2_miss <= 1'b0; p2_rdata <= {WLEN{1'b0}}; end else begin - // - hit indicates the cache line store the expected instruction - // - miss indicates the cache is not initialized or doesn't contain - // the expected instruction address + if (p1_ren) begin - p1_hit <= (rblock_set && p1_rtag==rblock_tag) ? 1'b1 : 1'b0; - p1_miss <= (~rblock_set || p1_rtag!=rblock_tag) ? 1'b1 : 1'b0; - p1_rdata <= rblock_data[p1_roffset*WLEN+:WLEN]; + p1_hit <= (p1_rblock_set && p1_rtag==p1_rblock_tag); + p1_miss <= (~p1_rblock_set || p1_rtag!=p1_rblock_tag); + p1_rdata <= p1_rblock_data[p1_roffset*WLEN+:WLEN]; `ifdef TRACE_BLOCKS `trace_read(1) `endif - end else if (p2_ren) begin - p2_hit <= (rblock_set && p2_rtag==rblock_tag) ? 1'b1 : 1'b0; - p2_miss <= (~rblock_set || p2_rtag!=rblock_tag) ? 1'b1 : 1'b0; - p2_rdata <= rblock_data[p2_roffset*WLEN+:WLEN]; + end else begin + p1_hit <= 1'b0; + p1_miss <= 1'b0; + end + + if (p2_ren) begin + p2_hit <= (p2_rblock_set && p2_rtag==p2_rblock_tag); + p2_miss <= (~p2_rblock_set || p2_rtag!=p2_rblock_tag); + p2_rdata <= p1_rblock_data[p2_roffset*WLEN+:WLEN]; `ifdef TRACE_BLOCKS `trace_read(2) `endif end else begin - p1_hit <= 1'b0; - p1_miss <= 1'b0; p2_hit <= 1'b0; p2_miss <= 1'b0; end diff --git a/rtl/friscv_cache_ooo_mgt.sv b/rtl/friscv_cache_ooo_mgt.sv index 73ee602..b6555b9 100644 --- a/rtl/friscv_cache_ooo_mgt.sv +++ b/rtl/friscv_cache_ooo_mgt.sv @@ -11,15 +11,15 @@ // // Tracks the read request and manage the read completion reordering. Two inputs // from IO fetcher and block fetcher issue some read requests, then block fetcher -// and memory controller complete them, but possibly out-of-order. This module +// and memory controller complete them, but possibly out-of-order. This module // manages the in-order completion. // -// A first stage records the read request (address and ID) and compute the next +// A first stage records the read request (address and ID) and compute the next // available tag. The next tag output will be used by fetchers to substitute the // original tag and manage easier the completion reordering. // // The second stage receives the read data channels and record the completion -// information. +// information. // // The third layer serves the read data channel moving back to the application // and frees the next tag usable. @@ -119,7 +119,7 @@ module friscv_cache_ooo_mgt logic [META_W -1:0] meta_ram[NB_TAG-1:0]; // Stores the completion info which will be driven back to the application logic [CPL_W -1:0] cpl_ram[NB_TAG-1:0]; - // Stores the tags available + // Stores the tags available logic [2*NB_TAG-1:0] tags; // Pointer to current available for next request logic [NB_TAG_W-1:0] req_tag_pt; @@ -152,7 +152,7 @@ module friscv_cache_ooo_mgt //////////////////////////////////////////////////////////////////////////////// assign tag_avlb = (tags[2*req_tag_pt+:2] == FREE); - assign next_tag = req_tag_pt | AXI_ID_MASK; + assign next_tag = {{(AXI_ID_W-NB_TAG_W){1'b0}},req_tag_pt} | AXI_ID_MASK[AXI_ID_W-1:0]; for (genvar i=0; i