From 25c0594484a8fc50837936a814b55fa6d824d39e Mon Sep 17 00:00:00 2001 From: Nils Wistoff Date: Fri, 11 Aug 2023 11:13:39 +0200 Subject: [PATCH] coherence: Pack Valid/Dirty SRAM (#36) * sram: Replace by sram_pulp Does not introduce cuts and allows for parametric byte width Signed-off-by: Nils Wistoff * std_cache: Pack valid/dirty SRAM Signed-off-by: Nils Wistoff * std_cache: Decouple Valid/Dirty WE from Data WE Signed-off-by: Nils Wistoff --------- Signed-off-by: Nils Wistoff --- Bender.yml | 2 +- common/local/util/sram_pulp.sv | 90 ++++++++++++++++++++++++++++ core/Flist.cva6 | 2 +- core/cache_subsystem/cache_ctrl.sv | 3 + core/cache_subsystem/miss_handler.sv | 13 ++-- core/cache_subsystem/std_nbdcache.sv | 26 +++----- corev_apu/tb/ariane_tb.cpp | 4 +- corev_apu/tb/ariane_tb.sv | 2 +- 8 files changed, 115 insertions(+), 27 deletions(-) create mode 100644 common/local/util/sram_pulp.sv diff --git a/Bender.yml b/Bender.yml index bc16f90dce..fab40a0649 100644 --- a/Bender.yml +++ b/Bender.yml @@ -215,7 +215,7 @@ sources: # SRAM Wrappers stay tech-independent; we leave technology mapping to tc_sram - common/local/util/tc_sram_wrapper.sv - common/local/util/tc_sram_wrapper_cache_techno.sv - - common/local/util/sram.sv + - common/local/util/sram_pulp.sv - common/local/util/sram_cache.sv - target: not(synthesis) diff --git a/common/local/util/sram_pulp.sv b/common/local/util/sram_pulp.sv new file mode 100644 index 0000000000..f3c4773bf4 --- /dev/null +++ b/common/local/util/sram_pulp.sv @@ -0,0 +1,90 @@ +// Copyright 2018 ETH Zurich and University of Bologna. +// Copyright and related rights are licensed under the Solderpad Hardware +// License, Version 0.51 (the "License"); you may not use this file except in +// compliance with the License. You may obtain a copy of the License at +// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +// or agreed to in writing, software, hardware and materials distributed under +// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. +// +// Author: Florian Zaruba , ETH Zurich +// Michael Schaffner , ETH Zurich +// Nils Wistoff , ETH Zurich +// Date: 15.08.2018 +// Description: generic tc_sram wrapper for CVA6 +// +// Note: the wrapped module contains two different implementations for +// ALTERA and XILINX tools, since these follow different coding styles for +// inferrable RAMS with byte enable. define `FPGA_TARGET_XILINX or +// `FPGA_TARGET_ALTERA in your build environment (default is ALTERA) + +module sram #( + parameter DATA_WIDTH = 64, + parameter BYTE_WIDTH = 8, + parameter USER_WIDTH = 1, + parameter USER_EN = 0, + parameter NUM_WORDS = 1024, + parameter SIM_INIT = "none", + parameter OUT_REGS = 0 // enables output registers in FPGA macro (read lat = 2) +)( + input logic clk_i, + input logic rst_ni, + input logic req_i, + input logic we_i, + input logic [$clog2(NUM_WORDS)-1:0] addr_i, + input logic [USER_WIDTH-1:0] wuser_i, + input logic [DATA_WIDTH-1:0] wdata_i, + input logic [(DATA_WIDTH+BYTE_WIDTH-1)/BYTE_WIDTH-1:0] be_i, + output logic [USER_WIDTH-1:0] ruser_o, + output logic [DATA_WIDTH-1:0] rdata_o +); + + tc_sram #( + .NumWords ( NUM_WORDS ), + .DataWidth ( DATA_WIDTH ), + .ByteWidth ( BYTE_WIDTH ), + .NumPorts ( 32'd1 ), + .Latency ( 32'd1 ), + .SimInit ( SIM_INIT ), + .PrintSimCfg ( 1'b0 ) + ) i_tc_sram ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .req_i ( req_i ), + .we_i ( we_i ), + .be_i ( be_i ), + .wdata_i ( wdata_i ), + .addr_i ( addr_i ), + .rdata_o ( rdata_o ) + ); + + if (USER_EN > 0) begin : gen_mem_user + tc_sram #( + .NumWords ( NUM_WORDS ), + .DataWidth ( DATA_WIDTH ), + .ByteWidth ( BYTE_WIDTH ), + .NumPorts ( 32'd1 ), + .Latency ( 32'd1 ), + .SimInit ( SIM_INIT ), + .PrintSimCfg ( 1'b0 ) + ) i_tc_sram_user ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .req_i ( req_i ), + .we_i ( we_i ), + .be_i ( be_i ), + .wdata_i ( wuser_i ), + .addr_i ( addr_i ), + .rdata_o ( ruser_o ) + ); + + if (USER_WIDTH != DATA_WIDTH) begin : gen_err_data_user_width + $fatal(1, "sram_pulp: USER_WIDTH needs to be equal to DATA_WIDTH (if USER_EN is set)."); + end + + end else begin + assign ruser_o = '0; + end + +endmodule : sram diff --git a/core/Flist.cva6 b/core/Flist.cva6 index cde9326997..cc8332393a 100644 --- a/core/Flist.cva6 +++ b/core/Flist.cva6 @@ -186,7 +186,7 @@ ${CVA6_REPO_DIR}/common/local/util/instr_tracer.sv ${CVA6_REPO_DIR}/common/local/util/tc_sram_wrapper.sv ${CVA6_REPO_DIR}/common/local/util/tc_sram_wrapper_cache_techno.sv ${CVA6_REPO_DIR}/vendor/pulp-platform/tech_cells_generic/src/rtl/tc_sram.sv -${CVA6_REPO_DIR}/common/local/util/sram.sv +${CVA6_REPO_DIR}/common/local/util/sram_pulp.sv ${CVA6_REPO_DIR}/common/local/util/sram_cache.sv // MMU diff --git a/core/cache_subsystem/cache_ctrl.sv b/core/cache_subsystem/cache_ctrl.sv index c14aeae0f4..cc878954e2 100644 --- a/core/cache_subsystem/cache_ctrl.sv +++ b/core/cache_subsystem/cache_ctrl.sv @@ -319,6 +319,9 @@ module cache_ctrl // set the correct byte enable be_o.data[cl_offset>>3+:CVA6Cfg.XLEN/8] = mem_req_q.be; + for (int unsigned i = 0; i < CVA6Cfg.DCACHE_SET_ASSOC; i++) begin + if (hit_way_q[i]) be_o.vldrty[i] = '{valid: 1, dirty: be_o.data}; + end data_o.data[cl_offset+:CVA6Cfg.XLEN] = mem_req_q.wdata; data_o.tag = mem_req_d.tag; // ~> change the state diff --git a/core/cache_subsystem/miss_handler.sv b/core/cache_subsystem/miss_handler.sv index f00ed8d5a1..f0020ec5ac 100644 --- a/core/cache_subsystem/miss_handler.sv +++ b/core/cache_subsystem/miss_handler.sv @@ -324,8 +324,11 @@ module miss_handler addr_o = mshr_q.addr[CVA6Cfg.DCACHE_INDEX_WIDTH-1:0]; req_o = evict_way_q; we_o = 1'b1; - be_o = '1; - be_o.vldrty = evict_way_q; + be_o.tag = '1; + be_o.data = '1; + for (int unsigned i = 0; i < CVA6Cfg.DCACHE_SET_ASSOC; i++) begin + if (evict_way_q[i]) be_o.vldrty[i] = '1; + end data_o.tag = mshr_q.addr[CVA6Cfg.DCACHE_TAG_WIDTH+CVA6Cfg.DCACHE_INDEX_WIDTH-1:CVA6Cfg.DCACHE_INDEX_WIDTH]; data_o.data = data_miss_fsm; data_o.valid = 1'b1; @@ -372,9 +375,11 @@ module miss_handler we_o = 1'b1; data_o.valid = INVALIDATE_ON_FLUSH ? 1'b0 : 1'b1; // invalidate - be_o.vldrty = evict_way_q; + for (int unsigned i = 0; i < CVA6Cfg.DCACHE_SET_ASSOC; i++) begin + if (evict_way_q[i]) be_o.vldrty[i] = '1; + end // go back to handling the miss or flushing, depending on where we came from - state_d = (state_q == WB_CACHELINE_MISS) ? MISS : FLUSH_REQ_STATUS; + state_d = (state_q == WB_CACHELINE_MISS) ? MISS : FLUSH_REQ_STATUS; end end diff --git a/core/cache_subsystem/std_nbdcache.sv b/core/cache_subsystem/std_nbdcache.sv index 4f6c123390..18e6c3f213 100644 --- a/core/cache_subsystem/std_nbdcache.sv +++ b/core/cache_subsystem/std_nbdcache.sv @@ -248,30 +248,20 @@ module std_nbdcache // Valid/Dirty Regs // ---------------- - // align each valid/dirty bit pair to a byte boundary in order to leverage byte enable signals. - // note: if you have an SRAM that supports flat bit enables for your target technology, - // you can use it here to save the extra 17x overhead introduced by this workaround. - logic [(CVA6Cfg.DCACHE_LINE_WIDTH+8)*CVA6Cfg.DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata; + vldrty_t [CVA6Cfg.DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata; for (genvar i = 0; i < CVA6Cfg.DCACHE_SET_ASSOC; i++) begin - for (genvar j = 0; j < CVA6Cfg.DCACHE_LINE_WIDTH / 8; j++) begin - // dirty bits assignment - assign dirty_wdata[(CVA6Cfg.DCACHE_LINE_WIDTH+8)*i+8*j] = wdata_ram.dirty[j]; - assign rdata_ram[i].dirty[j] = dirty_rdata[(CVA6Cfg.DCACHE_LINE_WIDTH+8)*i+8*j]; - end - // valid bit assignment - assign dirty_wdata[CVA6Cfg.DCACHE_LINE_WIDTH+(CVA6Cfg.DCACHE_LINE_WIDTH+8)*i] = wdata_ram.valid; - assign rdata_ram[i].valid = dirty_rdata[CVA6Cfg.DCACHE_LINE_WIDTH+(CVA6Cfg.DCACHE_LINE_WIDTH+8)*i]; - end - - // be construction for valid_dirty_sram - for (genvar i = 0; i < CVA6Cfg.DCACHE_SET_ASSOC; i++) begin - assign be_valid_dirty_ram[i*(CVA6Cfg.DCACHE_LINE_WIDTH/8+1)+:(CVA6Cfg.DCACHE_LINE_WIDTH/8+1)] = {be_ram.vldrty[i], be_ram.data} & {(CVA6Cfg.DCACHE_LINE_WIDTH/8+1){be_ram.vldrty[i]}}; + assign dirty_wdata[i] = '{dirty: wdata_ram.dirty, valid: wdata_ram.valid}; + assign rdata_ram[i].dirty = dirty_rdata[i].dirty; + assign rdata_ram[i].valid = dirty_rdata[i].valid; + assign be_valid_dirty_ram[i].valid = be_ram.vldrty[i].valid; + assign be_valid_dirty_ram[i].dirty = be_ram.vldrty[i].dirty; end sram #( .USER_WIDTH(1), - .DATA_WIDTH((CVA6Cfg.DCACHE_LINE_WIDTH + 8) * CVA6Cfg.DCACHE_SET_ASSOC), + .DATA_WIDTH(CVA6Cfg.DCACHE_SET_ASSOC * $bits(vldrty_t)), + .BYTE_WIDTH(1), .NUM_WORDS (CVA6Cfg.DCACHE_NUM_WORDS) ) valid_dirty_sram ( .clk_i (clk_i), diff --git a/corev_apu/tb/ariane_tb.cpp b/corev_apu/tb/ariane_tb.cpp index 8de0c5b20f..d7b4d2dcb2 100644 --- a/corev_apu/tb/ariane_tb.cpp +++ b/corev_apu/tb/ariane_tb.cpp @@ -332,11 +332,11 @@ int main(int argc, char **argv) { // Preload memory. #if (VERILATOR_VERSION_INTEGER >= 5000000) // Verilator v5: Use rootp pointer and .data() accessor. -#define MEM top->rootp->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__i_tc_sram_wrapper__DOT__i_tc_sram__DOT__sram.m_storage +#define MEM top->rootp->ariane_testharness__DOT__i_sram__DOT__i_tc_sram__DOT__sram.m_storage #define MEM_USER top->rootp->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__gen_mem_user__DOT__i_tc_sram_wrapper_user__DOT__i_tc_sram__DOT__sram.m_storage #else // Verilator v4 -#define MEM top->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__i_tc_sram_wrapper__DOT__i_tc_sram__DOT__sram +#define MEM top->ariane_testharness__DOT__i_sram__DOT__i_tc_sram__DOT__sram #define MEM_USER top->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__gen_mem_user__DOT__i_tc_sram_wrapper_user__DOT__i_tc_sram__DOT__sram #endif long long addr; diff --git a/corev_apu/tb/ariane_tb.sv b/corev_apu/tb/ariane_tb.sv index 22e3276512..98f95b4f68 100644 --- a/corev_apu/tb/ariane_tb.sv +++ b/corev_apu/tb/ariane_tb.sv @@ -20,7 +20,7 @@ import uvm_pkg::*; `include "uvm_macros.svh" `include "rvfi_types.svh" -`define MAIN_MEM(P) dut.i_sram.gen_cut[0].i_tc_sram_wrapper.i_tc_sram.init_val[(``P``)] +`define MAIN_MEM(P) dut.i_sram.i_tc_sram.init_val[(``P``)] `define USER_MEM(P) dut.i_sram.gen_cut[0].gen_mem_user.i_tc_sram_wrapper_user.i_tc_sram.init_val[(``P``)] `ifndef READ_ELF_T