Skip to content

Commit

Permalink
coherence: Pack Valid/Dirty SRAM (#36)
Browse files Browse the repository at this point in the history
* sram: Replace by sram_pulp

Does not introduce cuts and allows for parametric byte width

Signed-off-by: Nils Wistoff <[email protected]>

* std_cache: Pack valid/dirty SRAM

Signed-off-by: Nils Wistoff <[email protected]>

* std_cache: Decouple Valid/Dirty WE from Data WE

Signed-off-by: Nils Wistoff <[email protected]>

---------

Signed-off-by: Nils Wistoff <[email protected]>
  • Loading branch information
niwis authored and paulsc96 committed Sep 20, 2024
1 parent 1fd9d53 commit 9ead29b
Show file tree
Hide file tree
Showing 8 changed files with 115 additions and 27 deletions.
2 changes: 1 addition & 1 deletion Bender.yml
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ sources:
- include_dirs:
- common/local/util
files:
- common/local/util/sram.sv
- common/local/util/sram_pulp.sv

- target: not(all(fpga, xilinx))
include_dirs:
Expand Down
90 changes: 90 additions & 0 deletions common/local/util/sram_pulp.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Copyright 2018 ETH Zurich and University of Bologna.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 0.51 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
// or agreed to in writing, software, hardware and materials distributed under
// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
//
// Author: Florian Zaruba <[email protected]>, ETH Zurich
// Michael Schaffner <[email protected]>, ETH Zurich
// Nils Wistoff <[email protected]>, ETH Zurich
// Date: 15.08.2018
// Description: generic tc_sram wrapper for CVA6
//
// Note: the wrapped module contains two different implementations for
// ALTERA and XILINX tools, since these follow different coding styles for
// inferrable RAMs with byte enable. Define `FPGA_TARGET_XILINX or
// `FPGA_TARGET_ALTERA in your build environment (the default is ALTERA).

// Thin wrapper that maps the CVA6 `sram` interface onto `tc_sram`.
//
// One `tc_sram` instance (`i_tc_sram`) always stores `wdata_i`/`rdata_o`.
// When USER_EN > 0 a second instance (`i_tc_sram_user`) with the same
// configuration stores `wuser_i`/`ruser_o`; in that case USER_WIDTH must
// equal DATA_WIDTH, otherwise elaboration aborts. When USER_EN is 0 the
// `ruser_o` output is tied to zero and `wuser_i` is not used.
//
// Both instances share the same request, write-enable, byte-enable and
// address inputs. `be_i` carries one enable bit per BYTE_WIDTH-bit slice of
// the data word (rounded up).
//
// Note: OUT_REGS is part of the parameter interface but is not referenced
// anywhere inside this wrapper; the latency handed to `tc_sram` is fixed.
module sram #(
    parameter DATA_WIDTH = 64,
    parameter BYTE_WIDTH = 8,
    parameter USER_WIDTH = 1,
    parameter USER_EN    = 0,
    parameter NUM_WORDS  = 1024,
    parameter SIM_INIT   = "none",
    parameter OUT_REGS   = 0     // enables output registers in FPGA macro (read lat = 2)
)(
   input  logic                                          clk_i,
   input  logic                                          rst_ni,
   input  logic                                          req_i,
   input  logic                                          we_i,
   input  logic [$clog2(NUM_WORDS)-1:0]                  addr_i,
   input  logic [USER_WIDTH-1:0]                         wuser_i,
   input  logic [DATA_WIDTH-1:0]                         wdata_i,
   input  logic [(DATA_WIDTH+BYTE_WIDTH-1)/BYTE_WIDTH-1:0] be_i,
   output logic [USER_WIDTH-1:0]                         ruser_o,
   output logic [DATA_WIDTH-1:0]                         rdata_o
);

  // Settings common to the data memory and the optional user memory.
  localparam int unsigned SramNumPorts = 32'd1;
  localparam int unsigned SramLatency  = 32'd1;

  // ------------
  // Data memory
  // ------------
  tc_sram #(
    .NumWords    ( NUM_WORDS    ),
    .DataWidth   ( DATA_WIDTH   ),
    .ByteWidth   ( BYTE_WIDTH   ),
    .NumPorts    ( SramNumPorts ),
    .Latency     ( SramLatency  ),
    .SimInit     ( SIM_INIT     ),
    .PrintSimCfg ( 1'b0         )
  ) i_tc_sram (
    // clock / reset
    .clk_i   ( clk_i   ),
    .rst_ni  ( rst_ni  ),
    // request
    .req_i   ( req_i   ),
    .we_i    ( we_i    ),
    .addr_i  ( addr_i  ),
    .be_i    ( be_i    ),
    .wdata_i ( wdata_i ),
    // response
    .rdata_o ( rdata_o )
  );

  // ----------------------
  // Optional user memory
  // ----------------------
  if (USER_EN > 0) begin : gen_mem_user

    // The user memory is instantiated with DATA_WIDTH and driven by the same
    // byte enables as the data memory, so the two widths have to agree.
    if (USER_WIDTH != DATA_WIDTH) begin : gen_err_data_user_width
      $fatal(1, "sram_pulp: USER_WIDTH needs to be equal to DATA_WIDTH (if USER_EN is set).");
    end

    tc_sram #(
      .NumWords    ( NUM_WORDS    ),
      .DataWidth   ( DATA_WIDTH   ),
      .ByteWidth   ( BYTE_WIDTH   ),
      .NumPorts    ( SramNumPorts ),
      .Latency     ( SramLatency  ),
      .SimInit     ( SIM_INIT     ),
      .PrintSimCfg ( 1'b0         )
    ) i_tc_sram_user (
      // clock / reset
      .clk_i   ( clk_i   ),
      .rst_ni  ( rst_ni  ),
      // request (shared with the data memory)
      .req_i   ( req_i   ),
      .we_i    ( we_i    ),
      .addr_i  ( addr_i  ),
      .be_i    ( be_i    ),
      .wdata_i ( wuser_i ),
      // response
      .rdata_o ( ruser_o )
    );

  end else begin
    // No user memory: drive the user read port with a constant zero.
    assign ruser_o = '0;
  end

endmodule : sram
2 changes: 1 addition & 1 deletion core/Flist.cva6
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ ${CVA6_REPO_DIR}/common/local/util/instr_tracer.sv
${CVA6_REPO_DIR}/common/local/util/tc_sram_wrapper.sv
${CVA6_REPO_DIR}/common/local/util/tc_sram_wrapper_cache_techno.sv
${CVA6_REPO_DIR}/vendor/pulp-platform/tech_cells_generic/src/rtl/tc_sram.sv
${CVA6_REPO_DIR}/common/local/util/sram.sv
${CVA6_REPO_DIR}/common/local/util/sram_pulp.sv
${CVA6_REPO_DIR}/common/local/util/sram_cache.sv

// MMU
Expand Down
3 changes: 3 additions & 0 deletions core/cache_subsystem/cache_ctrl.sv
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,9 @@ module cache_ctrl

// set the correct byte enable
be_o.data[cl_offset>>3+:CVA6Cfg.XLEN/8] = mem_req_q.be;
for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin
if (hit_way_q[i]) be_o.vldrty[i] = '{valid: 1, dirty: be_o.data};
end
data_o.data[cl_offset+:CVA6Cfg.XLEN] = mem_req_q.wdata;
data_o.tag = mem_req_d.tag;
// ~> change the state
Expand Down
13 changes: 9 additions & 4 deletions core/cache_subsystem/miss_handler.sv
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,11 @@ module miss_handler
addr_o = mshr_q.addr[CVA6Cfg.DCACHE_INDEX_WIDTH-1:0];
req_o = evict_way_q;
we_o = 1'b1;
be_o = '1;
be_o.vldrty = evict_way_q;
be_o.tag = '1;
be_o.data = '1;
for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin
if (evict_way_q[i]) be_o.vldrty[i] = '1;
end
data_o.tag = mshr_q.addr[CVA6Cfg.DCACHE_TAG_WIDTH+CVA6Cfg.DCACHE_INDEX_WIDTH-1:CVA6Cfg.DCACHE_INDEX_WIDTH];
data_o.data = data_miss_fsm;
data_o.valid = 1'b1;
Expand Down Expand Up @@ -372,9 +375,11 @@ module miss_handler
we_o = 1'b1;
data_o.valid = INVALIDATE_ON_FLUSH ? 1'b0 : 1'b1;
// invalidate
be_o.vldrty = evict_way_q;
for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin
if (evict_way_q[i]) be_o.vldrty[i] = '1;
end
// go back to handling the miss or flushing, depending on where we came from
state_d = (state_q == WB_CACHELINE_MISS) ? MISS : FLUSH_REQ_STATUS;
state_d = (state_q == WB_CACHELINE_MISS) ? MISS : FLUSH_REQ_STATUS;
end
end

Expand Down
26 changes: 8 additions & 18 deletions core/cache_subsystem/std_nbdcache.sv
Original file line number Diff line number Diff line change
Expand Up @@ -244,30 +244,20 @@ module std_nbdcache
// Valid/Dirty Regs
// ----------------

// align each valid/dirty bit pair to a byte boundary in order to leverage byte enable signals.
// note: if you have an SRAM that supports flat bit enables for your target technology,
// you can use it here to save the extra 17x overhead introduced by this workaround.
logic [(DCACHE_LINE_WIDTH+8)*DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata;
vldrty_t [CVA6Cfg.DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata;

for (genvar i = 0; i < CVA6Cfg.DCACHE_SET_ASSOC; i++) begin
for (genvar j = 0; j < DCACHE_LINE_WIDTH / 8; j++) begin
// dirty bits assignment
assign dirty_wdata[(DCACHE_LINE_WIDTH+8)*i+8*j] = wdata_ram.dirty[j];
assign rdata_ram[i].dirty[j] = dirty_rdata[(DCACHE_LINE_WIDTH+8)*i+8*j];
end
// valid bit assignment
assign dirty_wdata[DCACHE_LINE_WIDTH+(DCACHE_LINE_WIDTH+8)*i] = wdata_ram.valid;
assign rdata_ram[i].valid = dirty_rdata[DCACHE_LINE_WIDTH+(DCACHE_LINE_WIDTH+8)*i];
end

// be construction for valid_dirty_sram
for (genvar i = 0; i < CVA6Cfg.DCACHE_SET_ASSOC; i++) begin
assign be_valid_dirty_ram[i*(DCACHE_LINE_WIDTH/8+1)+:(DCACHE_LINE_WIDTH/8+1)] = {be_ram.vldrty[i], be_ram.data} & {(DCACHE_LINE_WIDTH/8+1){be_ram.vldrty[i]}};
assign dirty_wdata[i] = '{dirty: wdata_ram.dirty, valid: wdata_ram.valid};
assign rdata_ram[i].dirty = dirty_rdata[i].dirty;
assign rdata_ram[i].valid = dirty_rdata[i].valid;
assign be_valid_dirty_ram[i].valid = be_ram.vldrty[i].valid;
assign be_valid_dirty_ram[i].dirty = be_ram.vldrty[i].dirty;
end

sram #(
.USER_WIDTH(1),
.DATA_WIDTH((DCACHE_LINE_WIDTH + 8) * DCACHE_SET_ASSOC),
.DATA_WIDTH(DCACHE_SET_ASSOC * $bits(vldrty_t)),
.BYTE_WIDTH(1),
.NUM_WORDS (CVA6Cfg.DCACHE_NUM_WORDS)
) valid_dirty_sram (
.clk_i (clk_i),
Expand Down
4 changes: 2 additions & 2 deletions corev_apu/tb/ariane_tb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -332,11 +332,11 @@ int main(int argc, char **argv) {
// Preload memory.
#if (VERILATOR_VERSION_INTEGER >= 5000000)
// Verilator v5: Use rootp pointer and .data() accessor.
#define MEM top->rootp->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__i_tc_sram_wrapper__DOT__i_tc_sram__DOT__sram.m_storage
#define MEM top->rootp->ariane_testharness__DOT__i_sram__DOT__i_tc_sram__DOT__sram.m_storage
#define MEM_USER top->rootp->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__gen_mem_user__DOT__i_tc_sram_wrapper_user__DOT__i_tc_sram__DOT__sram.m_storage
#else
// Verilator v4
#define MEM top->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__i_tc_sram_wrapper__DOT__i_tc_sram__DOT__sram
#define MEM top->ariane_testharness__DOT__i_sram__DOT__i_tc_sram__DOT__sram
#define MEM_USER top->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__gen_mem_user__DOT__i_tc_sram_wrapper_user__DOT__i_tc_sram__DOT__sram
#endif
long long addr;
Expand Down
2 changes: 1 addition & 1 deletion corev_apu/tb/ariane_tb.sv
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import uvm_pkg::*;
`include "uvm_macros.svh"
`include "rvfi_types.svh"

`define MAIN_MEM(P) dut.i_sram.gen_cut[0].i_tc_sram_wrapper.i_tc_sram.init_val[(``P``)]
`define MAIN_MEM(P) dut.i_sram.i_tc_sram.init_val[(``P``)]
`define USER_MEM(P) dut.i_sram.gen_cut[0].gen_mem_user.i_tc_sram_wrapper_user.i_tc_sram.init_val[(``P``)]

`ifndef READ_ELF_T
Expand Down

0 comments on commit 9ead29b

Please sign in to comment.