diff --git a/Bender.yml b/Bender.yml
index 68872de152..67259fa3e1 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -146,7 +146,7 @@ sources:
       - include_dirs:
         - common/local/util
         files:
-          - common/local/util/sram.sv
+          - common/local/util/sram_pulp.sv
 
       - target: not(all(fpga, xilinx))
         include_dirs:
diff --git a/common/local/util/sram_pulp.sv b/common/local/util/sram_pulp.sv
new file mode 100644
index 0000000000..f3c4773bf4
--- /dev/null
+++ b/common/local/util/sram_pulp.sv
@@ -0,0 +1,109 @@
+// Copyright 2018 ETH Zurich and University of Bologna.
+// Copyright and related rights are licensed under the Solderpad Hardware
+// License, Version 0.51 (the "License"); you may not use this file except in
+// compliance with the License.  You may obtain a copy of the License at
+// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+// or agreed to in writing, software, hardware and materials distributed under
+// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+//
+// Author: Florian Zaruba    <zarubaf@iis.ee.ethz.ch>, ETH Zurich
+//         Michael Schaffner <schaffner@iis.ee.ethz.ch>, ETH Zurich
+//         Nils Wistoff      <nwistoff@iis.ee.ethz.ch>, ETH Zurich
+// Date: 15.08.2018
+// Description: generic tc_sram wrapper for CVA6
+//
+// Note: the wrapped module contains two different implementations for
+// ALTERA and XILINX tools, since these follow different coding styles for
+// inferrable RAMs with byte enable. Define `FPGA_TARGET_XILINX or
+// `FPGA_TARGET_ALTERA in your build environment (default is ALTERA)
+
+// Single-port SRAM wrapper: one tc_sram holds the data word and, if USER_EN > 0,
+// a second, identically configured tc_sram holds the user word.
+//
+// Parameters:
+//   DATA_WIDTH  width of wdata_i / rdata_o
+//   BYTE_WIDTH  forwarded to tc_sram as ByteWidth; also determines the width of be_i
+//   USER_WIDTH  width of wuser_i / ruser_o (must equal DATA_WIDTH if USER_EN > 0)
+//   USER_EN     > 0 instantiates the user memory, otherwise ruser_o is tied to 0
+//   NUM_WORDS   forwarded to tc_sram as NumWords; determines the width of addr_i
+//   SIM_INIT    forwarded to tc_sram as SimInit
+//   OUT_REGS    > 0 requests a tc_sram read latency of 2 cycles instead of 1
+module sram #(
+    parameter DATA_WIDTH = 64,
+    parameter BYTE_WIDTH = 8,
+    parameter USER_WIDTH = 1,
+    parameter USER_EN    = 0,
+    parameter NUM_WORDS  = 1024,
+    parameter SIM_INIT   = "none",
+    parameter OUT_REGS   = 0     // enables output registers in FPGA macro (read lat = 2)
+)(
+   input  logic                          clk_i,
+   input  logic                          rst_ni,
+   input  logic                          req_i,
+   input  logic                          we_i,
+   input  logic [$clog2(NUM_WORDS)-1:0]  addr_i,
+   input  logic [USER_WIDTH-1:0]         wuser_i,
+   input  logic [DATA_WIDTH-1:0]         wdata_i,
+   input  logic [(DATA_WIDTH+BYTE_WIDTH-1)/BYTE_WIDTH-1:0] be_i,
+   output logic [USER_WIDTH-1:0]         ruser_o,
+   output logic [DATA_WIDTH-1:0]         rdata_o
+);
+
+  // Honour OUT_REGS instead of silently ignoring it: the parameter is documented
+  // as "read lat = 2", so translate it into the latency requested from tc_sram.
+  localparam int unsigned ReadLatency = (OUT_REGS > 0) ? 32'd2 : 32'd1;
+
+  // data memory
+  tc_sram #(
+    .NumWords    ( NUM_WORDS   ),
+    .DataWidth   ( DATA_WIDTH  ),
+    .ByteWidth   ( BYTE_WIDTH  ),
+    .NumPorts    ( 32'd1       ),
+    .Latency     ( ReadLatency ),
+    .SimInit     ( SIM_INIT    ),
+    .PrintSimCfg ( 1'b0        )
+  ) i_tc_sram (
+    .clk_i   ( clk_i   ),
+    .rst_ni  ( rst_ni  ),
+    .req_i   ( req_i   ),
+    .we_i    ( we_i    ),
+    .be_i    ( be_i    ),
+    .wdata_i ( wdata_i ),
+    .addr_i  ( addr_i  ),
+    .rdata_o ( rdata_o )
+  );
+
+  if (USER_EN > 0) begin : gen_mem_user
+    // user memory: configured with DATA_WIDTH and driven by the same be_i as the
+    // data memory, so USER_WIDTH must equal DATA_WIDTH (checked below).
+    tc_sram #(
+      .NumWords    ( NUM_WORDS   ),
+      .DataWidth   ( DATA_WIDTH  ),
+      .ByteWidth   ( BYTE_WIDTH  ),
+      .NumPorts    ( 32'd1       ),
+      .Latency     ( ReadLatency ),
+      .SimInit     ( SIM_INIT    ),
+      .PrintSimCfg ( 1'b0        )
+    ) i_tc_sram_user (
+      .clk_i   ( clk_i   ),
+      .rst_ni  ( rst_ni  ),
+      .req_i   ( req_i   ),
+      .we_i    ( we_i    ),
+      .be_i    ( be_i    ),
+      .wdata_i ( wuser_i ),
+      .addr_i  ( addr_i  ),
+      .rdata_o ( ruser_o )
+    );
+
+    // elaboration-time check of the width constraint described above
+    if (USER_WIDTH != DATA_WIDTH) begin : gen_err_data_user_width
+      $fatal(1, "sram_pulp: USER_WIDTH needs to be equal to DATA_WIDTH (if USER_EN is set).");
+    end
+
+  end else begin : gen_no_mem_user
+    assign ruser_o = '0;
+  end
+
+endmodule : sram
diff --git a/core/Flist.cva6 b/core/Flist.cva6
index cde9326997..cc8332393a 100644
--- a/core/Flist.cva6
+++ b/core/Flist.cva6
@@ -186,7 +186,7 @@ ${CVA6_REPO_DIR}/common/local/util/instr_tracer.sv
 ${CVA6_REPO_DIR}/common/local/util/tc_sram_wrapper.sv
 ${CVA6_REPO_DIR}/common/local/util/tc_sram_wrapper_cache_techno.sv
 ${CVA6_REPO_DIR}/vendor/pulp-platform/tech_cells_generic/src/rtl/tc_sram.sv
-${CVA6_REPO_DIR}/common/local/util/sram.sv
+${CVA6_REPO_DIR}/common/local/util/sram_pulp.sv
 ${CVA6_REPO_DIR}/common/local/util/sram_cache.sv
 
 // MMU 
diff --git a/core/cache_subsystem/cache_ctrl.sv b/core/cache_subsystem/cache_ctrl.sv
index c14aeae0f4..9e1dbcfc49 100644
--- a/core/cache_subsystem/cache_ctrl.sv
+++ b/core/cache_subsystem/cache_ctrl.sv
@@ -319,6 +319,9 @@ module cache_ctrl
 
           // set the correct byte enable
           be_o.data[cl_offset>>3+:CVA6Cfg.XLEN/8] = mem_req_q.be;
+          for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin
+            if (hit_way_q[i]) be_o.vldrty[i] = '{valid: 1, dirty: be_o.data};
+          end
           data_o.data[cl_offset+:CVA6Cfg.XLEN]    = mem_req_q.wdata;
           data_o.tag                              = mem_req_d.tag;
           // ~> change the state
diff --git a/core/cache_subsystem/miss_handler.sv b/core/cache_subsystem/miss_handler.sv
index f00ed8d5a1..8b159b4351 100644
--- a/core/cache_subsystem/miss_handler.sv
+++ b/core/cache_subsystem/miss_handler.sv
@@ -324,8 +324,11 @@ module miss_handler
           addr_o = mshr_q.addr[CVA6Cfg.DCACHE_INDEX_WIDTH-1:0];
           req_o = evict_way_q;
           we_o = 1'b1;
-          be_o = '1;
-          be_o.vldrty = evict_way_q;
+          be_o.tag  = '1;
+          be_o.data = '1;
+          for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin
+            if (evict_way_q[i]) be_o.vldrty[i] = '1;
+          end
           data_o.tag   = mshr_q.addr[CVA6Cfg.DCACHE_TAG_WIDTH+CVA6Cfg.DCACHE_INDEX_WIDTH-1:CVA6Cfg.DCACHE_INDEX_WIDTH];
           data_o.data = data_miss_fsm;
           data_o.valid = 1'b1;
@@ -372,9 +375,11 @@ module miss_handler
           we_o         = 1'b1;
           data_o.valid = INVALIDATE_ON_FLUSH ? 1'b0 : 1'b1;
           // invalidate
-          be_o.vldrty  = evict_way_q;
+          for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin
+            if (evict_way_q[i]) be_o.vldrty[i] = '1;
+          end
           // go back to handling the miss or flushing, depending on where we came from
-          state_d      = (state_q == WB_CACHELINE_MISS) ? MISS : FLUSH_REQ_STATUS;
+          state_d = (state_q == WB_CACHELINE_MISS) ? MISS : FLUSH_REQ_STATUS;
         end
       end
 
diff --git a/core/cache_subsystem/std_nbdcache.sv b/core/cache_subsystem/std_nbdcache.sv
index e81b50d9f0..caefcfea0b 100644
--- a/core/cache_subsystem/std_nbdcache.sv
+++ b/core/cache_subsystem/std_nbdcache.sv
@@ -244,30 +244,20 @@ module std_nbdcache
   // Valid/Dirty Regs
   // ----------------
 
-  // align each valid/dirty bit pair to a byte boundary in order to leverage byte enable signals.
-  // note: if you have an SRAM that supports flat bit enables for your target technology,
-  // you can use it here to save the extra 17x overhead introduced by this workaround.
-  logic [(DCACHE_LINE_WIDTH+8)*DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata;
+  vldrty_t [CVA6Cfg.DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata;
 
   for (genvar i = 0; i < CVA6Cfg.DCACHE_SET_ASSOC; i++) begin
-    for (genvar j = 0; j < DCACHE_LINE_WIDTH / 8; j++) begin
-      // dirty bits assignment
-      assign dirty_wdata[(DCACHE_LINE_WIDTH+8)*i+8*j] = wdata_ram.dirty[j];
-      assign rdata_ram[i].dirty[j]                    = dirty_rdata[(DCACHE_LINE_WIDTH+8)*i+8*j];
-    end
-    // valid bit assignment
-    assign dirty_wdata[DCACHE_LINE_WIDTH+(DCACHE_LINE_WIDTH+8)*i] = wdata_ram.valid;
-    assign rdata_ram[i].valid = dirty_rdata[DCACHE_LINE_WIDTH+(DCACHE_LINE_WIDTH+8)*i];
-  end
-
-  // be construction for valid_dirty_sram
-  for (genvar i = 0; i < CVA6Cfg.DCACHE_SET_ASSOC; i++) begin
-    assign be_valid_dirty_ram[i*(DCACHE_LINE_WIDTH/8+1)+:(DCACHE_LINE_WIDTH/8+1)] = {be_ram.vldrty[i], be_ram.data} & {(DCACHE_LINE_WIDTH/8+1){be_ram.vldrty[i]}};
+    assign dirty_wdata[i]              = '{dirty: wdata_ram.dirty, valid: wdata_ram.valid};
+    assign rdata_ram[i].dirty          = dirty_rdata[i].dirty;
+    assign rdata_ram[i].valid          = dirty_rdata[i].valid;
+    assign be_valid_dirty_ram[i].valid = be_ram.vldrty[i].valid;
+    assign be_valid_dirty_ram[i].dirty = be_ram.vldrty[i].dirty;
   end
 
   sram #(
       .USER_WIDTH(1),
-      .DATA_WIDTH((DCACHE_LINE_WIDTH + 8) * DCACHE_SET_ASSOC),
+      .DATA_WIDTH(DCACHE_SET_ASSOC * $bits(vldrty_t)),
+      .BYTE_WIDTH(1),
       .NUM_WORDS (CVA6Cfg.DCACHE_NUM_WORDS)
   ) valid_dirty_sram (
       .clk_i  (clk_i),
diff --git a/corev_apu/tb/ariane_tb.cpp b/corev_apu/tb/ariane_tb.cpp
index 8de0c5b20f..d7b4d2dcb2 100644
--- a/corev_apu/tb/ariane_tb.cpp
+++ b/corev_apu/tb/ariane_tb.cpp
@@ -332,11 +332,11 @@ int main(int argc, char **argv) {
   // Preload memory.
 #if (VERILATOR_VERSION_INTEGER >= 5000000)
   // Verilator v5: Use rootp pointer and .data() accessor.
-#define MEM top->rootp->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__i_tc_sram_wrapper__DOT__i_tc_sram__DOT__sram.m_storage
-#define MEM_USER top->rootp->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__gen_mem_user__DOT__i_tc_sram_wrapper_user__DOT__i_tc_sram__DOT__sram.m_storage
+#define MEM top->rootp->ariane_testharness__DOT__i_sram__DOT__i_tc_sram__DOT__sram.m_storage
+#define MEM_USER top->rootp->ariane_testharness__DOT__i_sram__DOT__gen_mem_user__DOT__i_tc_sram_user__DOT__sram.m_storage
 #else
   // Verilator v4
-#define MEM top->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__i_tc_sram_wrapper__DOT__i_tc_sram__DOT__sram
-#define MEM_USER top->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__gen_mem_user__DOT__i_tc_sram_wrapper_user__DOT__i_tc_sram__DOT__sram
+#define MEM top->ariane_testharness__DOT__i_sram__DOT__i_tc_sram__DOT__sram
+#define MEM_USER top->ariane_testharness__DOT__i_sram__DOT__gen_mem_user__DOT__i_tc_sram_user__DOT__sram
 #endif
   long long addr;
diff --git a/corev_apu/tb/ariane_tb.sv b/corev_apu/tb/ariane_tb.sv
index 22e3276512..98f95b4f68 100644
--- a/corev_apu/tb/ariane_tb.sv
+++ b/corev_apu/tb/ariane_tb.sv
@@ -20,7 +20,7 @@ import uvm_pkg::*;
 `include "uvm_macros.svh"
 `include "rvfi_types.svh"
 
-`define MAIN_MEM(P) dut.i_sram.gen_cut[0].i_tc_sram_wrapper.i_tc_sram.init_val[(``P``)]
-`define USER_MEM(P) dut.i_sram.gen_cut[0].gen_mem_user.i_tc_sram_wrapper_user.i_tc_sram.init_val[(``P``)]
+`define MAIN_MEM(P) dut.i_sram.i_tc_sram.init_val[(``P``)]
+`define USER_MEM(P) dut.i_sram.gen_mem_user.i_tc_sram_user.init_val[(``P``)]
 
 `ifndef READ_ELF_T