perf: use perfUtils in Utility (#179)

OpenXiangShan · Jul 11, 2024 · f9dffb2 · f9dffb2
1 parent 6e15b5b
commit f9dffb2
Show file tree

Hide file tree

Showing 12 changed files with 119 additions and 182 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -18,7 +18,7 @@ jobs:
   # This workflow contains a single job called "build"
   tl-test_L2:
     # The type of runner that the job will run on
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
 
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
@@ -35,7 +35,7 @@ jobs:
         uses: coursier/cache-action@v5
 
       - name: Verilator
-        run: sudo apt install verilator
+        run: sudo apt install verilator libsqlite3-dev
 
       - name: Setup Mill
         uses: jodersky/setup-mill@v0.2.3
@@ -60,7 +60,7 @@ jobs:
 
   tl-test_L2L3:
     # The type of runner that the job will run on
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
 
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
@@ -77,7 +77,7 @@ jobs:
         uses: coursier/cache-action@v5
 
       - name: Verilator
-        run: sudo apt install verilator
+        run: sudo apt install verilator libsqlite3-dev
 
       - name: Setup Mill
         uses: jodersky/setup-mill@v0.2.3

diff --git a/Utility b/Utility
diff --git a/rocket-chip b/rocket-chip
diff --git a/src/main/scala/huancun/DataStorage.scala b/src/main/scala/huancun/DataStorage.scala
@@ -22,7 +22,7 @@ package huancun
 import org.chipsalliance.cde.config.Parameters
 import chisel3._
 import chisel3.util._
-import huancun.utils.{SRAMWrapper, XSPerfAccumulate}
+import huancun.utils.SRAMWrapper
 import utility._
 
 class DataStorage(implicit p: Parameters) extends HuanCunModule {
@@ -264,7 +264,7 @@ class DataStorage(implicit p: Parameters) extends HuanCunModule {
   val debug_stack_used = PopCount(bank_en.grouped(stackSize).toList.map(seq => Cat(seq).orR))
 
   for (i <- 1 to nrStacks) {
-    XSPerfAccumulate(cacheParams, s"DS_${i}_stacks_used", debug_stack_used === i.U)
+    XSPerfAccumulate(s"DS_${i}_stacks_used", debug_stack_used === i.U)
   }
 
 }

diff --git a/src/main/scala/huancun/HuanCun.scala b/src/main/scala/huancun/HuanCun.scala
@@ -28,7 +28,7 @@ import freechips.rocketchip.tilelink._
 import freechips.rocketchip.tilelink.TLMessages._
 import freechips.rocketchip.util.{BundleField, BundleFieldBase, UIntToOH1}
 import huancun.prefetch._
-import utils.{ResetGen, XSPerfAccumulate}
+import utils.ResetGen
 import utility.{Pipeline, FastArbiter}
 import huancun.noninclusive.MSHR
 

diff --git a/src/main/scala/huancun/MSHRAlloc.scala b/src/main/scala/huancun/MSHRAlloc.scala
@@ -23,7 +23,7 @@ import org.chipsalliance.cde.config.Parameters
 import chisel3._
 import chisel3.util._
 import huancun.utils._
-import utility.{ParallelOR, ParallelPriorityMux}
+import utility._
 import freechips.rocketchip.tilelink._
 
 class MSHRSelector(implicit p: Parameters) extends HuanCunModule {
@@ -192,9 +192,9 @@ class MSHRAlloc(implicit p: Parameters) extends HuanCunModule {
       }
       val cntEnable =
         !io.status(i).valid && cnt =/= 0.U && cntStart && cnt < 5000.U // Ignore huge cnt during L3 dir reset
-      XSPerfHistogram(cacheParams, "mshr_latency_" + Integer.toString(i, 10), cnt, cntEnable, 0, 300, 10, rStrict = true)
-      XSPerfHistogram(cacheParams, "mshr_latency_" + Integer.toString(i, 10), cnt, cntEnable, 300, 1000, 50, lStrict = true)
-      XSPerfMax(cacheParams, "mshr_latency", cnt, cntEnable)
+      XSPerfHistogram("mshr_latency_" + Integer.toString(i, 10), cnt, cntEnable, 0, 300, 10, right_strict = true)
+      XSPerfHistogram("mshr_latency_" + Integer.toString(i, 10), cnt, cntEnable, 300, 1000, 50, left_strict = true)
+      XSPerfMax("mshr_latency", cnt, cntEnable)
     }
   }
 
@@ -203,13 +203,13 @@ class MSHRAlloc(implicit p: Parameters) extends HuanCunModule {
       (s.bits.set(block_granularity - 1, 0) === io.a_req.bits.set(block_granularity - 1, 0))
   ))
 
-  XSPerfAccumulate(cacheParams, "nrWorkingABCmshr", PopCount(io.status.init.init.map(_.valid)))
-  XSPerfAccumulate(cacheParams, "nrWorkingBmshr", io.status.take(mshrs+1).last.valid)
-  XSPerfAccumulate(cacheParams, "nrWorkingCmshr", io.status.last.valid)
-  XSPerfAccumulate(cacheParams, "conflictA", io.a_req.valid && conflict_a)
-  XSPerfAccumulate(cacheParams, "conflictByPrefetch", io.a_req.valid && Cat(pretch_block_vec).orR)
-  XSPerfAccumulate(cacheParams, "conflictB", io.b_req.valid && conflict_b)
-  XSPerfAccumulate(cacheParams, "conflictC", io.c_req.valid && conflict_c)
+  XSPerfAccumulate("nrWorkingABCmshr", PopCount(io.status.init.init.map(_.valid)))
+  XSPerfAccumulate("nrWorkingBmshr", io.status.take(mshrs+1).last.valid)
+  XSPerfAccumulate("nrWorkingCmshr", io.status.last.valid)
+  XSPerfAccumulate("conflictA", io.a_req.valid && conflict_a)
+  XSPerfAccumulate("conflictByPrefetch", io.a_req.valid && Cat(pretch_block_vec).orR)
+  XSPerfAccumulate("conflictB", io.b_req.valid && conflict_b)
+  XSPerfAccumulate("conflictC", io.c_req.valid && conflict_c)
   //val perfinfo = IO(new Bundle(){
   //  val perfEvents = Output(new PerfEventsBundle(numPCntHcMSHR))
   //})

diff --git a/src/main/scala/huancun/RequestBuffer.scala b/src/main/scala/huancun/RequestBuffer.scala
@@ -3,8 +3,7 @@ package huancun
 import org.chipsalliance.cde.config.Parameters
 import chisel3._
 import chisel3.util._
-import huancun.utils.XSPerfAccumulate
-import utility.FastArbiter
+import utility.{FastArbiter, XSPerfAccumulate}
 
 class RequestBuffer(flow: Boolean = true, entries: Int = 16)(implicit p: Parameters) extends HuanCunModule {
 
@@ -92,18 +91,18 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 16)(implicit p: Paramet
     }
   }
 
-  XSPerfAccumulate(cacheParams, "req_buffer_merge", dup && !full)
+  XSPerfAccumulate("req_buffer_merge", dup && !full)
   if(flow){
-    XSPerfAccumulate(cacheParams, "req_buffer_flow", no_ready_entry && io.in.fire)
+    XSPerfAccumulate("req_buffer_flow", no_ready_entry && io.in.fire)
   }
-  XSPerfAccumulate(cacheParams, "req_buffer_alloc", alloc)
-  XSPerfAccumulate(cacheParams, "req_buffer_full", full)
+  XSPerfAccumulate("req_buffer_alloc", alloc)
+  XSPerfAccumulate("req_buffer_full", full)
   for(i <- 0 until entries){
     val update = PopCount(valids) === i.U
-    XSPerfAccumulate(cacheParams, s"req_buffer_util_$i", update)
+    XSPerfAccumulate(s"req_buffer_util_$i", update)
   }
-  XSPerfAccumulate(cacheParams, "recv_prefetch", io.in.fire && io.in.bits.isPrefetch.getOrElse(false.B))
-  XSPerfAccumulate(cacheParams, "recv_normal", io.in.fire && !io.in.bits.isPrefetch.getOrElse(false.B))
+  XSPerfAccumulate("recv_prefetch", io.in.fire && io.in.bits.isPrefetch.getOrElse(false.B))
+  XSPerfAccumulate("recv_normal", io.in.fire && !io.in.bits.isPrefetch.getOrElse(false.B))
   val perfinfo = IO(Output(Vec(numPCntHcReqb, (UInt(6.W)))))
   val perfEvents = Seq(
     ("req_buffer_merge          ", dup && !full                                             ),

diff --git a/src/main/scala/huancun/TopDownMonitor.scala b/src/main/scala/huancun/TopDownMonitor.scala
@@ -4,8 +4,7 @@ import org.chipsalliance.cde.config.Parameters
 import chisel3._
 import chisel3.util._
 import huancun.noninclusive.DirResult
-import huancun.utils.{XSPerfAccumulate, XSPerfHistogram}
-import utility.MemReqSource
+import utility.{MemReqSource, XSPerfAccumulate, XSPerfHistogram}
 
 class TopDownMonitor()(implicit p: Parameters) extends HuanCunModule {
   val banks = 1 << bankBits
@@ -36,7 +35,7 @@ class TopDownMonitor()(implicit p: Parameters) extends HuanCunModule {
     }
 
     addrMatch := Cat(addrMatchVec.flatten).orR
-    XSPerfAccumulate(cacheParams, s"${cacheParams.name}MissMatch_${hartId}", addrMatch)
+    XSPerfAccumulate(s"${cacheParams.name}MissMatch_${hartId}", addrMatch)
   }
 
   /* ====== PART TWO ======
@@ -56,16 +55,16 @@ class TopDownMonitor()(implicit p: Parameters) extends HuanCunModule {
   // val missVecAll      = allMSHRMatchVec(s => s.fromA && s.is_miss)
 
   val totalMSHRs = banks * mshrsAll
-  XSPerfHistogram(cacheParams, "parallel_misses_CPU" , PopCount(missVecCPU), true.B, 0, totalMSHRs, 1)
-  XSPerfHistogram(cacheParams, "parallel_misses_Pref", PopCount(missVecPref), true.B, 0, totalMSHRs, 1)
-  XSPerfHistogram(cacheParams, "parallel_misses_All" , PopCount(missVecCPU)+PopCount(missVecPref), true.B, 0, 32, 1)
+  XSPerfHistogram("parallel_misses_CPU" , PopCount(missVecCPU), true.B, 0, totalMSHRs, 1)
+  XSPerfHistogram("parallel_misses_Pref", PopCount(missVecPref), true.B, 0, totalMSHRs, 1)
+  XSPerfHistogram("parallel_misses_All" , PopCount(missVecCPU)+PopCount(missVecPref), true.B, 0, 32, 1)
 
   /* ====== PART THREE ======
  * Distinguish req sources and count num & miss
  */
   // count releases
   val releaseCnt = allMSHRMatchVec(s => s.will_free && s.fromC)
-  XSPerfAccumulate(cacheParams, s"${cacheParams.name}C_ReleaseCnt_Total", PopCount(releaseCnt))
+  XSPerfAccumulate(s"${cacheParams.name}C_ReleaseCnt_Total", PopCount(releaseCnt))
 
   // we can follow the counting logic of Directory to count
   // add reqSource in replacerInfo, set in MSHRAlloc, passes in Directory and get the result in DirResult
@@ -81,7 +80,7 @@ class TopDownMonitor()(implicit p: Parameters) extends HuanCunModule {
     val sourceMatchVecMiss = dirResultMatchVec(r => r.replacerInfo.reqSource === i.U && !r.self.hit)
 
     val sourceName = MemReqSource.apply(i).toString
-    XSPerfAccumulate(cacheParams, s"E2_${cacheParams.name}AReqSource_${sourceName}_Total", PopCount(sourceMatchVec))
-    XSPerfAccumulate(cacheParams, s"E2_${cacheParams.name}AReqSource_${sourceName}_Miss", PopCount(sourceMatchVecMiss))
+    XSPerfAccumulate(s"E2_${cacheParams.name}AReqSource_${sourceName}_Total", PopCount(sourceMatchVec))
+    XSPerfAccumulate(s"E2_${cacheParams.name}AReqSource_${sourceName}_Miss", PopCount(sourceMatchVecMiss))
   }
 }
diff --git a/src/main/scala/huancun/noninclusive/Directory.scala b/src/main/scala/huancun/noninclusive/Directory.scala
@@ -8,7 +8,7 @@ import huancun.MetaData._
 import huancun._
 import huancun.debug.{DirectoryLogger, TypeId}
 import huancun.utils._
-import utility.{ParallelMax, ParallelPriorityMux}
+import utility.{GTimer, ParallelMax, ParallelPriorityMux, XSPerfAccumulate}
 
 trait HasClientInfo { this: HasHuanCunParameters =>
   // assume all clients have same params
@@ -316,18 +316,18 @@ class Directory(implicit p: Parameters)
 
   assert(dirReadPorts == 1)
   val req_r = RegEnable(req.bits, req.fire)
-  XSPerfAccumulate(cacheParams, "selfdir_A_req", req_r.replacerInfo.channel(0) && resp.valid)
-  XSPerfAccumulate(cacheParams, "selfdir_A_hit", RegNext(req_r.replacerInfo.channel(0) && resp.valid) && resp.bits.self.hit)
-  XSPerfAccumulate(cacheParams, "selfdir_B_req", req_r.replacerInfo.channel(1) && resp.valid)
-  XSPerfAccumulate(cacheParams, "selfdir_B_hit", RegNext(req_r.replacerInfo.channel(1) && resp.valid) && resp.bits.self.hit)
-  XSPerfAccumulate(cacheParams, "selfdir_C_req", req_r.replacerInfo.channel(2) && resp.valid)
-  XSPerfAccumulate(cacheParams, "selfdir_C_hit", RegNext(req_r.replacerInfo.channel(2) && resp.valid) && resp.bits.self.hit)
+  XSPerfAccumulate("selfdir_A_req", req_r.replacerInfo.channel(0) && resp.valid)
+  XSPerfAccumulate("selfdir_A_hit", RegNext(req_r.replacerInfo.channel(0) && resp.valid) && resp.bits.self.hit)
+  XSPerfAccumulate("selfdir_B_req", req_r.replacerInfo.channel(1) && resp.valid)
+  XSPerfAccumulate("selfdir_B_hit", RegNext(req_r.replacerInfo.channel(1) && resp.valid) && resp.bits.self.hit)
+  XSPerfAccumulate("selfdir_C_req", req_r.replacerInfo.channel(2) && resp.valid)
+  XSPerfAccumulate("selfdir_C_hit", RegNext(req_r.replacerInfo.channel(2) && resp.valid) && resp.bits.self.hit)
 
-  XSPerfAccumulate(cacheParams, "selfdir_dirty", RegNext(resp.valid) && resp.bits.self.dirty)
-  XSPerfAccumulate(cacheParams, "selfdir_TIP", RegNext(resp.valid) && resp.bits.self.state === TIP)
-  XSPerfAccumulate(cacheParams, "selfdir_BRANCH", RegNext(resp.valid) && resp.bits.self.state === BRANCH)
-  XSPerfAccumulate(cacheParams, "selfdir_TRUNK", RegNext(resp.valid) && resp.bits.self.state === TRUNK)
-  XSPerfAccumulate(cacheParams, "selfdir_INVALID", RegNext(resp.valid) && resp.bits.self.state === INVALID)
+  XSPerfAccumulate("selfdir_dirty", RegNext(resp.valid) && resp.bits.self.dirty)
+  XSPerfAccumulate("selfdir_TIP", RegNext(resp.valid) && resp.bits.self.state === TIP)
+  XSPerfAccumulate("selfdir_BRANCH", RegNext(resp.valid) && resp.bits.self.state === BRANCH)
+  XSPerfAccumulate("selfdir_TRUNK", RegNext(resp.valid) && resp.bits.self.state === TRUNK)
+  XSPerfAccumulate("selfdir_INVALID", RegNext(resp.valid) && resp.bits.self.state === INVALID)
   //val perfinfo = IO(new Bundle(){
   //  val perfEvents = Output(new PerfEventsBundle(numPCntHcDir))
   //})

diff --git a/src/main/scala/huancun/noninclusive/ProbeHelper.scala b/src/main/scala/huancun/noninclusive/ProbeHelper.scala
@@ -5,8 +5,7 @@ import chisel3._
 import chisel3.util._
 import freechips.rocketchip.tilelink.{TLMessages, TLPermissions}
 import huancun.{HuanCunModule, MSHRRequest, MetaData}
-import huancun.utils.XSPerfAccumulate
-import utility.MemReqSource
+import utility.{MemReqSource, XSPerfAccumulate}
 
 class ProbeHelper(entries: Int = 5, enqDelay: Int = 1)(implicit p: Parameters)
   extends HuanCunModule with HasClientInfo
@@ -65,7 +64,7 @@ class ProbeHelper(entries: Int = 5, enqDelay: Int = 1)(implicit p: Parameters)
 
   io.probe <> queue.io.deq
 
-  XSPerfAccumulate(cacheParams, "client_dir_conflict", queue.io.enq.fire)
+  XSPerfAccumulate("client_dir_conflict", queue.io.enq.fire)
   //val perfinfo = IO(new Bundle(){
   //  val perfEvents = Output(new PerfEventsBundle(numPCntHcReqb))
   //})

diff --git a/src/main/scala/huancun/utils/XSPerfAccumulate.scala b/src/main/scala/huancun/utils/XSPerfAccumulate.scala