From 03202413a679354e5e43bd3f77f43aa51f2f24f5 Mon Sep 17 00:00:00 2001 From: pengxiao Date: Sat, 14 Sep 2024 15:31:26 +0800 Subject: [PATCH] Bpu: Optimize CGE of bpu/predictors_io_update by moving update regs into predictors, except for the update PC --- src/main/scala/xiangshan/frontend/BPU.scala | 9 +++-- src/main/scala/xiangshan/frontend/FTB.scala | 34 +++++++++------- .../scala/xiangshan/frontend/FauFTB.scala | 40 ++++++++++--------- .../scala/xiangshan/frontend/ITTAGE.scala | 24 +++++++++-- src/main/scala/xiangshan/frontend/SC.scala | 4 +- src/main/scala/xiangshan/frontend/Tage.scala | 34 ++++++++++++---- .../scala/xiangshan/frontend/newRAS.scala | 16 +++++--- 7 files changed, 106 insertions(+), 55 deletions(-) diff --git a/src/main/scala/xiangshan/frontend/BPU.scala b/src/main/scala/xiangshan/frontend/BPU.scala index bef4a76f8e..cf50e235f6 100644 --- a/src/main/scala/xiangshan/frontend/BPU.scala +++ b/src/main/scala/xiangshan/frontend/BPU.scala @@ -763,10 +763,11 @@ class Predictor(implicit p: Parameters) extends XSModule with HasBPUConst with H io.bpu_to_ftq.resp.bits.s3.hasRedirect.zip(s3_redirect_dup).map {case (hr, r) => hr := r} io.bpu_to_ftq.resp.bits.s3.ftq_idx := s3_ftq_idx - predictors.io.update.valid := RegNext(io.ftq_to_bpu.update.valid, init = false.B) - predictors.io.update.bits := RegEnable(io.ftq_to_bpu.update.bits, io.ftq_to_bpu.update.valid) - predictors.io.update.bits.ghist := RegEnable( - getHist(io.ftq_to_bpu.update.bits.spec_info.histPtr), io.ftq_to_bpu.update.valid) + + predictors.io.update := io.ftq_to_bpu.update + predictors.io.update.bits.ghist := getHist(io.ftq_to_bpu.update.bits.spec_info.histPtr) + // Move the update pc registers out of predictors. + predictors.io.update.bits.pc := SegmentedAddrNext(io.ftq_to_bpu.update.bits.pc, pcSegments, io.ftq_to_bpu.update.valid, Some("predictors_io_update_pc")).getAddr() val redirect_dup = do_redirect_dup.map(_.bits) predictors.io.redirect := do_redirect_dup(0) diff --git a/src/main/scala/xiangshan/frontend/FTB.scala b/src/main/scala/xiangshan/frontend/FTB.scala index 0afbaed74d..ba46051f18 100644 --- a/src/main/scala/xiangshan/frontend/FTB.scala +++ b/src/main/scala/xiangshan/frontend/FTB.scala @@ -684,10 +684,18 @@ class FTB(implicit p: Parameters) extends BasePredictor with FTBParams with BPUU s0_close_ftb_req := true.B } + val update_valid = RegNext(io.update.valid, init = false.B) + val update = Wire(new BranchPredictionUpdate) + update := RegEnable(io.update.bits, io.update.valid) + val update_pc = io.update.bits.pc // Move the update pc registers out of predictors. + + // To improve Clock Gating Efficiency + update.meta := RegEnable(io.update.bits.meta, io.update.valid && !io.update.bits.old_entry) + //Clear counter during false_hit or ifuRedirect val ftb_false_hit = WireInit(false.B) val needReopen = s0_close_ftb_req && (ftb_false_hit || io.redirectFromIFU) - ftb_false_hit := io.update.valid && io.update.bits.false_hit + ftb_false_hit := update_valid && update.false_hit when(needReopen){ fauftb_ftb_entry_consistent_counter := 0.U s0_close_ftb_req := false.B @@ -751,12 +759,10 @@ class FTB(implicit p: Parameters) extends BasePredictor with FTBParams with BPUU } // Update logic - val update = io.update.bits - val u_meta = update.meta.asTypeOf(new FTBMeta) - val u_valid = io.update.valid && !io.update.bits.old_entry + val u_valid = update_valid && !update.old_entry - val (_, delay2_pc) = DelayNWithValid(update.pc, u_valid, 2) + val (_, delay2_pc) = DelayNWithValid(update_pc, u_valid, 2) val (_, delay2_entry) = DelayNWithValid(update.ftb_entry, u_valid, 2) @@ -766,16 +772,16 @@ class FTB(implicit p: Parameters) extends BasePredictor with FTBParams with BPUU io.s1_ready := ftbBank.io.req_pc.ready && !(update_need_read) && !RegNext(update_need_read) ftbBank.io.u_req_pc.valid := update_need_read - ftbBank.io.u_req_pc.bits := update.pc + ftbBank.io.u_req_pc.bits := update_pc val ftb_write = Wire(new FTBEntryWithTag) ftb_write.entry := Mux(update_now, update.ftb_entry, delay2_entry) - ftb_write.tag := ftbAddr.getTag(Mux(update_now, update.pc, delay2_pc))(tagSize-1, 0) + ftb_write.tag := ftbAddr.getTag(Mux(update_now, update_pc, delay2_pc))(tagSize-1, 0) val write_valid = update_now || DelayN(u_valid && !u_meta.hit, 2) - val write_pc = Mux(update_now, update.pc, delay2_pc) + val write_pc = Mux(update_now, update_pc, delay2_pc) ftbBank.io.update_write_data.valid := write_valid ftbBank.io.update_write_data.bits := ftb_write @@ -801,16 +807,16 @@ class FTB(implicit p: Parameters) extends BasePredictor with FTBParams with BPUU XSPerfAccumulate("ftb_read_hits", RegNext(io.s0_fire(0)) && s1_hit) XSPerfAccumulate("ftb_read_misses", RegNext(io.s0_fire(0)) && !s1_hit) - XSPerfAccumulate("ftb_commit_hits", io.update.valid && u_meta.hit) - XSPerfAccumulate("ftb_commit_misses", io.update.valid && !u_meta.hit) + XSPerfAccumulate("ftb_commit_hits", update_valid && u_meta.hit) + XSPerfAccumulate("ftb_commit_misses", update_valid && !u_meta.hit) - XSPerfAccumulate("ftb_update_req", io.update.valid) - XSPerfAccumulate("ftb_update_ignored", io.update.valid && io.update.bits.old_entry) + XSPerfAccumulate("ftb_update_req", update_valid) + XSPerfAccumulate("ftb_update_ignored", update_valid && update.old_entry) XSPerfAccumulate("ftb_updated", u_valid) override val perfEvents = Seq( - ("ftb_commit_hits ", io.update.valid && u_meta.hit), - ("ftb_commit_misses ", io.update.valid && !u_meta.hit), + ("ftb_commit_hits ", update_valid && u_meta.hit), + ("ftb_commit_misses ", update_valid && !u_meta.hit), ) generatePerfEvent() } diff --git a/src/main/scala/xiangshan/frontend/FauFTB.scala b/src/main/scala/xiangshan/frontend/FauFTB.scala index 505bef8721..1221e8ae46 100644 --- a/src/main/scala/xiangshan/frontend/FauFTB.scala +++ b/src/main/scala/xiangshan/frontend/FauFTB.scala @@ -142,25 +142,27 @@ class FauFTB(implicit p: Parameters) extends BasePredictor with FauFTBParams { // s1: alloc_way and write // s0 - val u = io.update - val u_meta = u.bits.meta.asTypeOf(new FauFTBMeta) - val u_s0_tag = getTag(u.bits.pc) + val u_valid = RegNext(io.update.valid, init = false.B) + val u_bits = RegEnable(io.update.bits, io.update.valid) + val u_pc = io.update.bits.pc // Move the update pc registers out of predictors. + val u_meta = u_bits.meta.asTypeOf(new FauFTBMeta) + val u_s0_tag = getTag(u_pc) ways.foreach(_.io.update_req_tag := u_s0_tag) val u_s0_hit_oh = VecInit(ways.map(_.io.update_hit)).asUInt val u_s0_hit = u_s0_hit_oh.orR val u_s0_br_update_valids = VecInit((0 until numBr).map(w => - u.bits.ftb_entry.brValids(w) && u.valid && !u.bits.ftb_entry.always_taken(w) && - !(PriorityEncoder(u.bits.br_taken_mask) < w.U))) + u_bits.ftb_entry.brValids(w) && u_valid && !u_bits.ftb_entry.always_taken(w) && + !(PriorityEncoder(u_bits.br_taken_mask) < w.U))) // s1 - val u_s1_valid = RegNext(u.valid) - val u_s1_tag = RegEnable(u_s0_tag, u.valid) - val u_s1_hit_oh = RegEnable(u_s0_hit_oh, u.valid) - val u_s1_hit = RegEnable(u_s0_hit, u.valid) + val u_s1_valid = RegNext(u_valid) + val u_s1_tag = RegEnable(u_s0_tag, u_valid) + val u_s1_hit_oh = RegEnable(u_s0_hit_oh, u_valid) + val u_s1_hit = RegEnable(u_s0_hit, u_valid) val u_s1_alloc_way = replacer.way val u_s1_write_way_oh = Mux(u_s1_hit, u_s1_hit_oh, UIntToOH(u_s1_alloc_way)) - val u_s1_ftb_entry = RegEnable(u.bits.ftb_entry, u.valid) + val u_s1_ftb_entry = RegEnable(u_bits.ftb_entry, u_valid) val u_s1_ways_write_valid = VecInit((0 until numWays).map(w => u_s1_write_way_oh(w).asBool && u_s1_valid)) for (w <- 0 until numWays) { ways(w).io.write_valid := u_s1_ways_write_valid(w) @@ -169,15 +171,15 @@ class FauFTB(implicit p: Parameters) extends BasePredictor with FauFTBParams { } // Illegal check for FTB entry writing - val uftb_write_pc = RegEnable(u.bits.pc, u.valid) + val uftb_write_pc = RegEnable(u_pc, u_valid) val uftb_write_fallThrough = u_s1_ftb_entry.getFallThrough(uftb_write_pc) when(u_s1_valid && u_s1_hit){ assert(uftb_write_pc + (FetchWidth * 4).U >= uftb_write_fallThrough, s"FauFTB write entry fallThrough address error!") } // update saturating counters - val u_s1_br_update_valids = RegEnable(u_s0_br_update_valids, u.valid) - val u_s1_br_takens = RegEnable(u.bits.br_taken_mask, u.valid) + val u_s1_br_update_valids = RegEnable(u_s0_br_update_valids, u_valid) + val u_s1_br_takens = RegEnable(u_bits.br_taken_mask, u_valid) for (w <- 0 until numWays) { when (u_s1_ways_write_valid(w)) { for (br <- 0 until numBr) { @@ -201,24 +203,24 @@ class FauFTB(implicit p: Parameters) extends BasePredictor with FauFTBParams { val u_pred_hit_way_map = (0 until numWays).map(w => s0_fire_next_cycle && s1_hit && s1_hit_way === w.U) XSPerfAccumulate("uftb_read_hits", s0_fire_next_cycle && s1_hit) XSPerfAccumulate("uftb_read_misses", s0_fire_next_cycle && !s1_hit) - XSPerfAccumulate("uftb_commit_hits", u.valid && u_meta.hit) - XSPerfAccumulate("uftb_commit_misses", u.valid && !u_meta.hit) - XSPerfAccumulate("uftb_commit_read_hit_pred_miss", u.valid && !u_meta.hit && u_s0_hit_oh.orR) + XSPerfAccumulate("uftb_commit_hits", u_valid && u_meta.hit) + XSPerfAccumulate("uftb_commit_misses", u_valid && !u_meta.hit) + XSPerfAccumulate("uftb_commit_read_hit_pred_miss", u_valid && !u_meta.hit && u_s0_hit_oh.orR) for (w <- 0 until numWays) { XSPerfAccumulate(f"uftb_pred_hit_way_${w}", u_pred_hit_way_map(w)) XSPerfAccumulate(f"uftb_replace_way_${w}", !u_s1_hit && u_s1_alloc_way === w.U) } if(u_meta.pred_way.isDefined) { - val u_commit_hit_way_map = (0 until numWays).map(w => u.valid && u_meta.hit && u_meta.pred_way.get === w.U) + val u_commit_hit_way_map = (0 until numWays).map(w => u_valid && u_meta.hit && u_meta.pred_way.get === w.U) for (w <- 0 until numWays) { XSPerfAccumulate(f"uftb_commit_hit_way_${w}", u_commit_hit_way_map(w)) } } override val perfEvents = Seq( - ("fauftb_commit_hit ", u.valid && u_meta.hit), - ("fauftb_commit_miss ", u.valid && !u_meta.hit), + ("fauftb_commit_hit ", u_valid && u_meta.hit), + ("fauftb_commit_miss ", u_valid && !u_meta.hit), ) generatePerfEvent() diff --git a/src/main/scala/xiangshan/frontend/ITTAGE.scala b/src/main/scala/xiangshan/frontend/ITTAGE.scala index 8852f629e7..3f70932ede 100644 --- a/src/main/scala/xiangshan/frontend/ITTAGE.scala +++ b/src/main/scala/xiangshan/frontend/ITTAGE.scala @@ -412,8 +412,24 @@ class ITTage(implicit p: Parameters) extends BaseITTage { io.out.last_stage_meta := resp_meta.asUInt // Update logic - val u_valid = io.update.valid - val update = io.update.bits + val u_valid = RegNext(io.update.valid, init = false.B) + + val update = Wire(new BranchPredictionUpdate) + update := RegEnable(io.update.bits, io.update.valid) + val update_pc = io.update.bits.pc // Move the update pc registers out of predictors. + + // To improve Clock Gating Efficiency + val u_meta = io.update.bits.meta.asTypeOf(new ITTageMeta) + update.meta.asTypeOf(new ITTageMeta).provider.bits := RegEnable(u_meta.provider.bits , io.update.valid && u_meta.provider.valid ) + update.meta.asTypeOf(new ITTageMeta).providerTarget := RegEnable(u_meta.providerTarget , io.update.valid && u_meta.provider.valid ) + update.meta.asTypeOf(new ITTageMeta).allocate.bits := RegEnable(u_meta.allocate.bits , io.update.valid && u_meta.allocate.valid ) + update.meta.asTypeOf(new ITTageMeta).altProvider.bits := RegEnable(u_meta.altProvider.bits , io.update.valid && u_meta.altProvider.valid) + update.meta.asTypeOf(new ITTageMeta).altProviderTarget := RegEnable(u_meta.altProviderTarget, + io.update.valid && u_meta.provider.valid && u_meta.altProvider.valid && u_meta.providerCtr === 0.U) + update.full_target := RegEnable(io.update.bits.full_target, io.update.valid && (u_meta.provider.valid || io.update.bits.mispred_mask(numBr))) + update.cfi_idx.bits := RegEnable(io.update.bits.cfi_idx.bits, io.update.valid && io.update.bits.cfi_idx.valid) + update.ghist := RegEnable(io.update.bits.ghist, io.update.valid) // TODO: CGE + val updateValid = update.is_jalr && !update.is_ret && u_valid && update.ftb_entry.jmpValid && update.jmp_taken && update.cfi_idx.valid && update.cfi_idx.bits === update.ftb_entry.tailSlot.offset @@ -585,7 +601,7 @@ class ITTage(implicit p: Parameters) extends BaseITTage { tables(i).io.update.uValid := RegEnable(updateUMask(i), false.B, updateMask(i)) tables(i).io.update.u := RegEnable(updateU(i), updateMask(i)) - tables(i).io.update.pc := RegEnable(update.pc, updateMask(i)) + tables(i).io.update.pc := RegEnable(update_pc, updateMask(i)) // use fetch pc instead of instruction pc tables(i).io.update.ghist := RegEnable(update.ghist, updateMask(i)) } @@ -660,7 +676,7 @@ class ITTage(implicit p: Parameters) extends BaseITTage { s2_resps_regs(i).bits.u, s2_resps_regs(i).bits.target) } } - XSDebug(updateValid, p"pc: ${Hexadecimal(update.pc)}, target: ${Hexadecimal(update.full_target)}\n") + XSDebug(updateValid, p"pc: ${Hexadecimal(update_pc)}, target: ${Hexadecimal(update.full_target)}\n") XSDebug(updateValid, updateMeta.toPrintable+p"\n") XSDebug(updateValid, p"correct(${!updateMisPred})\n") diff --git a/src/main/scala/xiangshan/frontend/SC.scala b/src/main/scala/xiangshan/frontend/SC.scala index 477a0a14fe..0d848ebc46 100644 --- a/src/main/scala/xiangshan/frontend/SC.scala +++ b/src/main/scala/xiangshan/frontend/SC.scala @@ -380,8 +380,8 @@ trait HasSC extends HasSCParameter with HasPerfEvents { this: Tage => scTables(i).io.update.tagePreds(b) := RegEnable(scUpdateTagePreds(b), realWen) scTables(i).io.update.takens(b) := RegEnable(scUpdateTakens(b), realWen) scTables(i).io.update.oldCtrs(b) := RegEnable(scUpdateOldCtrs(b)(i), realWen) - scTables(i).io.update.pc := RegEnable(update.pc, realWen) - scTables(i).io.update.ghist := RegEnable(io.update.bits.ghist, realWen) + scTables(i).io.update.pc := RegEnable(update_pc, realWen) + scTables(i).io.update.ghist := RegEnable(update.ghist, realWen) } } diff --git a/src/main/scala/xiangshan/frontend/Tage.scala b/src/main/scala/xiangshan/frontend/Tage.scala index 2c14ce1ce1..de91b75b04 100644 --- a/src/main/scala/xiangshan/frontend/Tage.scala +++ b/src/main/scala/xiangshan/frontend/Tage.scala @@ -607,8 +607,28 @@ class Tage(implicit p: Parameters) extends BaseTage { val resp_s2 = io.out.s2 // Update logic - val u_valid = io.update.valid - val update = io.update.bits + val u_valid = RegNext(io.update.valid, init = false.B) + val update = Wire(new BranchPredictionUpdate) + update := RegEnable(io.update.bits, io.update.valid) + val update_pc = io.update.bits.pc // Move the update pc registers out of predictors. + + // To improve Clock Gating Efficiency + val u_valids_for_cge = VecInit((0 until TageBanks).map(w => io.update.bits.ftb_entry.brValids(w) && io.update.valid)) // io.update.bits.ftb_entry.always_taken has timing issues(FTQEntryGen) + val u_meta = io.update.bits.meta.asTypeOf(new TageMeta) + for(i <- 0 until numBr){ + update.meta.asTypeOf(new TageMeta).providers(i).bits := RegEnable(u_meta.providers(i).bits, u_meta.providers(i).valid && u_valids_for_cge(i)) + update.meta.asTypeOf(new TageMeta).providerResps(i) := RegEnable(u_meta.providerResps(i), u_meta.providers(i).valid && u_valids_for_cge(i)) + update.meta.asTypeOf(new TageMeta).altUsed(i) := RegEnable(u_meta.altUsed(i), u_valids_for_cge(i)) + update.meta.asTypeOf(new TageMeta).allocates(i) := RegEnable(u_meta.allocates(i), io.update.valid && io.update.bits.mispred_mask(i)) + } + if(EnableSC){ + for(w <- 0 until TageBanks){ + update.meta.asTypeOf(new TageMeta).scMeta.get.scPreds(w) := RegEnable(u_meta.scMeta.get.scPreds(w), u_valids_for_cge(w) && u_meta.providers(w).valid) + update.meta.asTypeOf(new TageMeta).scMeta.get.ctrs(w) := RegEnable(u_meta.scMeta.get.ctrs(w), u_valids_for_cge(w) && u_meta.providers(w).valid) + } + } + update.ghist := RegEnable(io.update.bits.ghist, io.update.valid) // TODO: CGE + val updateValids = VecInit((0 until TageBanks).map(w => update.ftb_entry.brValids(w) && u_valid && !update.ftb_entry.always_taken(w) && !(PriorityEncoder(update.br_taken_mask) < w.U))) @@ -714,7 +734,7 @@ class Tage(implicit p: Parameters) extends BaseTage { val updateProviderCorrect = updateProviderResp.ctr(TageCtrBits-1) === updateTaken val updateUseAlt = updateMeta.altUsed(i) val updateAltDiffers = updateMeta.altDiffers(i) - val updateAltIdx = use_alt_idx(update.pc) + val updateAltIdx = use_alt_idx(update_pc) val updateUseAltCtr = Mux1H(UIntToOH(updateAltIdx, NUM_USE_ALT_ON_NA), useAltOnNaCtrs(i)) val updateAltPred = updateMeta.altPreds(i) val updateAltCorrect = updateAltPred === updateTaken @@ -836,13 +856,13 @@ class Tage(implicit p: Parameters) extends BaseTage { tables(i).io.update.uMask(w) := RegEnable(updateUMask(w)(i), realWen) tables(i).io.update.us(w) := RegEnable(updateU(w)(i), realWen) // use fetch pc instead of instruction pc - tables(i).io.update.pc := RegEnable(update.pc, realWen) - tables(i).io.update.ghist := RegEnable(io.update.bits.ghist, realWen) + tables(i).io.update.pc := RegEnable(update_pc, realWen) + tables(i).io.update.ghist := RegEnable(update.ghist, realWen) } } bt.io.update_mask := RegNext(baseupdate) bt.io.update_cnt := RegEnable(updatebcnt, baseupdate.reduce(_ | _)) - bt.io.update_pc := RegEnable(update.pc, baseupdate.reduce(_ | _)) + bt.io.update_pc := RegEnable(update_pc, baseupdate.reduce(_ | _)) bt.io.update_takens := RegEnable(bUpdateTakens, baseupdate.reduce(_ | _)) // all should be ready for req @@ -894,7 +914,7 @@ class Tage(implicit p: Parameters) extends BaseTage { val m = updateMeta // val bri = u.metas(b) XSDebug(updateValids(b), "update(%d): pc=%x, cycle=%d, taken:%b, misPred:%d, bimctr:%d, pvdr(%d):%d, altDiff:%d, pvdrU:%d, pvdrCtr:%d, alloc:%b\n", - b.U, update.pc, 0.U, update.br_taken_mask(b), update.mispred_mask(b), + b.U, update_pc, 0.U, update.br_taken_mask(b), update.mispred_mask(b), 0.U, m.providers(b).valid, m.providers(b).bits, m.altDiffers(b), m.providerResps(b).u, m.providerResps(b).ctr, m.allocates(b) ) diff --git a/src/main/scala/xiangshan/frontend/newRAS.scala b/src/main/scala/xiangshan/frontend/newRAS.scala index 13bfd9bb54..e18a6b5a33 100644 --- a/src/main/scala/xiangshan/frontend/newRAS.scala +++ b/src/main/scala/xiangshan/frontend/newRAS.scala @@ -651,14 +651,20 @@ class RAS(implicit p: Parameters) extends BasePredictor { stack.redirect_meta_NOS := recover_cfi.NOS stack.redirect_callAddr := recover_cfi.pc + Mux(recover_cfi.pd.isRVC, 2.U, 4.U) - val update = io.update.bits - val updateMeta = io.update.bits.meta.asTypeOf(new RASMeta) - val updateValid = io.update.valid + val updateValid = RegNext(io.update.valid, init = false.B) + val update = Wire(new BranchPredictionUpdate) + update := RegEnable(io.update.bits, io.update.valid) + val update_pc = io.update.bits.pc // Move the update pc registers out of predictors. - stack.commit_valid := updateValid + // To improve Clock Gating Efficiency + update.meta := RegEnable(io.update.bits.meta, io.update.valid && (io.update.bits.is_call || io.update.bits.is_ret)) + + val updateMeta = update.meta.asTypeOf(new RASMeta) + + stack.commit_valid := updateValid stack.commit_push_valid := updateValid && update.is_call_taken stack.commit_pop_valid := updateValid && update.is_ret_taken - stack.commit_push_addr := update.ftb_entry.getFallThrough(update.pc) + Mux(update.ftb_entry.last_may_be_rvi_call, 2.U, 0.U) + stack.commit_push_addr := update.ftb_entry.getFallThrough(update_pc) + Mux(update.ftb_entry.last_may_be_rvi_call, 2.U, 0.U) stack.commit_meta_TOSW := updateMeta.TOSW stack.commit_meta_ssp := updateMeta.ssp