diff --git a/Bender.yml b/Bender.yml index b635aa07..8eff817d 100644 --- a/Bender.yml +++ b/Bender.yml @@ -44,6 +44,7 @@ sources: - src/fpnew_fma_multi.sv - src/fpnew_sdotp_multi.sv - src/fpnew_sdotp_multi_wrapper.sv + - src/fpnew_vfshuffle_multi.sv - src/fpnew_noncomp.sv - src/fpnew_opgroup_block.sv - src/fpnew_opgroup_fmt_slice.sv diff --git a/src/fpnew_opgroup_multifmt_slice.sv b/src/fpnew_opgroup_multifmt_slice.sv index f5991cbd..546e4280 100644 --- a/src/fpnew_opgroup_multifmt_slice.sv +++ b/src/fpnew_opgroup_multifmt_slice.sv @@ -176,7 +176,8 @@ or on 16b inputs producing 32b outputs"); for (genvar lane = 0; lane < int'(NUM_LANES); lane++) begin : gen_num_lanes localparam int unsigned LANE = unsigned'(lane); // unsigned to please the linter // Get a mask of active formats for this lane - localparam fpnew_pkg::fmt_logic_t ACTIVE_FORMATS = + localparam fpnew_pkg::fmt_logic_t ACTIVE_FORMATS = (OpGroup == fpnew_pkg::SHFL) ? + fpnew_pkg::get_vfshfl_lane_formats(Width, FpFmtConfig, LANE) : fpnew_pkg::get_lane_formats(Width, FpFmtConfig, LANE); localparam fpnew_pkg::ifmt_logic_t ACTIVE_INT_FORMATS = fpnew_pkg::get_lane_int_formats(Width, FpFmtConfig, IntFmtConfig, LANE); @@ -195,6 +196,11 @@ or on 16b inputs producing 32b outputs"); localparam int unsigned DOTP_MAX_FMT_WIDTH = fpnew_pkg::max_fp_width(DOTP_FORMATS); localparam int unsigned DOTP_WIDTH = fpnew_pkg::minimum(2*DOTP_MAX_FMT_WIDTH, Width); + // Shuffle-specific parameters + localparam fpnew_pkg::fmt_logic_t SHFL_FORMATS = + fpnew_pkg::get_vfshfl_lane_formats(Width, FpFmtConfig, LANE); + localparam int unsigned SHFL_WIDTH = fpnew_pkg::max_fp_width(SHFL_FORMATS); + // Lane parameters from Opgroup localparam fpnew_pkg::fmt_logic_t LANE_FORMATS = (OpGroup == fpnew_pkg::CONV) ? CONV_FORMATS : (OpGroup == fpnew_pkg::DOTP) ? 
DOTP_FORMATS : @@ -211,6 +217,7 @@ or on 16b inputs producing 32b outputs"); logic [NUM_OPERANDS-1:0][LANE_WIDTH-1:0] local_operands; // lane-local oprands logic [LANE_WIDTH-1:0] op_result; // lane-local results + logic [NUM_OPERANDS-1:0][Width-1:0] operands; fpnew_pkg::status_t op_status; logic lane_is_used; @@ -221,6 +228,7 @@ or on 16b inputs producing 32b outputs"); // Slice out the operands for this lane, upper bits are ignored in the unit always_comb begin : prepare_input for (int unsigned i = 0; i < NUM_OPERANDS; i++) begin + operands[i] = operands_i[i]; local_operands[i] = operands_i[i] >> LANE*fpnew_pkg::fp_width(src_fmt_i); end @@ -248,6 +256,8 @@ or on 16b inputs producing 32b outputs"); local_operands[0] = operands_i[1]; end end + end else if (OpGroup == fpnew_pkg::SHFL) begin + operands[1] = operands_i[1] >> LANE*4; // shift the mask to the right lane end end @@ -462,6 +472,38 @@ or on 16b inputs producing 32b outputs"); .out_ready_i ( out_ready ), .busy_o ( lane_busy[lane] ) ); + end else if (OpGroup == fpnew_pkg::SHFL) begin : lane_instance + fpnew_vfshuffle_multi #( + .FpFmtConfig ( LANE_FORMATS ), + .PipeConfig ( PipeConfig ), + .NumPipeRegs ( NumPipeRegs ), + .SrcWidth ( MAX_FP_WIDTH ), + .TagType ( TagType ), + .AuxType ( logic [AUX_BITS-1:0] ) + ) i_fpnew_vfshuffle_multi ( + .clk_i, + .rst_ni, + .operands_i ( operands ), + .op_i, + .op_mod_i, + .src_fmt_i, + .dst_fmt_i, + .tag_i, + .mask_i ( simd_mask_i[lane] ), // Not used + .aux_i ( aux_data ), + .in_valid_i ( in_valid ), + .in_ready_o ( lane_in_ready[lane] ), + .flush_i, + .result_o ( op_result ), + .status_o ( op_status ), + .extension_bit_o ( lane_ext_bit[lane] ), + .tag_o ( lane_tags[lane] ), + .mask_o ( lane_masks[lane] ), + .aux_o ( lane_aux[lane] ), + .out_valid_o ( out_valid ), + .out_ready_i ( out_ready ), + .busy_o ( lane_busy[lane] ) + ); end // ADD OTHER OPTIONS HERE // Handshakes are only done if the lane is actually used diff --git a/src/fpnew_pkg.sv b/src/fpnew_pkg.sv index 
1e8ce099..e3906509 100644 --- a/src/fpnew_pkg.sv +++ b/src/fpnew_pkg.sv @@ -113,11 +113,11 @@ package fpnew_pkg; // -------------- // FP OPERATIONS // -------------- - localparam int unsigned NUM_OPGROUPS = 5; + localparam int unsigned NUM_OPGROUPS = 6; // Each FP operation belongs to an operation group typedef enum logic [2:0] { - ADDMUL, DIVSQRT, NONCOMP, CONV, DOTP + ADDMUL, DIVSQRT, NONCOMP, CONV, DOTP, SHFL } opgroup_e; localparam int unsigned OP_BITS = 5; @@ -127,7 +127,8 @@ package fpnew_pkg; DIV, SQRT, // DIVSQRT operation group SGNJ, MINMAX, CMP, CLASSIFY, // NONCOMP operation group F2F, F2I, I2F, CPKAB, CPKCD, // CONV operation group - SDOTP, EXVSUM, VSUM // DOTP operation group + SDOTP, EXVSUM, VSUM, // DOTP operation group + VFSHFL // SHFL operation group } operation_e; // ------------- @@ -292,7 +293,8 @@ package fpnew_pkg; '{default: MERGED}, // DIVSQRT '{default: PARALLEL}, // NONCOMP '{default: MERGED}, // CONV - '{default: DISABLED}}, // DOTP + '{default: DISABLED}, // DOTP + '{default: DISABLED}}, // SHFL PipeConfig: BEFORE }; @@ -302,7 +304,8 @@ package fpnew_pkg; '{default: DISABLED}, // DIVSQRT '{default: PARALLEL}, // NONCOMP '{default: MERGED}, // CONV - '{default: MERGED}}, // DOTP + '{default: MERGED}, // DOTP + '{default: MERGED}}, // SHFL PipeConfig: BEFORE }; @@ -425,6 +428,7 @@ package fpnew_pkg; SGNJ, MINMAX, CMP, CLASSIFY: return NONCOMP; F2F, F2I, I2F, CPKAB, CPKCD: return CONV; SDOTP, EXVSUM, VSUM: return DOTP; + VFSHFL: return SHFL; default: return NONCOMP; endcase endfunction @@ -437,6 +441,7 @@ package fpnew_pkg; NONCOMP: return 2; CONV: return 3; // vectorial casts use 3 operands DOTP: return 3; // splitting into 5 operands done in wrapper + SHFL: return 3; default: return 0; endcase endfunction @@ -589,4 +594,23 @@ package fpnew_pkg; return res; endfunction + // Returns the mask of FP formats active in lane lane_no for SIMD shuffling + function automatic fmt_logic_t get_vfshfl_lane_formats(int unsigned width, + fmt_logic_t cfg, + int unsigned 
lane_no); + automatic fmt_logic_t res; + for (int unsigned fmt = 0; fmt < NUM_FP_FORMATS; fmt++) begin + automatic int unsigned simd_lanes = width / fp_width(fp_format_e'(fmt)); + res[fmt] = cfg[fmt] && + (simd_lanes >= 2) && // Only SIMD formats + (simd_lanes > lane_no); + end + // Merge alt and non-alt formats + res[FP16] = res[FP16] || res[FP16ALT]; + res[FP8] = res[FP8] || res[FP8ALT]; + res[FP16ALT] = 1'b0; + res[FP8ALT] = 1'b0; + return res; + endfunction + endpackage diff --git a/src/fpnew_vfshuffle_multi.sv b/src/fpnew_vfshuffle_multi.sv new file mode 100644 index 00000000..b68d8abd --- /dev/null +++ b/src/fpnew_vfshuffle_multi.sv @@ -0,0 +1,219 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// +// Copyright and related rights are licensed under the Solderpad Hardware +// License, Version 0.51 (the "License"); you may not use this file except in +// compliance with the License. You may obtain a copy of the License at +// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +// or agreed to in writing, software, hardware and materials distributed under +// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+// +// SPDX-License-Identifier: SHL-0.51 + +// Authors: Tim Fischer we can pop it + assign out_ready = out_ready_i | ~out_valid_q; + // Valid: enabled by ready signal, synchronous clear with the flush signal + `FFLARNC(out_valid_q, inp_valid_q, out_ready, flush_i, 1'b0, clk_i, rst_ni) + // Enable register if pipeline ready and a valid data item is present + `FFL(out_result_q, result_out, reg_ena, '0) + `FFL(out_tag_q, inp_tag_q, reg_ena, TagType'('0)) + `FFL(out_mask_q, inp_mask_q, reg_ena, '0) + `FFL(out_aux_q, inp_aux_q, reg_ena, AuxType'('0)) + end else begin : gen_no_out_regs + assign out_ready = out_ready_i; + assign out_valid_q = inp_valid_q; + assign out_result_q = result_out; + assign out_tag_q = inp_tag_q; + assign out_mask_q = inp_mask_q; + assign out_aux_q = inp_aux_q; + end + + // Output signals + assign result_o = out_result_q; + assign tag_o = out_tag_q; + assign mask_o = out_mask_q; + assign aux_o = out_aux_q; + assign out_valid_o = out_valid_q; + assign status_o = fpnew_pkg::status_t'('0); // Not used + assign extension_bit_o = 1'b0; // No NaN-boxing + assign busy_o = inp_valid_q | out_valid_q; + + `ASSERT_INIT(ShflTooManyPipeRegs, !(NumPipeRegs > 1)) + +endmodule