Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SIMD shuffling operations #14

Open
wants to merge 1 commit into
base: pulp
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Bender.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ sources:
- src/fpnew_fma_multi.sv
- src/fpnew_sdotp_multi.sv
- src/fpnew_sdotp_multi_wrapper.sv
- src/fpnew_vfshuffle_multi.sv
- src/fpnew_noncomp.sv
- src/fpnew_opgroup_block.sv
- src/fpnew_opgroup_fmt_slice.sv
Expand Down
44 changes: 43 additions & 1 deletion src/fpnew_opgroup_multifmt_slice.sv
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,8 @@ or on 16b inputs producing 32b outputs");
for (genvar lane = 0; lane < int'(NUM_LANES); lane++) begin : gen_num_lanes
localparam int unsigned LANE = unsigned'(lane); // unsigned to please the linter
// Get a mask of active formats for this lane
localparam fpnew_pkg::fmt_logic_t ACTIVE_FORMATS =
localparam fpnew_pkg::fmt_logic_t ACTIVE_FORMATS = (OpGroup == fpnew_pkg::SHFL) ?
fpnew_pkg::get_vfshfl_lane_formats(Width, FpFmtConfig, LANE) :
fpnew_pkg::get_lane_formats(Width, FpFmtConfig, LANE);
localparam fpnew_pkg::ifmt_logic_t ACTIVE_INT_FORMATS =
fpnew_pkg::get_lane_int_formats(Width, FpFmtConfig, IntFmtConfig, LANE);
Expand All @@ -195,6 +196,11 @@ or on 16b inputs producing 32b outputs");
localparam int unsigned DOTP_MAX_FMT_WIDTH = fpnew_pkg::max_fp_width(DOTP_FORMATS);
localparam int unsigned DOTP_WIDTH = fpnew_pkg::minimum(2*DOTP_MAX_FMT_WIDTH, Width);

// Shuffle-specific parameters
localparam fpnew_pkg::fmt_logic_t SHFL_FORMATS =
fpnew_pkg::get_vfshfl_lane_formats(Width, FpFmtConfig, LANE);
localparam int unsigned SHFL_WIDTH = fpnew_pkg::max_fp_width(SHFL_FORMATS);

// Lane parameters from Opgroup
localparam fpnew_pkg::fmt_logic_t LANE_FORMATS = (OpGroup == fpnew_pkg::CONV) ? CONV_FORMATS :
(OpGroup == fpnew_pkg::DOTP) ? DOTP_FORMATS :
Expand All @@ -211,6 +217,7 @@ or on 16b inputs producing 32b outputs");

logic [NUM_OPERANDS-1:0][LANE_WIDTH-1:0] local_operands; // lane-local oprands
logic [LANE_WIDTH-1:0] op_result; // lane-local results
logic [NUM_OPERANDS-1:0][Width-1:0] operands;
fpnew_pkg::status_t op_status;

logic lane_is_used;
Expand All @@ -221,6 +228,7 @@ or on 16b inputs producing 32b outputs");
// Slice out the operands for this lane, upper bits are ignored in the unit
always_comb begin : prepare_input
for (int unsigned i = 0; i < NUM_OPERANDS; i++) begin
operands[i] = operands_i[i];
local_operands[i] = operands_i[i] >> LANE*fpnew_pkg::fp_width(src_fmt_i);
end

Expand Down Expand Up @@ -248,6 +256,8 @@ or on 16b inputs producing 32b outputs");
local_operands[0] = operands_i[1];
end
end
end else if (OpGroup == fpnew_pkg::SHFL) begin
operands[1] = operands_i[1] >> LANE*4; // shift the mask to the right lane
end
end

Expand Down Expand Up @@ -462,6 +472,38 @@ or on 16b inputs producing 32b outputs");
.out_ready_i ( out_ready ),
.busy_o ( lane_busy[lane] )
);
end else if (OpGroup == fpnew_pkg::SHFL) begin : lane_instance
fpnew_vfshuffle_multi #(
.FpFmtConfig ( LANE_FORMATS ),
.PipeConfig ( PipeConfig ),
.NumPipeRegs ( NumPipeRegs ),
.SrcWidth ( MAX_FP_WIDTH ),
.TagType ( TagType ),
.AuxType ( logic [AUX_BITS-1:0] )
) i_fpnew_vfshuffle_multi (
.clk_i,
.rst_ni,
.operands_i ( operands ),
.op_i,
.op_mod_i,
.src_fmt_i,
.dst_fmt_i,
.tag_i,
.mask_i ( simd_mask_i[lane] ), // Not used
.aux_i ( aux_data ),
.in_valid_i ( in_valid ),
.in_ready_o ( lane_in_ready[lane] ),
.flush_i,
.result_o ( op_result ),
.status_o ( op_status ),
.extension_bit_o ( lane_ext_bit[lane] ),
.tag_o ( lane_tags[lane] ),
.mask_o ( lane_masks[lane] ),
.aux_o ( lane_aux[lane] ),
.out_valid_o ( out_valid ),
.out_ready_i ( out_ready ),
.busy_o ( lane_busy[lane] )
);
end // ADD OTHER OPTIONS HERE

// Handshakes are only done if the lane is actually used
Expand Down
34 changes: 29 additions & 5 deletions src/fpnew_pkg.sv
Original file line number Diff line number Diff line change
Expand Up @@ -113,11 +113,11 @@ package fpnew_pkg;
// --------------
// FP OPERATIONS
// --------------
localparam int unsigned NUM_OPGROUPS = 5;
localparam int unsigned NUM_OPGROUPS = 6;

// Each FP operation belongs to an operation group
typedef enum logic [2:0] {
ADDMUL, DIVSQRT, NONCOMP, CONV, DOTP
ADDMUL, DIVSQRT, NONCOMP, CONV, DOTP, SHFL
} opgroup_e;

localparam int unsigned OP_BITS = 5;
Expand All @@ -127,7 +127,8 @@ package fpnew_pkg;
DIV, SQRT, // DIVSQRT operation group
SGNJ, MINMAX, CMP, CLASSIFY, // NONCOMP operation group
F2F, F2I, I2F, CPKAB, CPKCD, // CONV operation group
SDOTP, EXVSUM, VSUM // DOTP operation group
SDOTP, EXVSUM, VSUM, // DOTP operation group
VFSHFL // SHFL operation group
} operation_e;

// -------------
Expand Down Expand Up @@ -292,7 +293,8 @@ package fpnew_pkg;
'{default: MERGED}, // DIVSQRT
'{default: PARALLEL}, // NONCOMP
'{default: MERGED}, // CONV
'{default: DISABLED}}, // DOTP
'{default: DISABLED}, // DOTP
'{default: DISABLED}}, // SHFL
PipeConfig: BEFORE
};

Expand All @@ -302,7 +304,8 @@ package fpnew_pkg;
'{default: DISABLED}, // DIVSQRT
'{default: PARALLEL}, // NONCOMP
'{default: MERGED}, // CONV
'{default: MERGED}}, // DOTP
'{default: MERGED}, // DOTP
'{default: MERGED}}, // SHFL
PipeConfig: BEFORE
};

Expand Down Expand Up @@ -425,6 +428,7 @@ package fpnew_pkg;
SGNJ, MINMAX, CMP, CLASSIFY: return NONCOMP;
F2F, F2I, I2F, CPKAB, CPKCD: return CONV;
SDOTP, EXVSUM, VSUM: return DOTP;
VFSHFL: return SHFL;
default: return NONCOMP;
endcase
endfunction
Expand All @@ -437,6 +441,7 @@ package fpnew_pkg;
NONCOMP: return 2;
CONV: return 3; // vectorial casts use 3 operands
DOTP: return 3; // splitting into 5 operands done in wrapper
SHFL: return 3;
default: return 0;
endcase
endfunction
Expand Down Expand Up @@ -589,4 +594,23 @@ package fpnew_pkg;
return res;
endfunction

// Returns all lanes that are active for SIMD shuffling
function automatic fmt_logic_t get_vfshfl_lane_formats(int unsigned width,
                                                       fmt_logic_t  cfg,
                                                       int unsigned lane_no);
  // Per-format activity mask for lane `lane_no` of a `width`-bit SIMD shuffle datapath.
  //
  // A format is marked active when it is enabled in `cfg`, at least two of its
  // elements fit into `width`, and `lane_no` is smaller than that element count.
  // Afterwards the FP16ALT/FP8ALT bits are folded into FP16/FP8 and cleared.
  automatic fmt_logic_t active;
  for (int unsigned i = 0; i < NUM_FP_FORMATS; i++) begin
    // Number of elements of format `i` that fit into the datapath
    automatic int unsigned n_elems = width / fp_width(fp_format_e'(i));
    active[i] = (n_elems > lane_no) &&
                (n_elems >= 2) && // scalar-only formats are never shuffled
                cfg[i];
  end
  // Fold the alternative encodings into their equally wide base formats
  active[FP16]    = active[FP16ALT] || active[FP16];
  active[FP8]     = active[FP8ALT] || active[FP8];
  active[FP16ALT] = 1'b0;
  active[FP8ALT]  = 1'b0;
  return active;
endfunction

endpackage
219 changes: 219 additions & 0 deletions src/fpnew_vfshuffle_multi.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
// Copyright 2024 ETH Zurich and University of Bologna.
//
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 0.51 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
// or agreed to in writing, software, hardware and materials distributed under
// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
//
// SPDX-License-Identifier: SHL-0.51

// Authors: Tim Fischer <[email protected]

// This unit can be used to shuffle elements of a SIMD vector with a generic mask
// Currently the unit supports two different operations:
// - SHUFFLE: Used if `op_mod_i` is *not* set. Uses only the first operand as input SIMD vector.
// - SHUFFLE2: Used if `op_mod_i` is set. Uses both operands as input SIMD vectors.
//
// The operands are expected to be in the following format:
// operands_i[0]: 1st SIMD vector to shuffle
// operands_i[1]: Mask for the shuffle operation
// operands_i[2]: 2nd SIMD vector to shuffle (only used if `op_mod_i` is set)
//
// The mask only uses the LSB 4 bits of the 2nd operand:
// - Bit 3: Select between the two input vectors (only used if `op_mod_i` is set)
// - Bit 2-0: Select the element from the selected vector
//
// A pipeline register can be inserted before or after the unit by setting the `PipeConfig` parameter.
// `AFTER` is usually preferable, because the input operands contain the entire SIMD vectors, whereas
// the output only contains the selected element. The unit should not be timing-critical,
// therefore only a single pipeline register is supported.

`include "common_cells/registers.svh"
`include "common_cells/assertions.svh"

// SIMD shuffle lane: selects one element out of one or two SIMD source vectors.
//
// operands_i[0] is the first source vector, operands_i[2] the second one, and the
// low four bits of operands_i[1] form the selector: bit 3 picks the vector (only
// honoured when `op_mod_i` is set), bits 2:0 pick the element. The chosen element
// is returned in `result_o`, zero-extended to `DstWidth`. `status_o` and
// `extension_bit_o` are tied to zero. At most one pipeline register is generated,
// either in front of (BEFORE) or behind (AFTER) the selection logic.
module fpnew_vfshuffle_multi #(
  parameter fpnew_pkg::fmt_logic_t   FpFmtConfig = '1,
  parameter fpnew_pkg::pipe_config_t PipeConfig  = fpnew_pkg::AFTER,
  parameter int unsigned             NumPipeRegs = 0,
  parameter int unsigned             SrcWidth    = 0,
  parameter type                     TagType     = logic,
  parameter type                     AuxType     = logic,
  // Do not change
  localparam int unsigned DstWidth = fpnew_pkg::max_fp_width(FpFmtConfig)
) (
  input  logic                      clk_i,
  input  logic                      rst_ni,
  // Input signals
  input  logic [2:0][SrcWidth-1:0]  operands_i, // 3 operands
  input  fpnew_pkg::operation_e     op_i,       // Currently only single shuffle operation
  input  logic                      op_mod_i,   // Whether to use the second operand
  input  fpnew_pkg::fp_format_e     src_fmt_i,  // format of the input operands
  input  fpnew_pkg::fp_format_e     dst_fmt_i,  // format of the output operands
  input  TagType                    tag_i,
  input  logic                      mask_i,
  input  AuxType                    aux_i,
  // Input Handshake
  input  logic                      in_valid_i,
  output logic                      in_ready_o,
  input  logic                      flush_i,
  // Output signals
  output logic [DstWidth-1:0]       result_o,
  output fpnew_pkg::status_t        status_o,
  output logic                      extension_bit_o,
  output TagType                    tag_o,
  output logic                      mask_o,
  output AuxType                    aux_o,
  // Output handshake
  output logic                      out_valid_o,
  input  logic                      out_ready_i,
  // Indication of valid data in flight
  output logic                      busy_o);

  // ---------------
  // Input registers
  // ---------------
  logic [2:0][SrcWidth-1:0] inp_operands_q;
  fpnew_pkg::operation_e    inp_op_q;
  logic                     inp_op_mod_q;
  fpnew_pkg::fp_format_e    inp_src_fmt_q;
  fpnew_pkg::fp_format_e    inp_dst_fmt_q;
  TagType                   inp_tag_q;
  logic                     inp_mask_q;
  AuxType                   inp_aux_q;
  logic                     inp_valid_q;
  logic                     inp_ready;
  logic                     out_ready;

  // Input stage: Propagate pipeline ready signal to upstream circuitry
  assign in_ready_o = inp_ready;
  if (PipeConfig == fpnew_pkg::BEFORE && NumPipeRegs == 1) begin : gen_inp_regs
    // Internal register enable for this stage
    logic reg_ena;
    // Determine the ready signal of the current stage - advance the pipeline:
    // 1. if the next stage is ready for our data
    // 2. if the next stage only holds a bubble (not valid) -> we can pop it
    assign inp_ready = out_ready | ~inp_valid_q;
    // Valid: enabled by ready signal, synchronous clear with the flush signal
    `FFLARNC(inp_valid_q, in_valid_i, inp_ready, flush_i, 1'b0, clk_i, rst_ni)
    // Enable register if pipeline ready and a valid data item is present.
    // The enable must look at the *incoming* valid (`in_valid_i`): the payload has to
    // be captured in the same cycle in which `inp_valid_q` captures the valid flag,
    // otherwise the registered data lags the registered valid by one transfer.
    assign reg_ena = inp_ready & in_valid_i;
    // Generate the pipeline registers within the stages, use enable-registers
    `FFL(inp_operands_q, operands_i, reg_ena, '0)
    `FFL(inp_op_q,       op_i,       reg_ena, fpnew_pkg::VFSHFL)
    `FFL(inp_op_mod_q,   op_mod_i,   reg_ena, 1'b0)
    `FFL(inp_src_fmt_q,  src_fmt_i,  reg_ena, fpnew_pkg::fp_format_e'(0))
    `FFL(inp_dst_fmt_q,  dst_fmt_i,  reg_ena, fpnew_pkg::fp_format_e'(0))
    `FFL(inp_tag_q,      tag_i,      reg_ena, TagType'('0))
    `FFL(inp_mask_q,     mask_i,     reg_ena, '0)
    `FFL(inp_aux_q,      aux_i,      reg_ena, AuxType'('0))
  end else begin : gen_no_inp_regs
    // No input register: feed the inputs straight through
    assign inp_ready      = out_ready;
    assign inp_valid_q    = in_valid_i;
    assign inp_operands_q = operands_i;
    assign inp_op_q       = op_i;
    assign inp_op_mod_q   = op_mod_i;
    assign inp_src_fmt_q  = src_fmt_i;
    assign inp_dst_fmt_q  = dst_fmt_i;
    assign inp_tag_q      = tag_i;
    assign inp_mask_q     = mask_i;
    assign inp_aux_q      = aux_i;
  end

  // ----------------------
  // Mask
  // ----------------------

  logic       vec_sel; // selects the second vector (operand 2) when op_mod is set
  logic [2:0] elm_sel; // element index within the selected vector

  assign vec_sel = inp_operands_q[1][3];
  assign elm_sel = inp_operands_q[1][2:0];

  // ----------------------
  // Shuffle Logic
  // ----------------------

  logic [fpnew_pkg::NUM_FP_FORMATS-1:0][DstWidth-1:0] result;
  logic [DstWidth-1:0]                                result_out;

  for (genvar f = 0; f < fpnew_pkg::NUM_FP_FORMATS; f++) begin : gen_fmts
    // Only implement formats that are enabled
    if (FpFmtConfig[f]) begin : gen_fmt
      localparam FpWidth  = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(f));
      localparam NumLanes = SrcWidth/FpWidth;
      logic [NumLanes-1:0][FpWidth-1:0] opa_vec, opb_vec;
      logic [FpWidth-1:0]               opa_elm, opb_elm;
      // Convert the operands to a SIMD representation
      assign opa_vec = inp_operands_q[0];
      assign opb_vec = inp_operands_q[2];
      // Select based on element mask.
      // NOTE: the slice below is only well-formed for 2 <= NumLanes <= 8, since
      // `elm_sel` is three bits wide and $clog2(1) would give an empty slice.
      assign opa_elm = opa_vec[elm_sel[$clog2(NumLanes)-1:0]];
      assign opb_elm = opb_vec[elm_sel[$clog2(NumLanes)-1:0]];
      // Select based on vector mask and op mode
      assign result[f] = (vec_sel & inp_op_mod_q) ? opb_elm : opa_elm;
    end else begin
      // Disabled formats contribute a constant zero
      assign result[f] = '0;
    end
  end

  // Select result based on destination format.
  // The alternative formats reuse the result slot of the equally wide base format.
  always_comb begin : gen_result
    unique case (inp_dst_fmt_q)
      fpnew_pkg::FP32:    result_out = result[fpnew_pkg::FP32];
      fpnew_pkg::FP16,
      fpnew_pkg::FP16ALT: result_out = result[fpnew_pkg::FP16];
      fpnew_pkg::FP8,
      fpnew_pkg::FP8ALT:  result_out = result[fpnew_pkg::FP8];
      default:            result_out = '0;
    endcase
  end

  // ---------------
  // Output registers
  // ---------------
  logic [DstWidth-1:0] out_result_q;
  TagType              out_tag_q;
  logic                out_mask_q;
  AuxType              out_aux_q;
  logic                out_valid_q;

  if (PipeConfig == fpnew_pkg::AFTER && NumPipeRegs == 1) begin : gen_out_regs
    // Internal register enable for this stage
    logic reg_ena;
    // Determine the ready signal of the current stage:
    // 1. if the next stage is ready for our data
    // 2. if the next stage only holds a bubble (not valid) -> we can pop it
    assign out_ready = out_ready_i | ~out_valid_q;
    // Valid: enabled by ready signal, synchronous clear with the flush signal
    `FFLARNC(out_valid_q, inp_valid_q, out_ready, flush_i, 1'b0, clk_i, rst_ni)
    // Enable register if pipeline ready and a valid data item is present.
    // Without this driver `reg_ena` would float and the registers below would
    // never load deterministically.
    assign reg_ena = out_ready & inp_valid_q;
    `FFL(out_result_q, result_out, reg_ena, '0)
    `FFL(out_tag_q,    inp_tag_q,  reg_ena, TagType'('0))
    `FFL(out_mask_q,   inp_mask_q, reg_ena, '0)
    `FFL(out_aux_q,    inp_aux_q,  reg_ena, AuxType'('0))
  end else begin : gen_no_out_regs
    // No output register: feed the shuffle result straight through
    assign out_ready    = out_ready_i;
    assign out_valid_q  = inp_valid_q;
    assign out_result_q = result_out;
    assign out_tag_q    = inp_tag_q;
    assign out_mask_q   = inp_mask_q;
    assign out_aux_q    = inp_aux_q;
  end

  // Output signals
  assign result_o        = out_result_q;
  assign tag_o           = out_tag_q;
  assign mask_o          = out_mask_q;
  assign aux_o           = out_aux_q;
  assign out_valid_o     = out_valid_q;
  assign status_o        = fpnew_pkg::status_t'('0); // Not used
  assign extension_bit_o = 1'b0; // No NaN-boxing
  assign busy_o          = inp_valid_q | out_valid_q;

  // Only zero or one pipeline register is supported
  `ASSERT_INIT(ShflTooManyPipeRegs, !(NumPipeRegs > 1))

endmodule