From 6682824e94df3528fdd524fc3c796c0f176c9390 Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Sat, 26 Oct 2024 15:12:32 +0200 Subject: [PATCH] [hardware] fix segment ops --- hardware/src/ara_dispatcher.sv | 48 +++++++++++++----------- hardware/src/segment_sequencer.sv | 62 +++++++++++++++++++++---------- 2 files changed, 69 insertions(+), 41 deletions(-) diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv index d2f3aa6e0..f1a0b671d 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -116,7 +116,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ///////////////////////// ara_req_t ara_req, ara_req_d; - logic ara_req_valid_d; + logic ara_req_valid, ara_req_valid_d; always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin @@ -259,6 +259,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // The handshake signals are just passed-through if the insn is non-segment ara_resp_t ara_resp; + logic ara_resp_valid; + segment_sequencer #( .SegSupport(SegSupport), .ara_req_t (ara_req_t ), @@ -277,6 +279,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( .store_complete_o(store_complete), .ara_req_i(ara_req), .ara_req_o(ara_req_d), + .ara_req_valid_i(ara_req_valid), + .ara_req_valid_o(ara_req_valid_d), .ara_req_ready_i(ara_req_ready_i), .ara_resp_i(ara_resp_i), .ara_resp_o(ara_resp), @@ -360,7 +364,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( cvt_resize : CVT_SAME, default : '0 }; - ara_req_valid_d = 1'b0; + ara_req_valid = 1'b0; is_config = 1'b0; ignore_zero_vl_check = 1'b0; @@ -401,7 +405,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // These generate a reshuffle request to Ara's backend // When LMUL > 1, not all the regs that compose a large // register should always be reshuffled - ara_req_valid_d = ~rs_mask_request_q; + ara_req_valid = ~rs_mask_request_q; ara_req.use_scalar_op = 1'b1; ara_req.vs2 = vs_buffer_q; ara_req.eew_vs2 = eew_old_buffer_q; @@ -602,7 +606,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req.vd = insn.varith_type.rd; ara_req.use_vd = 1'b1; ara_req.vm = insn.varith_type.vm; - ara_req_valid_d = 1'b1; + ara_req_valid = 1'b1; // Decode based on the func6 field unique case (insn.varith_type.func6) @@ -818,7 +822,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req.use_vd = 1'b1; ara_req.vm = insn.varith_type.vm; ara_req.is_stride_np2 = is_stride_np2; - ara_req_valid_d = 1'b1; + ara_req_valid = 1'b1; // Decode based on the func6 field unique case (insn.varith_type.func6) @@ -1031,7 +1035,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req.use_vd = 1'b1; ara_req.vm = insn.varith_type.vm; ara_req.is_stride_np2 = is_stride_np2; - ara_req_valid_d = 1'b1; + ara_req_valid = 1'b1; // Decode based on the func6 field unique case (insn.varith_type.func6) @@ -1237,7 +1241,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req.vd = insn.varith_type.rd; ara_req.use_vd = 1'b1; ara_req.vm = insn.varith_type.vm; - ara_req_valid_d = 1'b1; + ara_req_valid = 1'b1; // Assume an effective EMUL = LMUL1 by default (for the mask operations) ara_req.emul = LMUL_1; @@ -1333,7 +1337,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( acc_resp_o.resp_valid = 1'b1; acc_resp_o.result = ara_resp.resp; acc_resp_o.exception = ara_resp.exception; - ara_req_valid_d = 1'b0; + ara_req_valid = 1'b0; end end 6'b010100: begin @@ -1685,7 +1689,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req.use_vd = 1'b1; ara_req.vm = insn.varith_type.vm; ara_req.is_stride_np2 = is_stride_np2; - ara_req_valid_d = 1'b1; + ara_req_valid = 1'b1; // Decode based on the func6 field unique case (insn.varith_type.func6) @@ -1922,7 +1926,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req.use_vd = 1'b1; ara_req.vm = insn.varith_type.vm; ara_req.fp_rm = acc_req_i.frm; - ara_req_valid_d = 1'b1; + ara_req_valid = 1'b1; // Decode based on the func6 field unique case (insn.varith_type.func6) @@ -2007,7 +2011,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( acc_resp_o.resp_valid = 1'b1; acc_resp_o.result = vfmvfs_result; acc_resp_o.exception = ara_resp.exception; - ara_req_valid_d = 1'b0; + ara_req_valid = 1'b0; end end 6'b011000: ara_req.op = ara_pkg::VMFEQ; @@ -2344,7 +2348,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req.vm = insn.varith_type.vm; ara_req.is_stride_np2 = is_stride_np2; ara_req.fp_rm = acc_req_i.frm; - ara_req_valid_d = 1'b1; + ara_req_valid = 1'b1; // Decode based on the func6 field unique case (insn.varith_type.func6) @@ -2593,7 +2597,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req.vm = insn.vmem_type.vm; ara_req.scalar_op = acc_req_i.rs1; ara_req.nf = insn.vmem_type.nf; - ara_req_valid_d = 1'b1; + ara_req_valid = 1'b1; // Decode the element width // Indexed memory operations follow a different rule @@ -2634,7 +2638,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( acc_resp_o.req_ready = 1'b1; acc_resp_o.resp_valid = 1'b1; illegal_insn = 1'b1; - ara_req_valid_d = 1'b0; + ara_req_valid = 1'b0; end endcase @@ -2749,7 +2753,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ignore_zero_vl_check = 1'b1; // The LMUL value is kept in the instruction itself illegal_insn_load = 1'b0; - ara_req_valid_d = 1'b1; + ara_req_valid = 1'b1; // Maximum vector length. VLMAX = nf * VLEN / EW8. ara_req.vtype.vsew = EW8; @@ -2782,7 +2786,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( acc_resp_o.req_ready = 1'b1; acc_resp_o.resp_valid = 1'b1; acc_resp_o.exception = ara_resp.exception; - ara_req_valid_d = 1'b0; + ara_req_valid = 1'b0; // In case of exception, modify vstart if ( ara_resp.exception.valid ) begin csr_vstart_d = ara_resp.exception_vstart; @@ -2823,7 +2827,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req.vm = insn.vmem_type.vm; ara_req.scalar_op = acc_req_i.rs1; ara_req.nf = insn.vmem_type.nf; - ara_req_valid_d = 1'b1; + ara_req_valid = 1'b1; // Decode the element width // Indexed memory operations follow a different rule @@ -3000,7 +3004,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( acc_resp_o.req_ready = 1'b0; acc_resp_o.resp_valid = 1'b0; - ara_req_valid_d = 1'b1; + ara_req_valid = 1'b1; end // Wait until the back-end answers to acknowledge those instructions @@ -3008,7 +3012,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( acc_resp_o.req_ready = 1'b1; acc_resp_o.resp_valid = 1'b1; acc_resp_o.exception = ara_resp.exception; - ara_req_valid_d = 1'b0; + ara_req_valid = 1'b0; // In case of exception, modify vstart and wait until the previous // operations are over if ( ara_resp.exception.valid ) begin @@ -3273,7 +3277,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Raise an illegal instruction exception if ( illegal_insn || illegal_insn_load || illegal_insn_store ) begin - ara_req_valid_d = 1'b0; + ara_req_valid = 1'b0; acc_resp_o.req_ready = 1'b1; acc_resp_o.resp_valid = 1'b1; acc_resp_o.exception.valid = 1'b1; @@ -3332,7 +3336,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Stall the interface, and inject a reshuffling instruction acc_resp_o.req_ready = 1'b0; acc_resp_o.resp_valid = 1'b0; - ara_req_valid_d = 1'b0; + ara_req_valid = 1'b0; // Initialize the reshuffle counter limit to handle LMUL > 1 unique case (ara_req.emul) @@ -3398,7 +3402,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // delay the zero_vl acknowledge by 1 cycle acc_resp_o.req_ready = ~((is_vload & load_complete_q) | (is_vstore & store_complete_q)); acc_resp_o.resp_valid = ~((is_vload & load_complete_q) | (is_vstore & store_complete_q)); - ara_req_valid_d = 1'b0; + ara_req_valid = 1'b0; load_zero_vl = is_vload; store_zero_vl = is_vstore; end diff --git a/hardware/src/segment_sequencer.sv b/hardware/src/segment_sequencer.sv index 50d4c7e8a..257ca40d2 100644 --- a/hardware/src/segment_sequencer.sv +++ b/hardware/src/segment_sequencer.sv @@ -28,6 +28,8 @@ module segment_sequencer import ara_pkg::*; import rvv_pkg::*; #( // Ara frontend - backend info and handshakes input ara_req_t ara_req_i, output ara_req_t ara_req_o, + input logic ara_req_valid_i, + output logic ara_req_valid_o, input logic ara_req_ready_i, input ara_resp_t ara_resp_i, output ara_resp_t ara_resp_o, @@ -43,10 +45,12 @@ module segment_sequencer import ara_pkg::*; import rvv_pkg::*; #( ara_resp_t ara_resp_d, ara_resp_q; logic is_vload_d, is_vload_q; logic [$bits(ara_req_i.vstart):0] next_vstart_cnt; + logic [2:0] nf_d, nf_q; typedef enum logic [1:0] { IDLE, SEGMENT_MICRO_OPS, + SEGMENT_MICRO_OPS_WAIT_END, SEGMENT_MICRO_OPS_END } state_e; state_e state_d, state_q; @@ -70,7 +74,8 @@ module segment_sequencer import ara_pkg::*; import rvv_pkg::*; #( .q_o(segment_cnt_q), .overflow_o( /* Unused */ ) ); - assign segment_cnt_clear = new_seg_mem_op | (segment_cnt_en & (segment_cnt_q == ara_req_i.nf)); + assign segment_cnt_clear = (state_q == SEGMENT_MICRO_OPS_END) + | ((state_q != IDLE) & segment_cnt_en & (segment_cnt_q == nf_q)); // Track the number of segments logic vstart_cnt_en; @@ -91,7 +96,7 @@ module segment_sequencer import ara_pkg::*; import rvv_pkg::*; #( .overflow_o( /* Unused */ ) ); // Change destination vector index when all the fields of the segment have been processed - assign vstart_cnt_en = segment_cnt_en & (segment_cnt_q == ara_req_i.nf); + assign vstart_cnt_en = segment_cnt_en & (segment_cnt_q == nf_q); // Next vstart count assign next_vstart_cnt = vstart_cnt_q + 1; @@ -104,6 +109,7 @@ module segment_sequencer import ara_pkg::*; import rvv_pkg::*; #( // Pass through ara_req_o = ara_req_i; + ara_req_valid_o = ara_req_valid_i; ara_resp_o = ara_resp_i; ara_resp_valid_o = ara_resp_valid_i; // Block load/store_complete @@ -113,6 +119,7 @@ module segment_sequencer import ara_pkg::*; import rvv_pkg::*; #( ara_resp_d = ara_resp_q; ara_resp_valid_d = ara_resp_valid_q; is_vload_d = is_vload_q; + nf_d = nf_q; // Don't count up by default new_seg_mem_op = 1'b0; @@ -121,56 +128,71 @@ module segment_sequencer import ara_pkg::*; import rvv_pkg::*; #( // Low-perf Moore's FSM unique case (state_q) IDLE: begin + // Pass-through + load_complete_o = load_complete_i; + store_complete_o = store_complete_i; + // Be ready to sample the next nf + nf_d = ara_req_i.nf; // Send a first micro operation upon valid segment mem op if (is_segment_mem_op_i && !illegal_insn_i) begin // If we are here, the backend is able to accept the request // Set-up sequencing new_seg_mem_op = 1'b1; // Set up the first micro operation - ara_req_o.vl = 1; + ara_req_o.vl = next_vstart_cnt; + // Pass to the next field if the previous micro op finished + segment_cnt_en = 1'b1; // Start sequencing state_d = SEGMENT_MICRO_OPS; end end SEGMENT_MICRO_OPS: begin // Manipulate the memory micro request in advance - ara_req_o.vl = 1; + ara_req_o.vl = next_vstart_cnt; ara_req_o.vstart = vstart_cnt_q; ara_req_o.vs1 = ara_req_i.vs1 + segment_cnt_q; ara_req_o.vd = ara_req_i.vd + segment_cnt_q; + + // Don't answer CVA6 yet ara_resp_valid_o = 1'b0; - // Wait for an answer from Ara's backend - if (ara_resp_valid_i) begin - // Pass to the next field if the previous micro op finished + // Pass to the next field if the previous micro op finished + if (ara_req_valid_i && ara_req_ready_i) begin segment_cnt_en = 1'b1; - // If exception, stop the execution + end + + // Wait for an answer from Ara's backend + if (ara_resp_valid_i) begin // If exception, stop the execution if (ara_resp_i.exception.valid) begin - ara_resp_valid_o = ara_resp_valid_i; // If no exception, continue with the micro ops end else begin // If over - stop in the next cycle if (segment_cnt_clear && (next_vstart_cnt == ara_req_i.vl)) begin // Sample the last answer ara_resp_d = ara_resp_i; - ara_resp_valid_d = ara_resp_valid_i; is_vload_d = is_vload_i; - state_d = SEGMENT_MICRO_OPS_END; + state_d = SEGMENT_MICRO_OPS_WAIT_END; end end end end - SEGMENT_MICRO_OPS_END: begin + SEGMENT_MICRO_OPS_WAIT_END: begin + // Don't answer CVA6 yet ara_resp_valid_o = 1'b0; + // Stop injecting micro instructions + ara_req_valid_o = 1'b0; // Wait for idle to give the final load/store_complete - if (ara_idle_i) begin - ara_resp_o = ara_resp_q; - ara_resp_valid_o = ara_resp_valid_q; - load_complete_o = is_vload_q; - store_complete_o = ~is_vload_q; - state_d = IDLE; + if (ara_idle_i && ara_req_ready_i) begin + state_d = SEGMENT_MICRO_OPS_END; end end + SEGMENT_MICRO_OPS_END: begin + ara_resp_o = ara_resp_q; + ara_resp_valid_o = 1'b1; + load_complete_o = is_vload_q; + store_complete_o = ~is_vload_q; + state_d = IDLE; + end default:; endcase end @@ -178,14 +200,15 @@ module segment_sequencer import ara_pkg::*; import rvv_pkg::*; #( always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin state_q <= IDLE; + nf_q <= '0; is_vload_q <= 1'b0; ara_resp_q <= '0; ara_resp_valid_q <= '0; end else begin state_q <= state_d; + nf_q <= nf_d; is_vload_q <= is_vload_d; ara_resp_q <= ara_resp_d; - ara_resp_valid_q <= ara_resp_valid_d; end end end else begin : gen_no_segment_support @@ -195,6 +218,7 @@ module segment_sequencer import ara_pkg::*; import rvv_pkg::*; #( assign load_complete_o = load_complete_i; assign store_complete_o = store_complete_i; assign ara_req_o = ara_req_i; + assign ara_req_valid_o = ara_req_valid_i; assign ara_resp_o = ara_resp_i; assign ara_resp_valid_o = ara_resp_valid_i; end