diff --git a/bhv/cv32e40p_tb_wrapper.sv b/bhv/cv32e40p_tb_wrapper.sv index b71cf970c..97fdfaa5f 100644 --- a/bhv/cv32e40p_tb_wrapper.sv +++ b/bhv/cv32e40p_tb_wrapper.sv @@ -210,7 +210,7 @@ module cv32e40p_tb_wrapper .apu_en_i (cv32e40p_top_i.apu_req), .apu_singlecycle_i(cv32e40p_top_i.core_i.ex_stage_i.apu_singlecycle), .apu_multicycle_i (cv32e40p_top_i.core_i.ex_stage_i.apu_multicycle), - .apu_rvalid_i (cv32e40p_top_i.apu_rvalid) + .apu_rvalid_i (cv32e40p_top_i.core_i.ex_stage_i.apu_valid) ); `endif @@ -344,7 +344,7 @@ module cv32e40p_tb_wrapper // APU .apu_req_i (cv32e40p_top_i.core_i.apu_req_o), .apu_gnt_i (cv32e40p_top_i.core_i.apu_gnt_i), - .apu_rvalid_i(cv32e40p_top_i.core_i.apu_rvalid_i), + .apu_rvalid_i(cv32e40p_top_i.core_i.ex_stage_i.apu_valid), // Controller FSM probes .ctrl_fsm_cs_i(cv32e40p_top_i.core_i.id_stage_i.controller_i.ctrl_fsm_cs), diff --git a/rtl/cv32e40p_apu_disp.sv b/rtl/cv32e40p_apu_disp.sv index adc9a3485..94ca9bcbd 100644 --- a/rtl/cv32e40p_apu_disp.sv +++ b/rtl/cv32e40p_apu_disp.sv @@ -47,6 +47,7 @@ module cv32e40p_apu_disp ( input logic [2:0][5:0] read_regs_i, input logic [2:0] read_regs_valid_i, output logic read_dep_o, + output logic read_dep_for_jalr_o, input logic [1:0][5:0] write_regs_i, input logic [1:0] write_regs_valid_i, @@ -189,6 +190,10 @@ module cv32e40p_apu_disp ( assign read_dep_o = (read_dep_req | read_dep_inflight | read_dep_waiting) & is_decoding_i; assign write_dep_o = (write_dep_req | write_dep_inflight | write_dep_waiting) & is_decoding_i; + assign read_dep_for_jalr_o = is_decoding_i & ((|read_deps_req & enable_i) | + (|read_deps_inflight & valid_inflight) | + (|read_deps_waiting & valid_waiting)); + // // Stall signals // diff --git a/rtl/cv32e40p_controller.sv b/rtl/cv32e40p_controller.sv index 509c40f82..25674f2d3 100644 --- a/rtl/cv32e40p_controller.sv +++ b/rtl/cv32e40p_controller.sv @@ -31,7 +31,8 @@ module cv32e40p_controller import cv32e40p_pkg::*; #( parameter COREV_CLUSTER = 0, - parameter COREV_PULP = 1 
+ parameter COREV_PULP = 0, + parameter FPU = 0 ) ( input logic clk, // Gated clock @@ -104,6 +105,7 @@ module cv32e40p_controller import cv32e40p_pkg::*; // APU dependency checks input logic apu_en_i, input logic apu_read_dep_i, + input logic apu_read_dep_for_jalr_i, input logic apu_write_dep_i, output logic apu_stall_o, @@ -1338,7 +1340,10 @@ endgenerate if ((ctrl_transfer_insn_in_dec_i == BRANCH_JALR) && (((regfile_we_wb_i == 1'b1) && (reg_d_wb_is_reg_a_i == 1'b1)) || ((regfile_we_ex_i == 1'b1) && (reg_d_ex_is_reg_a_i == 1'b1)) || - ((regfile_alu_we_fw_i == 1'b1) && (reg_d_alu_is_reg_a_i == 1'b1))) ) + ((regfile_alu_we_fw_i == 1'b1) && (reg_d_alu_is_reg_a_i == 1'b1)) || + (FPU && (apu_read_dep_for_jalr_i == 1'b1)) + ) + ) begin jr_stall_o = 1'b1; deassert_we_o = 1'b1; diff --git a/rtl/cv32e40p_core.sv b/rtl/cv32e40p_core.sv index 528ac1c8f..899492da4 100644 --- a/rtl/cv32e40p_core.sv +++ b/rtl/cv32e40p_core.sv @@ -213,6 +213,7 @@ module cv32e40p_core logic [ 2:0][ 5:0] apu_read_regs; logic [ 2:0] apu_read_regs_valid; logic apu_read_dep; + logic apu_read_dep_for_jalr; logic [ 1:0][ 5:0] apu_write_regs; logic [ 1:0] apu_write_regs_valid; logic apu_write_dep; @@ -361,7 +362,6 @@ module cv32e40p_core // APU master signals assign apu_flags_o = apu_flags_ex; - assign fflags_csr = apu_flags_i; ////////////////////////////////////////////////////////////////////////////////////////////// // ____ _ _ __ __ _ // @@ -621,14 +621,15 @@ module cv32e40p_core .apu_flags_ex_o (apu_flags_ex), .apu_waddr_ex_o (apu_waddr_ex), - .apu_read_regs_o (apu_read_regs), - .apu_read_regs_valid_o (apu_read_regs_valid), - .apu_read_dep_i (apu_read_dep), - .apu_write_regs_o (apu_write_regs), - .apu_write_regs_valid_o(apu_write_regs_valid), - .apu_write_dep_i (apu_write_dep), - .apu_perf_dep_o (perf_apu_dep), - .apu_busy_i (apu_busy), + .apu_read_regs_o (apu_read_regs), + .apu_read_regs_valid_o (apu_read_regs_valid), + .apu_read_dep_i (apu_read_dep), + 
.apu_read_dep_for_jalr_i(apu_read_dep_for_jalr), + .apu_write_regs_o (apu_write_regs), + .apu_write_regs_valid_o (apu_write_regs_valid), + .apu_write_dep_i (apu_write_dep), + .apu_perf_dep_o (perf_apu_dep), + .apu_busy_i (apu_busy), // CSR ID/EX .csr_access_ex_o (csr_access_ex), @@ -779,8 +780,12 @@ module cv32e40p_core .mult_multicycle_o(mult_multicycle), // to ID/EX pipe registers + .data_misaligned_ex_i(data_misaligned_ex), // from ID/EX pipeline + .data_misaligned_i (data_misaligned), + // FPU .fpu_fflags_we_o(fflags_we), + .fpu_fflags_o (fflags_csr), // APU .apu_en_i (apu_en_ex), @@ -788,14 +793,14 @@ module cv32e40p_core .apu_lat_i (apu_lat_ex), .apu_operands_i(apu_operands_ex), .apu_waddr_i (apu_waddr_ex), - .apu_flags_i (apu_flags_ex), - .apu_read_regs_i (apu_read_regs), - .apu_read_regs_valid_i (apu_read_regs_valid), - .apu_read_dep_o (apu_read_dep), - .apu_write_regs_i (apu_write_regs), - .apu_write_regs_valid_i(apu_write_regs_valid), - .apu_write_dep_o (apu_write_dep), + .apu_read_regs_i (apu_read_regs), + .apu_read_regs_valid_i (apu_read_regs_valid), + .apu_read_dep_o (apu_read_dep), + .apu_read_dep_for_jalr_o(apu_read_dep_for_jalr), + .apu_write_regs_i (apu_write_regs), + .apu_write_regs_valid_i (apu_write_regs_valid), + .apu_write_dep_o (apu_write_dep), .apu_perf_type_o(perf_apu_type), .apu_perf_cont_o(perf_apu_cont), @@ -813,6 +818,7 @@ module cv32e40p_core // response channel .apu_rvalid_i (apu_rvalid_i), .apu_result_i (apu_result_i), + .apu_flags_i (apu_flags_i), .lsu_en_i (data_req_ex), .lsu_rdata_i(lsu_rdata), @@ -901,8 +907,6 @@ module cv32e40p_core .data_misaligned_ex_i(data_misaligned_ex), // from ID/EX pipeline .data_misaligned_o (data_misaligned), - .apu_busy_i(apu_busy), - .p_elw_start_o (p_elw_start), .p_elw_finish_o(p_elw_finish), diff --git a/rtl/cv32e40p_ex_stage.sv b/rtl/cv32e40p_ex_stage.sv index 6b58a8425..08392f29f 100644 --- a/rtl/cv32e40p_ex_stage.sv +++ b/rtl/cv32e40p_ex_stage.sv @@ -76,8 +76,12 @@ module cv32e40p_ex_stage output 
logic mult_multicycle_o, + input logic data_misaligned_ex_i, + input logic data_misaligned_i, + // FPU signals output logic fpu_fflags_we_o, + output logic [APU_NUSFLAGS_CPU-1:0] fpu_fflags_o, // APU signals input logic apu_en_i, @@ -85,11 +89,12 @@ module cv32e40p_ex_stage input logic [ 1:0] apu_lat_i, input logic [ APU_NARGS_CPU-1:0][31:0] apu_operands_i, input logic [ 5:0] apu_waddr_i, - input logic [APU_NDSFLAGS_CPU-1:0] apu_flags_i, + input logic [APU_NUSFLAGS_CPU-1:0] apu_flags_i, input logic [2:0][5:0] apu_read_regs_i, input logic [2:0] apu_read_regs_valid_i, output logic apu_read_dep_o, + output logic apu_read_dep_for_jalr_o, input logic [1:0][5:0] apu_write_regs_i, input logic [1:0] apu_write_regs_valid_i, output logic apu_write_dep_o, @@ -143,7 +148,7 @@ module cv32e40p_ex_stage output logic branch_decision_o, // Stall Control - input logic is_decoding_i, // Used to mask data Dependency inside the APU dispatcher in case of an istruction non valid + input logic is_decoding_i, // Used to mask data Dependency inside the APU dispatcher in case of an istruction non valid input logic lsu_ready_ex_i, // EX part of LSU is done input logic lsu_err_i, @@ -152,29 +157,34 @@ module cv32e40p_ex_stage input logic wb_ready_i // WB stage ready for new data ); - logic [31:0] alu_result; - logic [31:0] mult_result; - logic alu_cmp_result; + logic [ 31:0] alu_result; + logic [ 31:0] mult_result; + logic alu_cmp_result; - logic regfile_we_lsu; - logic [ 5:0] regfile_waddr_lsu; + logic regfile_we_lsu; + logic [ 5:0] regfile_waddr_lsu; - logic wb_contention; - logic wb_contention_lsu; + logic wb_contention; + logic wb_contention_lsu; - logic alu_ready; - logic mult_ready; + logic alu_ready; + logic mulh_active; + logic mult_ready; // APU signals - logic apu_valid; - logic [ 5:0] apu_waddr; - logic [31:0] apu_result; - logic apu_stall; - logic apu_active; - logic apu_singlecycle; - logic apu_multicycle; - logic apu_req; - logic apu_gnt; + logic apu_valid; + logic [ 5:0] 
apu_waddr; + logic [ 31:0] apu_result; + logic apu_stall; + logic apu_active; + logic apu_singlecycle; + logic apu_multicycle; + logic apu_req; + logic apu_gnt; + + logic apu_rvalid_q; + logic [ 31:0] apu_result_q; + logic [APU_NUSFLAGS_CPU-1:0] apu_flags_q; // ALU write port mux always_comb begin @@ -295,9 +305,10 @@ module cv32e40p_ex_stage .result_o(mult_result), - .multicycle_o(mult_multicycle_o), - .ready_o (mult_ready), - .ex_ready_i (ex_ready_o) + .multicycle_o (mult_multicycle_o), + .mulh_active_o(mulh_active), + .ready_o (mult_ready), + .ex_ready_i (ex_ready_o) ); generate @@ -326,13 +337,14 @@ module cv32e40p_ex_stage .active_o(apu_active), .stall_o (apu_stall), - .is_decoding_i (is_decoding_i), - .read_regs_i (apu_read_regs_i), - .read_regs_valid_i (apu_read_regs_valid_i), - .read_dep_o (apu_read_dep_o), - .write_regs_i (apu_write_regs_i), - .write_regs_valid_i(apu_write_regs_valid_i), - .write_dep_o (apu_write_dep_o), + .is_decoding_i (is_decoding_i), + .read_regs_i (apu_read_regs_i), + .read_regs_valid_i (apu_read_regs_valid_i), + .read_dep_o (apu_read_dep_o), + .read_dep_for_jalr_o(apu_read_dep_for_jalr_o), + .write_regs_i (apu_write_regs_i), + .write_regs_valid_i (apu_write_regs_valid_i), + .write_dep_o (apu_write_dep_o), .perf_type_o(apu_perf_type_o), .perf_cont_o(apu_perf_cont_o), @@ -345,40 +357,61 @@ module cv32e40p_ex_stage .apu_rvalid_i(apu_valid) ); - assign apu_perf_wb_o = wb_contention | wb_contention_lsu; - assign apu_ready_wb_o = ~(apu_active | apu_en_i | apu_stall) | apu_valid; + assign apu_perf_wb_o = wb_contention | wb_contention_lsu; + assign apu_ready_wb_o = ~(apu_active | apu_en_i | apu_stall) | apu_valid; + + /////////////////////////////////////// + // APU result memorization Register // + /////////////////////////////////////// + always_ff @(posedge clk, negedge rst_n) begin : APU_Result_Memorization + if (~rst_n) begin + apu_rvalid_q <= 1'b0; + apu_result_q <= 'b0; + apu_flags_q <= 'b0; + end else begin + if (apu_rvalid_i && 
apu_multicycle && (data_misaligned_i || data_misaligned_ex_i || regfile_alu_we_i || (mulh_active && (mult_operator_i == MUL_H)))) begin + apu_rvalid_q <= 1'b1; + apu_result_q <= apu_result_i; + apu_flags_q <= apu_flags_i; + end else if (apu_rvalid_q && !(data_misaligned_i || data_misaligned_ex_i || regfile_alu_we_i || (mulh_active && (mult_operator_i == MUL_H)))) begin + apu_rvalid_q <= 1'b0; + end + end + end - assign apu_req_o = apu_req; - assign apu_gnt = apu_gnt_i; - assign apu_valid = apu_rvalid_i; - assign apu_operands_o = apu_operands_i; - assign apu_op_o = apu_op_i; - assign apu_result = apu_result_i; + assign apu_req_o = apu_req; + assign apu_gnt = apu_gnt_i; + assign apu_valid = (apu_multicycle && (data_misaligned_i || data_misaligned_ex_i || regfile_alu_we_i || (mulh_active && (mult_operator_i == MUL_H)))) ? 1'b0 : (apu_rvalid_i || apu_rvalid_q); + assign apu_operands_o = apu_operands_i; + assign apu_op_o = apu_op_i; + assign apu_result = apu_rvalid_q ? apu_result_q : apu_result_i; assign fpu_fflags_we_o = apu_valid; + assign fpu_fflags_o = apu_rvalid_q ? apu_flags_q : apu_flags_i; end else begin : gen_no_apu // default assignements for the case when no FPU/APU is attached. 
- assign apu_req_o = '0; - assign apu_operands_o[0] = '0; - assign apu_operands_o[1] = '0; - assign apu_operands_o[2] = '0; - assign apu_op_o = '0; - assign apu_req = 1'b0; - assign apu_gnt = 1'b0; - assign apu_result = 32'b0; - assign apu_valid = 1'b0; - assign apu_waddr = 6'b0; - assign apu_stall = 1'b0; - assign apu_active = 1'b0; - assign apu_ready_wb_o = 1'b1; - assign apu_perf_wb_o = 1'b0; - assign apu_perf_cont_o = 1'b0; - assign apu_perf_type_o = 1'b0; - assign apu_singlecycle = 1'b0; - assign apu_multicycle = 1'b0; - assign apu_read_dep_o = 1'b0; - assign apu_write_dep_o = 1'b0; - assign fpu_fflags_we_o = 1'b0; - + assign apu_req_o = '0; + assign apu_operands_o[0] = '0; + assign apu_operands_o[1] = '0; + assign apu_operands_o[2] = '0; + assign apu_op_o = '0; + assign apu_req = 1'b0; + assign apu_gnt = 1'b0; + assign apu_result = 32'b0; + assign apu_valid = 1'b0; + assign apu_waddr = 6'b0; + assign apu_stall = 1'b0; + assign apu_active = 1'b0; + assign apu_ready_wb_o = 1'b1; + assign apu_perf_wb_o = 1'b0; + assign apu_perf_cont_o = 1'b0; + assign apu_perf_type_o = 1'b0; + assign apu_singlecycle = 1'b0; + assign apu_multicycle = 1'b0; + assign apu_read_dep_o = 1'b0; + assign apu_read_dep_for_jalr_o = 1'b0; + assign apu_write_dep_o = 1'b0; + assign fpu_fflags_o = '0; + assign fpu_fflags_we_o = 1'b0; // keep the write-enable tied off when no FPU/APU is attached end endgenerate diff --git a/rtl/cv32e40p_id_stage.sv b/rtl/cv32e40p_id_stage.sv index c8d901a18..7b7f85cbd 100644 --- a/rtl/cv32e40p_id_stage.sv +++ b/rtl/cv32e40p_id_stage.sv @@ -146,6 +146,7 @@ module cv32e40p_id_stage output logic [2:0][5:0] apu_read_regs_o, output logic [2:0] apu_read_regs_valid_o, input logic apu_read_dep_i, + input logic apu_read_dep_for_jalr_i, output logic [1:0][5:0] apu_write_regs_o, output logic [1:0] apu_write_regs_valid_o, input logic apu_write_dep_i, @@ -804,6 +805,15 @@ module cv32e40p_id_stage // dependency checks always_comb begin unique case (alu_op_a_mux_sel) + OP_A_CURRPC: begin + if (ctrl_transfer_target_mux_sel == JT_JALR) begin + apu_read_regs[0] = 
regfile_addr_ra_id; + apu_read_regs_valid[0] = 1'b1; + end else begin // not a JALR: assign both outputs so always_comb infers no latch + apu_read_regs[0] = regfile_addr_ra_id; + apu_read_regs_valid[0] = 1'b0; + end + end // OP_A_CURRPC: OP_A_REGA_OR_FWD: begin apu_read_regs[0] = regfile_addr_ra_id; apu_read_regs_valid[0] = 1'b1; @@ -847,7 +857,7 @@ module cv32e40p_id_stage apu_read_regs_valid[2] = 1'b1; end OP_C_REGC_OR_FWD: begin - if (alu_op_a_mux_sel != OP_A_REGC_OR_FWD) begin + if ((alu_op_a_mux_sel != OP_A_REGC_OR_FWD) && (ctrl_transfer_target_mux_sel != JT_JALR)) begin apu_read_regs[2] = regfile_addr_rc_id; apu_read_regs_valid[2] = 1'b1; end else begin @@ -1089,7 +1099,8 @@ module cv32e40p_id_stage cv32e40p_controller #( .COREV_CLUSTER(COREV_CLUSTER), - .COREV_PULP (COREV_PULP) + .COREV_PULP (COREV_PULP), + .FPU (FPU) ) controller_i ( .clk (clk), // Gated clock .clk_ungated_i(clk_ungated_i), // Ungated clock @@ -1158,9 +1169,10 @@ module cv32e40p_id_stage .mult_multicycle_i(mult_multicycle_i), // APU - .apu_en_i (apu_en), - .apu_read_dep_i (apu_read_dep_i), - .apu_write_dep_i(apu_write_dep_i), + .apu_en_i (apu_en), + .apu_read_dep_i (apu_read_dep_i), + .apu_read_dep_for_jalr_i(apu_read_dep_for_jalr_i), + .apu_write_dep_i (apu_write_dep_i), .apu_stall_o(apu_stall), diff --git a/rtl/cv32e40p_load_store_unit.sv b/rtl/cv32e40p_load_store_unit.sv index 8df1d8498..7c08ffe11 100644 --- a/rtl/cv32e40p_load_store_unit.sv +++ b/rtl/cv32e40p_load_store_unit.sv @@ -59,8 +59,6 @@ module cv32e40p_load_store_unit #( input logic data_misaligned_ex_i, // misaligned access in last ld/st -> from ID/EX pipeline output logic data_misaligned_o, // misaligned access was detected -> to controller - input logic apu_busy_i, - input logic [5:0] data_atop_ex_i, // atomic instructions signal -> from ex stage output logic [5:0] data_atop_o, // atomic instruction signal -> core output @@ -76,8 +74,6 @@ module cv32e40p_load_store_unit #( localparam DEPTH = 2; // Maximum number of outstanding transactions - logic data_req_ex_filtered; // data request from ex stage filtered when it is misaligned and there is an on-going APU instruction - // 
Transaction request (to cv32e40p_obi_interface) logic trans_valid; logic trans_ready; @@ -352,14 +348,12 @@ module cv32e40p_load_store_unit #( // Busy if there are ongoing (or potentially outstanding) transfers assign busy_o = (cnt_q != 2'b00) || trans_valid; - assign data_req_ex_filtered = data_req_ex_i & !(apu_busy_i & (data_misaligned_o | data_misaligned_ex_i)); - ////////////////////////////////////////////////////////////////////////////// // Transaction request generation // // Assumes that corresponding response is at least 1 cycle after request // - // - Only request transaction when EX stage requires data transfer (data_req_ex_filtered), and + // - Only request transaction when EX stage requires data transfer (data_req_ex_i), and // - maximum number of outstanding transactions will not be exceeded (cnt_q < DEPTH) ////////////////////////////////////////////////////////////////////////////// @@ -376,12 +370,12 @@ module cv32e40p_load_store_unit #( // OBI compatible (avoids combinatorial path from data_rvalid_i to data_req_o). // Multiple trans_* transactions can be issued (and accepted) before a response // (resp_*) is received. - assign trans_valid = data_req_ex_filtered && (cnt_q < DEPTH); + assign trans_valid = data_req_ex_i && (cnt_q < DEPTH); end else begin : gen_pulp_obi // Legacy PULP OBI behavior, i.e. only issue subsequent transaction if preceding transfer // is about to finish (re-introducing timing critical path from data_rvalid_i to data_req_o) - assign trans_valid = (cnt_q == 2'b00) ? data_req_ex_filtered && (cnt_q < DEPTH) : - data_req_ex_filtered && (cnt_q < DEPTH) && resp_valid; + assign trans_valid = (cnt_q == 2'b00) ? 
data_req_ex_i && (cnt_q < DEPTH) : + data_req_ex_i && (cnt_q < DEPTH) && resp_valid; end endgenerate @@ -391,7 +385,7 @@ module cv32e40p_load_store_unit #( // LSU EX stage readyness requires two criteria to be met: // - // - A data request (data_req_ex_filtered) has been forwarded/accepted (trans_valid && trans_ready) + // - A data request (data_req_ex_i) has been forwarded/accepted (trans_valid && trans_ready) // - The LSU WB stage is available such that EX and WB can be updated in lock step // // Default (if there is not even a data request) LSU EX is signaled to be ready, else @@ -400,11 +394,10 @@ module cv32e40p_load_store_unit #( // in case there is already at least one outstanding transaction (so WB is full) the EX // and WB stage can only signal readiness in lock step (so resp_valid is used as well). - assign lsu_ready_ex_o = !(apu_busy_i & (data_misaligned_o | data_misaligned_ex_i)) & - ((data_req_ex_i == 1'b0) ? 1'b1 : - (cnt_q == 2'b00) ? ( trans_valid && trans_ready) : - (cnt_q == 2'b01) ? (resp_valid && trans_valid && trans_ready) : - resp_valid); + assign lsu_ready_ex_o = (data_req_ex_i == 1'b0) ? 1'b1 : + (cnt_q == 2'b00) ? ( trans_valid && trans_ready) : + (cnt_q == 2'b01) ? 
(resp_valid && trans_valid && trans_ready) : + resp_valid; // Update signals for EX/WB registers (when EX has valid data itself and is ready for next) assign ctrl_update = lsu_ready_ex_o && data_req_ex_i; diff --git a/rtl/cv32e40p_mult.sv b/rtl/cv32e40p_mult.sv index ea0da1937..afdc5e2fc 100644 --- a/rtl/cv32e40p_mult.sv +++ b/rtl/cv32e40p_mult.sv @@ -55,6 +55,7 @@ module cv32e40p_mult output logic [31:0] result_o, output logic multicycle_o, + output logic mulh_active_o, output logic ready_o, input logic ex_ready_i ); @@ -87,7 +88,6 @@ module cv32e40p_mult logic [ 1:0] mulh_signed; logic mulh_shift_arith; logic mulh_carry_q; - logic mulh_active; logic mulh_save; logic mulh_clearcarry; logic mulh_ready; @@ -105,7 +105,7 @@ module cv32e40p_mult assign short_op_a[16] = short_signed[0] & short_op_a[15]; assign short_op_b[16] = short_signed[1] & short_op_b[15]; - assign short_op_c = mulh_active ? $signed({mulh_carry_q, op_c_i}) : $signed(op_c_i); + assign short_op_c = mulh_active_o ? $signed({mulh_carry_q, op_c_i}) : $signed(op_c_i); assign short_mul = $signed(short_op_a) * $signed(short_op_b); assign short_mac = $signed(short_op_c) + $signed(short_mul) + $signed(short_round); @@ -116,13 +116,13 @@ module cv32e40p_mult ) >>> short_imm; // choose between normal short multiplication operation and mulh operation - assign short_imm = mulh_active ? mulh_imm : imm_i; - assign short_subword = mulh_active ? mulh_subword : {2{short_subword_i}}; - assign short_signed = mulh_active ? mulh_signed : short_signed_i; - assign short_shift_arith = mulh_active ? mulh_shift_arith : short_signed_i[0]; + assign short_imm = mulh_active_o ? mulh_imm : imm_i; + assign short_subword = mulh_active_o ? mulh_subword : {2{short_subword_i}}; + assign short_signed = mulh_active_o ? mulh_signed : short_signed_i; + assign short_shift_arith = mulh_active_o ? mulh_shift_arith : short_signed_i[0]; - assign short_mac_msb1 = mulh_active ? 
short_mac[33] : short_mac[31]; - assign short_mac_msb0 = mulh_active ? short_mac[32] : short_mac[31]; + assign short_mac_msb1 = mulh_active_o ? short_mac[33] : short_mac[31]; + assign short_mac_msb0 = mulh_active_o ? short_mac[32] : short_mac[31]; always_comb begin @@ -132,16 +132,16 @@ module cv32e40p_mult mulh_signed = 2'b00; mulh_shift_arith = 1'b0; mulh_ready = 1'b0; - mulh_active = 1'b1; + mulh_active_o = 1'b1; mulh_save = 1'b0; mulh_clearcarry = 1'b0; multicycle_o = 1'b0; case (mulh_CS) IDLE_MULT: begin - mulh_active = 1'b0; - mulh_ready = 1'b1; - mulh_save = 1'b0; + mulh_active_o = 1'b0; + mulh_ready = 1'b1; + mulh_save = 1'b0; if ((operator_i == MUL_H) && enable_i) begin mulh_ready = 1'b0; mulh_NS = STEP0; @@ -149,12 +149,12 @@ module cv32e40p_mult end STEP0: begin - multicycle_o = 1'b1; - mulh_imm = 5'd16; - mulh_active = 1'b1; + multicycle_o = 1'b1; + mulh_imm = 5'd16; + mulh_active_o = 1'b1; //AL*BL never overflows - mulh_save = 1'b0; - mulh_NS = STEP1; + mulh_save = 1'b0; + mulh_NS = STEP1; //Here always a 32'b unsigned result (no carry) end diff --git a/rtl/vendor/pulp_platform_fpnew.lock.hjson b/rtl/vendor/pulp_platform_fpnew.lock.hjson index e150bcb2f..de40549d5 100644 --- a/rtl/vendor/pulp_platform_fpnew.lock.hjson +++ b/rtl/vendor/pulp_platform_fpnew.lock.hjson @@ -9,6 +9,6 @@ upstream: { url: https://github.com/pulp-platform/fpnew.git - rev: 11659d7ff3580ac3226c6d56a90ef717cdc530e3 + rev: 79e453139072df42c9ec8f697132ba485d74e23d } } diff --git a/rtl/vendor/pulp_platform_fpnew.vendor.hjson b/rtl/vendor/pulp_platform_fpnew.vendor.hjson index e76745d51..1fe09cca5 100644 --- a/rtl/vendor/pulp_platform_fpnew.vendor.hjson +++ b/rtl/vendor/pulp_platform_fpnew.vendor.hjson @@ -7,7 +7,7 @@ upstream: { url: "https://github.com/pulp-platform/fpnew.git", - rev: "11659d7ff3580ac3226c6d56a90ef717cdc530e3", + rev: "79e453139072df42c9ec8f697132ba485d74e23d", }, exclude_from_upstream: [ diff --git a/rtl/vendor/pulp_platform_fpnew/src/fpnew_cast_multi.sv 
b/rtl/vendor/pulp_platform_fpnew/src/fpnew_cast_multi.sv index 964ef7429..7abe33043 100644 --- a/rtl/vendor/pulp_platform_fpnew/src/fpnew_cast_multi.sv +++ b/rtl/vendor/pulp_platform_fpnew/src/fpnew_cast_multi.sv @@ -443,7 +443,11 @@ module fpnew_cast_multi #( // By default right shift mantissa to be an integer denorm_shamt = unsigned'(MAX_INT_WIDTH - 1 - input_exp_q); // overflow: when converting to unsigned the range is larger by one - if (input_exp_q >= signed'(fpnew_pkg::int_width(int_fmt_q2) - 1 + op_mod_q2)) begin + if ((input_exp_q >= signed'(fpnew_pkg::int_width(int_fmt_q2) - 1 + op_mod_q2)) // Exponent larger than max int range, + && !(!op_mod_q2 // unless cast to signed int + && input_sign_q // and input value is larges negative int value + && (input_exp_q == signed'(fpnew_pkg::int_width(int_fmt_q2) - 1)) + && (input_mant_q == {1'b1, {INT_MAN_WIDTH-1{1'b0}}}))) begin denorm_shamt = '0; // prevent shifting of_before_round = 1'b1; // underflow diff --git a/rtl/vendor/pulp_platform_fpnew/src/fpnew_divsqrt_multi.sv b/rtl/vendor/pulp_platform_fpnew/src/fpnew_divsqrt_multi.sv index a8b004952..56a2f5d62 100644 --- a/rtl/vendor/pulp_platform_fpnew/src/fpnew_divsqrt_multi.sv +++ b/rtl/vendor/pulp_platform_fpnew/src/fpnew_divsqrt_multi.sv @@ -207,7 +207,7 @@ module fpnew_divsqrt_multi #( // Valid synch with other lanes // When one divsqrt unit completes an operation, keep its done high, waiting for the other lanes // As soon as all the lanes are over, we can clear this FF and start with a new operation - `FFLARNC(unit_done_q, unit_done, unit_done, simd_synch_done, 1'b0, clk_i, rst_ni); + `FFLARNC(unit_done_q, unit_done, unit_done, simd_synch_done, 1'b0, clk_i, rst_ni) // Tell the other units that this unit has finished now or in the past assign divsqrt_done_o = (unit_done_q | unit_done) & result_vec_op_q; diff --git a/rtl/vendor/pulp_platform_fpnew/src/fpnew_fma.sv b/rtl/vendor/pulp_platform_fpnew/src/fpnew_fma.sv index 051e6a698..6fdd89056 100644 --- 
a/rtl/vendor/pulp_platform_fpnew/src/fpnew_fma.sv +++ b/rtl/vendor/pulp_platform_fpnew/src/fpnew_fma.sv @@ -613,7 +613,9 @@ module fpnew_fma #( ); // Classification after rounding - assign uf_after_round = rounded_abs[EXP_BITS+MAN_BITS-1:MAN_BITS] == '0; // exponent = 0 + assign uf_after_round = (rounded_abs[EXP_BITS+MAN_BITS-1:MAN_BITS] == '0) // denormal + || ((pre_round_abs[EXP_BITS+MAN_BITS-1:MAN_BITS] == '0) && (rounded_abs[EXP_BITS+MAN_BITS-1:MAN_BITS] == 1) && + ((round_sticky_bits != 2'b11) || (!sum_sticky_bits[MAN_BITS*2 + 4] && ((rnd_mode_i == fpnew_pkg::RNE) || (rnd_mode_i == fpnew_pkg::RMM))))); assign of_after_round = rounded_abs[EXP_BITS+MAN_BITS-1:MAN_BITS] == '1; // exponent all ones // ----------------- diff --git a/rtl/vendor/pulp_platform_fpnew/src/fpnew_fma_multi.sv b/rtl/vendor/pulp_platform_fpnew/src/fpnew_fma_multi.sv index e691f6777..471d966f0 100644 --- a/rtl/vendor/pulp_platform_fpnew/src/fpnew_fma_multi.sv +++ b/rtl/vendor/pulp_platform_fpnew/src/fpnew_fma_multi.sv @@ -745,8 +745,10 @@ module fpnew_fma_multi #( if (FpFmtConfig[fmt]) begin : active_format always_comb begin : post_process - // detect of / uf - fmt_uf_after_round[fmt] = rounded_abs[EXP_BITS+MAN_BITS-1:MAN_BITS] == '0; // denormal + // detect of / uf + fmt_uf_after_round[fmt] = (rounded_abs[EXP_BITS+MAN_BITS-1:MAN_BITS] == '0) // denormal + || ((pre_round_abs[EXP_BITS+MAN_BITS-1:MAN_BITS] == '0) && (rounded_abs[EXP_BITS+MAN_BITS-1:MAN_BITS] == 1) && + ((round_sticky_bits != 2'b11) || (!sum_sticky_bits[MAN_BITS*2 + 4] && ((rnd_mode_i == fpnew_pkg::RNE) || (rnd_mode_i == fpnew_pkg::RMM))))); fmt_of_after_round[fmt] = rounded_abs[EXP_BITS+MAN_BITS-1:MAN_BITS] == '1; // inf exp. // Assemble regular result, nan box short ones. 
diff --git a/rtl/vendor/pulp_platform_fpnew/vendor/opene906/E906_RTL_FACTORY/gen_rtl/fdsu/rtl/pa_fdsu_pack_single.v b/rtl/vendor/pulp_platform_fpnew/vendor/opene906/E906_RTL_FACTORY/gen_rtl/fdsu/rtl/pa_fdsu_pack_single.v index 87139a253..d22e85ba9 100644 --- a/rtl/vendor/pulp_platform_fpnew/vendor/opene906/E906_RTL_FACTORY/gen_rtl/fdsu/rtl/pa_fdsu_pack_single.v +++ b/rtl/vendor/pulp_platform_fpnew/vendor/opene906/E906_RTL_FACTORY/gen_rtl/fdsu/rtl/pa_fdsu_pack_single.v @@ -222,7 +222,7 @@ end assign ex4_rst_norm[31:0] = {fdsu_ex4_result_sign, ex4_expnt_rst[7:0], ex4_frac_23[22:0]}; -assign ex4_cor_uf = (fdsu_ex4_uf && !ex4_denorm_potnt_norm || ex4_uf_plus) +assign ex4_cor_uf = (fdsu_ex4_uf || ex4_denorm_potnt_norm || ex4_uf_plus) && fdsu_ex4_nx; assign ex4_cor_nx = fdsu_ex4_nx || fdsu_ex4_of