From 7e938293e05f945eddbad8806d75c84311414a3e Mon Sep 17 00:00:00 2001 From: Cesar Fuguet Date: Tue, 21 Mar 2023 16:30:46 +0100 Subject: [PATCH] Support multiple outstanding load operations to the Dcache The ID in the request from the load/store unit must be mirrored by the Dcache in the response. This allows a given response to be matched to its corresponding request. Responses can be given (by the Dcache) in a different order than that of the requests. This modification introduces a pending load table that tracks outstanding load operations to the Dcache. The depth of this table is a parameter in the target configuration package. Signed-off-by: Cesar Fuguet --- core/cache_subsystem/cache_ctrl.sv | 5 +- core/cache_subsystem/wt_dcache_ctrl.sv | 6 +- core/include/ariane_pkg.sv | 3 + core/include/cv32a60x_config_pkg.sv | 1 + core/include/cv32a6_embedded_config_pkg.sv | 1 + .../cv32a6_ima_sv32_fpga_config_pkg.sv | 1 + core/include/cv32a6_imac_sv0_config_pkg.sv | 1 + core/include/cv32a6_imac_sv32_config_pkg.sv | 1 + core/include/cv32a6_imafc_sv32_config_pkg.sv | 1 + .../cv64a6_imadfcv_sv39_polara_config_pkg.sv | 1 + core/include/cv64a6_imafdc_sv39_config_pkg.sv | 1 + ...cv64a6_imafdc_sv39_openpiton_config_pkg.sv | 1 + .../include/cv64a6_imafdcv_sv39_config_pkg.sv | 1 + core/load_unit.sv | 255 ++++++++++++------ util/config_pkg_generator.py | 3 + 15 files changed, 195 insertions(+), 87 deletions(-) diff --git a/core/cache_subsystem/cache_ctrl.sv b/core/cache_subsystem/cache_ctrl.sv index 39bc67d101e..c608a41d1b4 100644 --- a/core/cache_subsystem/cache_ctrl.sv +++ b/core/cache_subsystem/cache_ctrl.sv @@ -74,6 +74,7 @@ module cache_ctrl import ariane_pkg::*; import std_cache_pkg::*; #( typedef struct packed { logic [DCACHE_INDEX_WIDTH-1:0] index; logic [DCACHE_TAG_WIDTH-1:0] tag; + logic [DCACHE_TID_WIDTH-1:0] id; logic [7:0] be; logic [1:0] size; logic we; @@ -116,7 +117,7 @@ module cache_ctrl import ariane_pkg::*; import std_cache_pkg::*; #( req_port_o.data_gnt = 1'b0; 
req_port_o.data_rvalid = 1'b0; req_port_o.data_rdata = '0; - req_port_o.data_rid = '0; + req_port_o.data_rid = mem_req_q.id; miss_req_o = '0; mshr_addr_o = '0; // Memory array communication @@ -138,6 +139,7 @@ module cache_ctrl import ariane_pkg::*; import std_cache_pkg::*; #( // save index, be and we mem_req_d.index = req_port_i.address_index; + mem_req_d.id = req_port_i.data_id; mem_req_d.be = req_port_i.data_be; mem_req_d.size = req_port_i.data_size; mem_req_d.we = req_port_i.data_we; @@ -186,6 +188,7 @@ module cache_ctrl import ariane_pkg::*; import std_cache_pkg::*; #( if (req_port_i.data_req && !mem_req_q.we && !flush_i) begin state_d = WAIT_TAG; // switch back to WAIT_TAG mem_req_d.index = req_port_i.address_index; + mem_req_d.id = req_port_i.data_id; mem_req_d.be = req_port_i.data_be; mem_req_d.size = req_port_i.data_size; mem_req_d.we = req_port_i.data_we; diff --git a/core/cache_subsystem/wt_dcache_ctrl.sv b/core/cache_subsystem/wt_dcache_ctrl.sv index a6a14cd1b6c..c7fc3d25d50 100644 --- a/core/cache_subsystem/wt_dcache_ctrl.sv +++ b/core/cache_subsystem/wt_dcache_ctrl.sv @@ -59,6 +59,7 @@ module wt_dcache_ctrl import ariane_pkg::*; import wt_cache_pkg::*; #( logic [DCACHE_TAG_WIDTH-1:0] address_tag_d, address_tag_q; logic [DCACHE_CL_IDX_WIDTH-1:0] address_idx_d, address_idx_q; logic [DCACHE_OFFSET_WIDTH-1:0] address_off_d, address_off_q; + logic [DCACHE_TID_WIDTH-1:0] id_d, id_q; logic [DCACHE_SET_ASSOC-1:0] vld_data_d, vld_data_q; logic save_tag, rd_req_d, rd_req_q, rd_ack_d, rd_ack_q; logic [1:0] data_size_d, data_size_q; @@ -72,6 +73,7 @@ module wt_dcache_ctrl import ariane_pkg::*; import wt_cache_pkg::*; #( assign address_tag_d = (save_tag) ? req_port_i.address_tag : address_tag_q; assign address_idx_d = (req_port_o.data_gnt) ? req_port_i.address_index[DCACHE_INDEX_WIDTH-1:DCACHE_OFFSET_WIDTH] : address_idx_q; assign address_off_d = (req_port_o.data_gnt) ? 
req_port_i.address_index[DCACHE_OFFSET_WIDTH-1:0] : address_off_q; + assign id_d = (req_port_o.data_gnt) ? req_port_i.data_id : id_q; assign data_size_d = (req_port_o.data_gnt) ? req_port_i.data_size : data_size_q; assign rd_tag_o = address_tag_d; assign rd_idx_o = address_idx_d; @@ -79,7 +81,7 @@ module wt_dcache_ctrl import ariane_pkg::*; import wt_cache_pkg::*; #( assign req_port_o.data_rdata = rd_data_i; assign req_port_o.data_ruser = rd_user_i; - assign req_port_o.data_rid = '0; + assign req_port_o.data_rid = id_q; // to miss unit assign miss_vld_bits_o = vld_data_q; @@ -240,6 +242,7 @@ module wt_dcache_ctrl import ariane_pkg::*; import wt_cache_pkg::*; #( address_tag_q <= '0; address_idx_q <= '0; address_off_q <= '0; + id_q <= '0; vld_data_q <= '0; data_size_q <= '0; rd_req_q <= '0; @@ -249,6 +252,7 @@ module wt_dcache_ctrl import ariane_pkg::*; import wt_cache_pkg::*; #( address_tag_q <= address_tag_d; address_idx_q <= address_idx_d; address_off_q <= address_off_d; + id_q <= id_d; vld_data_q <= vld_data_d; data_size_q <= data_size_d; rd_req_q <= rd_req_d; diff --git a/core/include/ariane_pkg.sv b/core/include/ariane_pkg.sv index d61d98cb6a3..00e95daed01 100644 --- a/core/include/ariane_pkg.sv +++ b/core/include/ariane_pkg.sv @@ -667,6 +667,9 @@ package ariane_pkg; logic vfp; // is this a vector floating-point instruction? 
} scoreboard_entry_t; + // Maximum number of inflight memory load requests + localparam int unsigned NR_LOAD_BUFFER_ENTRIES = cva6_config_pkg::CVA6ConfigNrLoadBufEntries; + // --------------- // MMU instanciation // --------------- diff --git a/core/include/cv32a60x_config_pkg.sv b/core/include/cv32a60x_config_pkg.sv index 32645f90b73..bbd7f576601 100644 --- a/core/include/cv32a60x_config_pkg.sv +++ b/core/include/cv32a60x_config_pkg.sv @@ -58,6 +58,7 @@ package cva6_config_pkg; localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; + localparam CVA6ConfigNrLoadBufEntries = 2; localparam CVA6ConfigInstrTlbEntries = 2; localparam CVA6ConfigDataTlbEntries = 2; diff --git a/core/include/cv32a6_embedded_config_pkg.sv b/core/include/cv32a6_embedded_config_pkg.sv index d5066a1ad15..749241e7deb 100644 --- a/core/include/cv32a6_embedded_config_pkg.sv +++ b/core/include/cv32a6_embedded_config_pkg.sv @@ -57,6 +57,7 @@ package cva6_config_pkg; localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; + localparam CVA6ConfigNrLoadBufEntries = 2; localparam CVA6ConfigInstrTlbEntries = 2; localparam CVA6ConfigDataTlbEntries = 2; diff --git a/core/include/cv32a6_ima_sv32_fpga_config_pkg.sv b/core/include/cv32a6_ima_sv32_fpga_config_pkg.sv index 7e85052e88b..ed91ecd7d8e 100644 --- a/core/include/cv32a6_ima_sv32_fpga_config_pkg.sv +++ b/core/include/cv32a6_ima_sv32_fpga_config_pkg.sv @@ -58,6 +58,7 @@ package cva6_config_pkg; localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; + localparam CVA6ConfigNrLoadBufEntries = 2; localparam CVA6ConfigInstrTlbEntries = 2; localparam CVA6ConfigDataTlbEntries = 2; diff --git a/core/include/cv32a6_imac_sv0_config_pkg.sv b/core/include/cv32a6_imac_sv0_config_pkg.sv index 7da2cc30111..e3c3370e61e 100644 --- a/core/include/cv32a6_imac_sv0_config_pkg.sv +++ b/core/include/cv32a6_imac_sv0_config_pkg.sv @@ -53,6 +53,7 @@ package cva6_config_pkg; localparam 
CVA6ConfigNrCommitPorts = 2; localparam CVA6ConfigNrScoreboardEntries = 8; + localparam CVA6ConfigNrLoadBufEntries = 2; localparam CVA6ConfigFPGAEn = 0; diff --git a/core/include/cv32a6_imac_sv32_config_pkg.sv b/core/include/cv32a6_imac_sv32_config_pkg.sv index 92a2a17dd99..55715b92580 100644 --- a/core/include/cv32a6_imac_sv32_config_pkg.sv +++ b/core/include/cv32a6_imac_sv32_config_pkg.sv @@ -58,6 +58,7 @@ package cva6_config_pkg; localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; + localparam CVA6ConfigNrLoadBufEntries = 2; localparam CVA6ConfigInstrTlbEntries = 2; localparam CVA6ConfigDataTlbEntries = 2; diff --git a/core/include/cv32a6_imafc_sv32_config_pkg.sv b/core/include/cv32a6_imafc_sv32_config_pkg.sv index c9426ff0bb2..3808b176643 100644 --- a/core/include/cv32a6_imafc_sv32_config_pkg.sv +++ b/core/include/cv32a6_imafc_sv32_config_pkg.sv @@ -58,6 +58,7 @@ package cva6_config_pkg; localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; + localparam CVA6ConfigNrLoadBufEntries = 2; localparam CVA6ConfigInstrTlbEntries = 2; localparam CVA6ConfigDataTlbEntries = 2; diff --git a/core/include/cv64a6_imadfcv_sv39_polara_config_pkg.sv b/core/include/cv64a6_imadfcv_sv39_polara_config_pkg.sv index e2237bd59db..ddd05423381 100644 --- a/core/include/cv64a6_imadfcv_sv39_polara_config_pkg.sv +++ b/core/include/cv64a6_imadfcv_sv39_polara_config_pkg.sv @@ -58,6 +58,7 @@ package cva6_config_pkg; localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; + localparam CVA6ConfigNrLoadBufEntries = 2; localparam CVA6ConfigInstrTlbEntries = 16; localparam CVA6ConfigDataTlbEntries = 16; diff --git a/core/include/cv64a6_imafdc_sv39_config_pkg.sv b/core/include/cv64a6_imafdc_sv39_config_pkg.sv index 2c31bd0e384..c0426417ef5 100644 --- a/core/include/cv64a6_imafdc_sv39_config_pkg.sv +++ b/core/include/cv64a6_imafdc_sv39_config_pkg.sv @@ -58,6 +58,7 @@ package cva6_config_pkg; localparam 
CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; + localparam CVA6ConfigNrLoadBufEntries = 2; localparam CVA6ConfigInstrTlbEntries = 16; localparam CVA6ConfigDataTlbEntries = 16; diff --git a/core/include/cv64a6_imafdc_sv39_openpiton_config_pkg.sv b/core/include/cv64a6_imafdc_sv39_openpiton_config_pkg.sv index c3abdca58ea..2f760abca0f 100644 --- a/core/include/cv64a6_imafdc_sv39_openpiton_config_pkg.sv +++ b/core/include/cv64a6_imafdc_sv39_openpiton_config_pkg.sv @@ -58,6 +58,7 @@ package cva6_config_pkg; localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; + localparam CVA6ConfigNrLoadBufEntries = 2; localparam CVA6ConfigInstrTlbEntries = 16; localparam CVA6ConfigDataTlbEntries = 16; diff --git a/core/include/cv64a6_imafdcv_sv39_config_pkg.sv b/core/include/cv64a6_imafdcv_sv39_config_pkg.sv index e89904b5464..e0bc5cbbda4 100644 --- a/core/include/cv64a6_imafdcv_sv39_config_pkg.sv +++ b/core/include/cv64a6_imafdcv_sv39_config_pkg.sv @@ -58,6 +58,7 @@ package cva6_config_pkg; localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; + localparam CVA6ConfigNrLoadBufEntries = 2; localparam CVA6ConfigInstrTlbEntries = 16; localparam CVA6ConfigDataTlbEntries = 16; diff --git a/core/load_unit.sv b/core/load_unit.sv index 6ad02fd05cf..73d646c5b73 100644 --- a/core/load_unit.sv +++ b/core/load_unit.sv @@ -12,6 +12,11 @@ // Michael Schaffner , ETH Zurich // Date: 15.08.2018 // Description: Load Unit, takes care of all load requests +// +// Contributor: Cesar Fuguet , CEA List +// Date: August 29, 2023 +// Modification: add support for multiple outstanding load operations +// to the data cache module load_unit import ariane_pkg::*; #( parameter config_pkg::cva6_cfg_t CVA6Cfg = config_pkg::cva6_cfg_empty, @@ -50,13 +55,98 @@ module load_unit import ariane_pkg::*; #( ABORT_TRANSACTION, ABORT_TRANSACTION_NI, WAIT_TRANSLATION, WAIT_FLUSH, WAIT_WB_EMPTY } state_d, state_q; - // in order to decouple the 
response interface from the request interface we need a - // a queue which can hold all outstanding memory requests - struct packed { - logic [TRANS_ID_BITS-1:0] trans_id; - logic [riscv::XLEN_ALIGN_BYTES-1:0] address_offset; - fu_op operation; - } load_data_d, load_data_q, in_data; + + // in order to decouple the response interface from the request interface, + // we need a buffer which can hold all inflight memory load requests + typedef struct packed { + logic [TRANS_ID_BITS-1:0] trans_id; // scoreboard identifier + logic [riscv::XLEN_ALIGN_BYTES-1:0] address_offset; // least significant bits of the address + fu_op operation; // type of load + } ldbuf_t; + + + // to support a throughput of one load per cycle, if the number of entries + // of the load buffer is 1, implement a fall-through mode. This however + // adds a combinational path between the request and response interfaces + // towards the cache. + localparam logic LDBUF_FALLTHROUGH = (NR_LOAD_BUFFER_ENTRIES == 1); + + localparam int unsigned REQ_ID_BITS = NR_LOAD_BUFFER_ENTRIES > 1 ? 
$clog2(NR_LOAD_BUFFER_ENTRIES) : 1; + typedef logic [REQ_ID_BITS-1:0] ldbuf_id_t; + + logic [NR_LOAD_BUFFER_ENTRIES-1:0] ldbuf_valid_q, ldbuf_valid_d; + logic [NR_LOAD_BUFFER_ENTRIES-1:0] ldbuf_flushed_q, ldbuf_flushed_d; + ldbuf_t [NR_LOAD_BUFFER_ENTRIES-1:0] ldbuf_q; + logic ldbuf_empty, ldbuf_full; + ldbuf_id_t ldbuf_free_index; + logic ldbuf_w; + ldbuf_t ldbuf_wdata; + ldbuf_id_t ldbuf_windex; + logic ldbuf_r; + ldbuf_t ldbuf_rdata; + ldbuf_id_t ldbuf_rindex; + ldbuf_id_t ldbuf_last_id_q; + + assign ldbuf_full = &ldbuf_valid_q; + + // + // buffer of outstanding loads + + // write in the first available slot + generate + if (NR_LOAD_BUFFER_ENTRIES > 1) begin : ldbuf_free_index_multi_gen + lzc #( + .WIDTH (NR_LOAD_BUFFER_ENTRIES), + .MODE (1'b0) // Count leading zeros + ) lzc_windex_i ( + .in_i (~ldbuf_valid_q), + .cnt_o (ldbuf_free_index), + .empty_o (ldbuf_empty) + ); + end else begin : ldbuf_free_index_single_gen + assign ldbuf_free_index = 1'b0; + end + endgenerate + + assign ldbuf_windex = (LDBUF_FALLTHROUGH && ldbuf_r) ? ldbuf_rindex : ldbuf_free_index; + + always_comb + begin : ldbuf_comb + ldbuf_flushed_d = ldbuf_flushed_q; + ldbuf_valid_d = ldbuf_valid_q; + + // In case of flush, raise the flushed flag in all slots. 
+ if (flush_i) begin + ldbuf_flushed_d = '1; + end + // Free read entry (in the case of fall-through mode, free the entry + // only if there is no pending load) + if (ldbuf_r && (!LDBUF_FALLTHROUGH || !ldbuf_w)) begin + ldbuf_valid_d[ldbuf_rindex] = 1'b0; + end + // Track a new outstanding operation in the load buffer + if (ldbuf_w) begin + ldbuf_flushed_d[ldbuf_windex] = 1'b0; + ldbuf_valid_d[ldbuf_windex] = 1'b1; + end + end + + always_ff @(posedge clk_i or negedge rst_ni) + begin : ldbuf_ff + if (!rst_ni) begin + ldbuf_flushed_q <= '0; + ldbuf_valid_q <= '0; + ldbuf_last_id_q <= '0; + ldbuf_q <= '0; + end else begin + ldbuf_flushed_q <= ldbuf_flushed_d; + ldbuf_valid_q <= ldbuf_valid_d; + if (ldbuf_w) begin + ldbuf_last_id_q <= ldbuf_windex; + ldbuf_q[ldbuf_windex] <= ldbuf_wdata; + end + end + end // page offset is defined as the lower 12 bits, feed through for address checker assign page_offset_o = lsu_ctrl_i.vaddr[11:0]; @@ -65,8 +155,8 @@ module load_unit import ariane_pkg::*; #( // this is a read-only interface so set the write enable to 0 assign req_port_o.data_we = 1'b0; assign req_port_o.data_wdata = '0; - // compose the queue data, control is handled in the FSM - assign in_data = {lsu_ctrl_i.trans_id, lsu_ctrl_i.vaddr[riscv::XLEN_ALIGN_BYTES-1:0], lsu_ctrl_i.operation}; + // compose the load buffer write data, control is handled in the FSM + assign ldbuf_wdata = {lsu_ctrl_i.trans_id, lsu_ctrl_i.vaddr[riscv::XLEN_ALIGN_BYTES-1:0], lsu_ctrl_i.operation}; // output address // we can now output the lower 12 bit as the index to the cache assign req_port_o.address_index = lsu_ctrl_i.vaddr[ariane_pkg::DCACHE_INDEX_WIDTH-1:0]; @@ -74,8 +164,8 @@ module load_unit import ariane_pkg::*; #( assign req_port_o.address_tag = paddr_i[ariane_pkg::DCACHE_TAG_WIDTH + ariane_pkg::DCACHE_INDEX_WIDTH-1 : ariane_pkg::DCACHE_INDEX_WIDTH]; - // we only issue one single request at a time - assign req_port_o.data_id = '0; + // request id = index of the load buffer's entry + 
assign req_port_o.data_id = ldbuf_windex; // directly forward exception fields (valid bit is set below) assign ex_o.cause = ex_i.cause; assign ex_o.tval = ex_i.tval; @@ -94,9 +184,10 @@ module load_unit import ariane_pkg::*; #( // Load Control // --------------- always_comb begin : load_control + automatic logic accept_req; + // default assignments state_d = state_q; - load_data_d = load_data_q; translation_req_o = 1'b0; req_port_o.data_req = 1'b0; // tag control @@ -106,10 +197,14 @@ module load_unit import ariane_pkg::*; #( req_port_o.data_size = extract_transfer_size(lsu_ctrl_i.operation); pop_ld_o = 1'b0; + // In IDLE and SEND_TAG states, this unit can accept a new load request + // when the load buffer is not full or if there is a response and the + // load buffer is in fall-through mode + accept_req = (valid_i && (!ldbuf_full || (LDBUF_FALLTHROUGH && ldbuf_r))); + case (state_q) IDLE: begin - // we've got a new load request - if (valid_i) begin + if (accept_req) begin // start the translation process even though we do not know if the addresses match // this should ease timing translation_req_o = 1'b1; @@ -168,6 +263,14 @@ module load_unit import ariane_pkg::*; #( // we've got a hit and we can continue with the request process if (dtlb_hit_i) state_d = WAIT_GNT; + + // we got an exception + if (ex_i.valid) begin + // the next state will be the idle state + state_d = IDLE; + // pop load - but only if we are not getting an rvalid in here - otherwise we will over-write an incoming transaction + pop_ld_o = ~req_port_i.data_rvalid; + end end WAIT_GNT: begin @@ -195,8 +298,8 @@ module load_unit import ariane_pkg::*; #( SEND_TAG: begin req_port_o.tag_valid = 1'b1; state_d = IDLE; - // we can make a new request here if we got one - if (valid_i) begin + + if (accept_req) begin // start the translation process even though we do not know if the addresses match // this should ease timing translation_req_o = 1'b1; @@ -246,39 +349,33 @@ module load_unit import 
ariane_pkg::*; #( default: state_d = IDLE; endcase - // we got an exception - if (ex_i.valid && valid_i) begin - // the next state will be the idle state - state_d = IDLE; - // pop load - but only if we are not getting an rvalid in here - otherwise we will over-write an incoming transaction - if (!req_port_i.data_rvalid) - pop_ld_o = 1'b1; - end - - // save the load data for later usage -> we should not clutter the load_data register - if (pop_ld_o && !ex_i.valid) begin - load_data_d = in_data; - end - // if we just flushed and the queue is not empty or we are getting an rvalid this cycle wait in a extra stage if (flush_i) begin state_d = WAIT_FLUSH; end end + // track the load data for later usage + assign ldbuf_w = req_port_o.data_req & req_port_i.data_gnt; + // --------------- // Retire Load // --------------- + assign ldbuf_rindex = (NR_LOAD_BUFFER_ENTRIES > 1) ? ldbuf_id_t'(req_port_i.data_rid) : 1'b0, + ldbuf_rdata = ldbuf_q[ldbuf_rindex]; + // decoupled rvalid process always_comb begin : rvalid_output + // read the pending load buffer + ldbuf_r = req_port_i.data_rvalid; + trans_id_o = ldbuf_q[ldbuf_rindex].trans_id; valid_o = 1'b0; ex_o.valid = 1'b0; - // output the queue data directly, the valid signal is set corresponding to the process above - trans_id_o = load_data_q.trans_id; - // we got an rvalid and are currently not flushing and not aborting the request - if (req_port_i.data_rvalid && state_q != WAIT_FLUSH) begin - // we killed the request - if(!req_port_o.kill_req) + + // we got an rvalid and its corresponding request was not flushed + if (req_port_i.data_rvalid && !ldbuf_flushed_q[ldbuf_rindex]) begin + // if the response corresponds to the last request, check that we are not killing it + if((ldbuf_last_id_q != ldbuf_rindex) || !req_port_o.kill_req) valid_o = 1'b1; // the output is also valid if we got an exception. An exception arrives one cycle after // dtlb_hit_i is asserted, i.e. when we are in SEND_TAG. 
Otherwise, the exception @@ -288,18 +385,15 @@ module load_unit import ariane_pkg::*; #( ex_o.valid = 1'b1; end end - // an exception occurred during translation (we need to check for the valid flag because we could also get an - // exception from the store unit) + + // an exception occurred during translation // exceptions can retire out-of-order -> but we need to give priority to non-excepting load and stores // so we simply check if we got an rvalid if so we prioritize it by not retiring the exception - we simply go for another // round in the load FSM - if (valid_i && ex_i.valid && !req_port_i.data_rvalid) begin - valid_o = 1'b1; - ex_o.valid = 1'b1; + if ((state_q == WAIT_TRANSLATION) && !req_port_i.data_rvalid && ex_i.valid && valid_i) begin trans_id_o = lsu_ctrl_i.trans_id; - // if we are waiting for the translation to finish do not give a valid signal yet - end else if (state_q == WAIT_TRANSLATION) begin - valid_o = 1'b0; + valid_o = 1'b1; + ex_o.valid = 1'b1; end end @@ -307,11 +401,9 @@ module load_unit import ariane_pkg::*; #( // latch physical address for the tag cycle (one cycle after applying the index) always_ff @(posedge clk_i or negedge rst_ni) begin if (~rst_ni) begin - state_q <= IDLE; - load_data_q <= '0; + state_q <= IDLE; end else begin - state_q <= state_d; - load_data_q <= load_data_d; + state_q <= state_d; end end @@ -321,12 +413,12 @@ module load_unit import ariane_pkg::*; #( riscv::xlen_t shifted_data; // realign as needed - assign shifted_data = req_port_i.data_rdata >> {load_data_q.address_offset, 3'b000}; + assign shifted_data = req_port_i.data_rdata >> {ldbuf_rdata.address_offset, 3'b000}; /* // result mux (leaner code, but more logic stages. 
// can be used instead of the code below (in between //result mux fast) if timing is not so critical) always_comb begin - unique case (load_data_q.operation) + unique case (ldbuf_rdata.operation) LWU: result_o = shifted_data[31:0]; LHU: result_o = shifted_data[15:0]; LBU: result_o = shifted_data[7:0]; @@ -338,64 +430,57 @@ module load_unit import ariane_pkg::*; #( end */ // result mux fast - logic [(riscv::XLEN/8)-1:0] sign_bits; - logic [riscv::XLEN_ALIGN_BYTES-1:0] idx_d, idx_q; - logic sign_bit, signed_d, signed_q, fp_sign_d, fp_sign_q; + logic [(riscv::XLEN/8)-1:0] rdata_sign_bits; + logic [riscv::XLEN_ALIGN_BYTES-1:0] rdata_offset; + logic rdata_sign_bit, rdata_is_signed, rdata_is_fp_signed; // prepare these signals for faster selection in the next cycle - assign signed_d = load_data_d.operation inside {ariane_pkg::LW, ariane_pkg::LH, ariane_pkg::LB}; - assign fp_sign_d = load_data_d.operation inside {ariane_pkg::FLW, ariane_pkg::FLH, ariane_pkg::FLB}; - - assign idx_d = ((load_data_d.operation inside {ariane_pkg::LW, ariane_pkg::FLW}) & riscv::IS_XLEN64) ? load_data_d.address_offset + 3 : - (load_data_d.operation inside {ariane_pkg::LH, ariane_pkg::FLH}) ? load_data_d.address_offset + 1 : - load_data_d.address_offset; - + assign rdata_is_signed = ldbuf_rdata.operation inside {ariane_pkg::LW, ariane_pkg::LH, ariane_pkg::LB}; + assign rdata_is_fp_signed = ldbuf_rdata.operation inside {ariane_pkg::FLW, ariane_pkg::FLH, ariane_pkg::FLB}; + assign rdata_offset = ((ldbuf_rdata.operation inside {ariane_pkg::LW, ariane_pkg::FLW}) & riscv::IS_XLEN64) ? ldbuf_rdata.address_offset + 3 : + ( ldbuf_rdata.operation inside {ariane_pkg::LH, ariane_pkg::FLH}) ? 
ldbuf_rdata.address_offset + 1 : + ldbuf_rdata.address_offset; for (genvar i = 0; i < (riscv::XLEN/8); i++) begin : gen_sign_bits - assign sign_bits[i] = req_port_i.data_rdata[(i+1)*8-1]; + assign rdata_sign_bits[i] = req_port_i.data_rdata[(i+1)*8-1]; end // select correct sign bit in parallel to result shifter above // pull to 0 if unsigned - assign sign_bit = signed_q & sign_bits[idx_q] | fp_sign_q; + assign rdata_sign_bit = rdata_is_signed & rdata_sign_bits[rdata_offset] | rdata_is_fp_signed; // result mux always_comb begin - unique case (load_data_q.operation) - ariane_pkg::LW, ariane_pkg::LWU, ariane_pkg::FLW: result_o = {{riscv::XLEN-32{sign_bit}}, shifted_data[31:0]}; - ariane_pkg::LH, ariane_pkg::LHU, ariane_pkg::FLH: result_o = {{riscv::XLEN-32+16{sign_bit}}, shifted_data[15:0]}; - ariane_pkg::LB, ariane_pkg::LBU, ariane_pkg::FLB: result_o = {{riscv::XLEN-32+24{sign_bit}}, shifted_data[7:0]}; - default: result_o = shifted_data[riscv::XLEN-1:0]; + unique case (ldbuf_rdata.operation) + ariane_pkg::LW, ariane_pkg::LWU, ariane_pkg::FLW: result_o = {{riscv::XLEN-32{rdata_sign_bit}}, shifted_data[31:0]}; + ariane_pkg::LH, ariane_pkg::LHU, ariane_pkg::FLH: result_o = {{riscv::XLEN-32+16{rdata_sign_bit}}, shifted_data[15:0]}; + ariane_pkg::LB, ariane_pkg::LBU, ariane_pkg::FLB: result_o = {{riscv::XLEN-32+24{rdata_sign_bit}}, shifted_data[7:0]}; + default: result_o = shifted_data[riscv::XLEN-1:0]; endcase end - - always_ff @(posedge clk_i or negedge rst_ni) begin : p_regs - if (~rst_ni) begin - idx_q <= 0; - signed_q <= 0; - fp_sign_q <= 0; - end else begin - idx_q <= idx_d; - signed_q <= signed_d; - fp_sign_q <= fp_sign_d; - end - end // end result mux fast /////////////////////////////////////////////////////// // assertions /////////////////////////////////////////////////////// - //pragma translate_off - // check invalid offsets +//pragma translate_off +`ifndef VERILATOR + initial assert (ariane_pkg::DCACHE_TID_WIDTH >= REQ_ID_BITS) else + 
$fatal("CVA6ConfigDcacheIdWidth parameter is not wide enough to encode pending loads"); + // check invalid offsets, but only issue a warning as these conditions actually trigger a load address misaligned exception addr_offset0: assert property (@(posedge clk_i) disable iff (~rst_ni) - valid_o |-> (load_data_q.operation inside {ariane_pkg::LW, ariane_pkg::LWU}) |-> load_data_q.address_offset < 5) else $fatal (1,"invalid address offset used with {LW, LWU}"); + ldbuf_w |-> (ldbuf_wdata.operation inside {ariane_pkg::LW, ariane_pkg::LWU}) |-> ldbuf_wdata.address_offset < 5) else + $fatal(1, "invalid address offset used with {LW, LWU}"); addr_offset1: assert property (@(posedge clk_i) disable iff (~rst_ni) - valid_o |-> (load_data_q.operation inside {ariane_pkg::LH, ariane_pkg::LHU}) |-> load_data_q.address_offset < 7) else $fatal (1,"invalid address offset used with {LH, LHU}"); + ldbuf_w |-> (ldbuf_wdata.operation inside {ariane_pkg::LH, ariane_pkg::LHU}) |-> ldbuf_wdata.address_offset < 7) else + $fatal(1, "invalid address offset used with {LH, LHU}"); addr_offset2: assert property (@(posedge clk_i) disable iff (~rst_ni) - valid_o |-> (load_data_q.operation inside {ariane_pkg::LB, ariane_pkg::LBU}) |-> load_data_q.address_offset < 8) else $fatal (1,"invalid address offset used with {LB, LBU}"); - //pragma translate_on + ldbuf_w |-> (ldbuf_wdata.operation inside {ariane_pkg::LB, ariane_pkg::LBU}) |-> ldbuf_wdata.address_offset < 8) else + $fatal(1, "invalid address offset used with {LB, LBU}"); +`endif +//pragma translate_on endmodule diff --git a/util/config_pkg_generator.py b/util/config_pkg_generator.py index 82fdc663e2a..d801e71622f 100644 --- a/util/config_pkg_generator.py +++ b/util/config_pkg_generator.py @@ -85,6 +85,8 @@ def setup_parser_config_generator(): help="Load latency") parser.add_argument("--NrStorePipeRegs", type=int, default=None, help="Store latency") + parser.add_argument("--NrLoadBufEntries", type=int, default=None, + help="Number of entries in 
the load buffer") parser.add_argument("--InstrTlbEntries", type=int, default=None, help="Number of instruction TLB entries") parser.add_argument("--DataTlbEntries", type=int, default=None, @@ -145,6 +147,7 @@ def setup_parser_config_generator(): "FPGAEn" : "CVA6ConfigFPGAEn", "NrLoadPipeRegs" : "CVA6ConfigNrLoadPipeRegs", "NrStorePipeRegs" : "CVA6ConfigNrStorePipeRegs", + "NrLoadBufEntries" : "CVA6ConfigNrLoadBufEntries", "InstrTlbEntries" : "CVA6ConfigInstrTlbEntries", "DataTlbEntries" : "CVA6ConfigDataTlbEntries", "RASDepth": "CVA6ConfigRASDepth",