diff --git a/Makefile b/Makefile index ba2ac03..588cf0f 100644 --- a/Makefile +++ b/Makefile @@ -233,7 +233,6 @@ VLOG_DEFS = -DCACHEPOOL # Cluster configuration VLOG_DEFS += -DNUM_TILES=$(num_tiles) -VLOG_DEFS += -DNumRemotePortTile=$(num_remote_ports_per_tile) VLOG_DEFS += -DNUM_CORES=$(num_cores) VLOG_DEFS += -DDATA_WIDTH=$(data_width) VLOG_DEFS += -DADDR_WIDTH=$(addr_width) @@ -259,6 +258,7 @@ VLOG_DEFS += -DSPATZ_NUM_FPU=$(spatz_num_fpu) VLOG_DEFS += -DSPATZ_NUM_IPU=$(spatz_num_ipu) VLOG_DEFS += -DSPATZ_MAX_TRANS=$(spatz_max_trans) VLOG_DEFS += -DSNITCH_MAX_TRANS=$(snitch_max_trans) +VLOG_DEFS += -DREMOTE_PORT_PER_CORE=$(num_remote_ports_per_tile) # AXI configuration VLOG_DEFS += -DAXI_USER_WIDTH=$(axi_user_width) diff --git a/config/cachepool_fpu_512.mk b/config/cachepool_fpu_512.mk index 46d471a..2e4c3ca 100644 --- a/config/cachepool_fpu_512.mk +++ b/config/cachepool_fpu_512.mk @@ -20,7 +20,7 @@ data_width ?= 32 # Core addrwidth addr_width ?= 32 -num_remote_ports_per_tile ?= 1 +num_remote_ports_per_tile ?= 2 ###################### diff --git a/hardware/cachepool_peripheral/cachepool_peripheral.sv b/hardware/cachepool_peripheral/cachepool_peripheral.sv index d54a0cf..6326cfa 100644 --- a/hardware/cachepool_peripheral/cachepool_peripheral.sv +++ b/hardware/cachepool_peripheral/cachepool_peripheral.sv @@ -30,7 +30,7 @@ module cachepool_peripheral input reg_req_t reg_req_i, output reg_rsp_t reg_rsp_o, - output logic eoc_o, + output logic [3:0] eoc_o, input addr_t tcdm_start_address_i, input addr_t tcdm_end_address_i, output addr_t private_start_addr_o, diff --git a/hardware/cachepool_peripheral/cachepool_peripheral_reg.hjson b/hardware/cachepool_peripheral/cachepool_peripheral_reg.hjson index c735561..79d7cda 100644 --- a/hardware/cachepool_peripheral/cachepool_peripheral_reg.hjson +++ b/hardware/cachepool_peripheral/cachepool_peripheral_reg.hjson @@ -139,7 +139,7 @@ hwaccess: "hro", resval: "0", fields: [{ - bits: "0:0", + bits: "3:0", name: "EOC_EXIT", desc: 
"Indicates the end of computation and exit status." }] diff --git a/hardware/cachepool_peripheral/cachepool_peripheral_reg_pkg.sv b/hardware/cachepool_peripheral/cachepool_peripheral_reg_pkg.sv index 0ebaace..df8c6e6 100644 --- a/hardware/cachepool_peripheral/cachepool_peripheral_reg_pkg.sv +++ b/hardware/cachepool_peripheral/cachepool_peripheral_reg_pkg.sv @@ -51,7 +51,7 @@ package cachepool_peripheral_reg_pkg; } cachepool_peripheral_reg2hw_cluster_boot_control_reg_t; typedef struct packed { - logic q; + logic [3:0] q; } cachepool_peripheral_reg2hw_cluster_eoc_exit_reg_t; typedef struct packed { @@ -115,15 +115,15 @@ package cachepool_peripheral_reg_pkg; // Register -> HW type typedef struct packed { - cachepool_peripheral_reg2hw_hart_select_mreg_t [1:0] hart_select; // [272:253] - cachepool_peripheral_reg2hw_cl_clint_set_reg_t cl_clint_set; // [252:220] - cachepool_peripheral_reg2hw_cl_clint_clear_reg_t cl_clint_clear; // [219:187] - cachepool_peripheral_reg2hw_hw_barrier_reg_t hw_barrier; // [186:155] - cachepool_peripheral_reg2hw_icache_prefetch_enable_reg_t icache_prefetch_enable; // [154:154] - cachepool_peripheral_reg2hw_spatz_status_reg_t spatz_status; // [153:153] - cachepool_peripheral_reg2hw_spatz_cycle_reg_t spatz_cycle; // [152:121] - cachepool_peripheral_reg2hw_cluster_boot_control_reg_t cluster_boot_control; // [120:89] - cachepool_peripheral_reg2hw_cluster_eoc_exit_reg_t cluster_eoc_exit; // [88:88] + cachepool_peripheral_reg2hw_hart_select_mreg_t [1:0] hart_select; // [275:256] + cachepool_peripheral_reg2hw_cl_clint_set_reg_t cl_clint_set; // [255:223] + cachepool_peripheral_reg2hw_cl_clint_clear_reg_t cl_clint_clear; // [222:190] + cachepool_peripheral_reg2hw_hw_barrier_reg_t hw_barrier; // [189:158] + cachepool_peripheral_reg2hw_icache_prefetch_enable_reg_t icache_prefetch_enable; // [157:157] + cachepool_peripheral_reg2hw_spatz_status_reg_t spatz_status; // [156:156] + cachepool_peripheral_reg2hw_spatz_cycle_reg_t spatz_cycle; // [155:124] + 
cachepool_peripheral_reg2hw_cluster_boot_control_reg_t cluster_boot_control; // [123:92] + cachepool_peripheral_reg2hw_cluster_eoc_exit_reg_t cluster_eoc_exit; // [91:88] cachepool_peripheral_reg2hw_cfg_l1d_spm_reg_t cfg_l1d_spm; // [87:78] cachepool_peripheral_reg2hw_cfg_l1d_insn_reg_t cfg_l1d_insn; // [77:76] cachepool_peripheral_reg2hw_cfg_l1d_tile_sel_reg_t cfg_l1d_tile_sel; // [75:44] diff --git a/hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv b/hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv index 7d3755e..c6ece73 100644 --- a/hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv +++ b/hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv @@ -90,8 +90,8 @@ module cachepool_peripheral_reg_top #( logic [31:0] cluster_boot_control_qs; logic [31:0] cluster_boot_control_wd; logic cluster_boot_control_we; - logic cluster_eoc_exit_qs; - logic cluster_eoc_exit_wd; + logic [3:0] cluster_eoc_exit_qs; + logic [3:0] cluster_eoc_exit_wd; logic cluster_eoc_exit_we; logic [9:0] cfg_l1d_spm_qs; logic [9:0] cfg_l1d_spm_wd; @@ -337,9 +337,9 @@ module cachepool_peripheral_reg_top #( // R[cluster_eoc_exit]: V(False) prim_subreg #( - .DW (1), + .DW (4), .SWACCESS("RW"), - .RESVAL (1'h0) + .RESVAL (4'h0) ) u_cluster_eoc_exit ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -701,7 +701,7 @@ module cachepool_peripheral_reg_top #( assign cluster_boot_control_wd = reg_wdata[31:0]; assign cluster_eoc_exit_we = addr_hit[9] & reg_we & !reg_error; - assign cluster_eoc_exit_wd = reg_wdata[0]; + assign cluster_eoc_exit_wd = reg_wdata[3:0]; assign cfg_l1d_spm_we = addr_hit[10] & reg_we & !reg_error; assign cfg_l1d_spm_wd = reg_wdata[9:0]; @@ -773,7 +773,7 @@ module cachepool_peripheral_reg_top #( end addr_hit[9]: begin - reg_rdata_next[0] = cluster_eoc_exit_qs; + reg_rdata_next[3:0] = cluster_eoc_exit_qs; end addr_hit[10]: begin diff --git a/hardware/src/cachepool_cluster.sv b/hardware/src/cachepool_cluster.sv index ab2683b..df687a3 100644 --- 
a/hardware/src/cachepool_cluster.sv +++ b/hardware/src/cachepool_cluster.sv @@ -117,7 +117,7 @@ module cachepool_cluster /// corresponding core into debug mode. This signal is assumed to be _async_. input logic [NrCores-1:0] debug_req_i, /// End of Computing indicator to notify the host/tb - output logic eoc_o, + output logic [3:0] eoc_o, /// Machine external interrupt pending. Usually those interrupts come from a /// platform-level interrupt controller. This signal is assumed to be _async_. input logic [NrCores-1:0] meip_i, diff --git a/hardware/src/cachepool_group.sv b/hardware/src/cachepool_group.sv index 25fd93c..772626f 100644 --- a/hardware/src/cachepool_group.sv +++ b/hardware/src/cachepool_group.sv @@ -219,46 +219,63 @@ module cachepool_group // Tile remote access signals // In/Out relative to the tile (out--leave a tile; in--enter a tile) - tcdm_req_t [NumTiles-1:0][NrTCDMPortsPerCore-1:0] tile_remote_out_req; - tcdm_rsp_t [NumTiles-1:0][NrTCDMPortsPerCore-1:0] tile_remote_out_rsp; - logic [NumTiles-1:0][NrTCDMPortsPerCore-1:0] tile_remote_in_ready, tile_remote_out_ready; - tcdm_req_chan_t [NrTCDMPortsPerCore-1:0][NumTiles-1:0] tile_remote_out_req_chan; - logic [NrTCDMPortsPerCore-1:0][NumTiles-1:0] tile_remote_out_req_valid, tile_remote_out_req_ready; - tcdm_rsp_chan_t [NrTCDMPortsPerCore-1:0][NumTiles-1:0] tile_remote_out_rsp_chan; - logic [NrTCDMPortsPerCore-1:0][NumTiles-1:0] tile_remote_out_rsp_valid, tile_remote_out_rsp_ready; - - tcdm_req_t [NumTiles-1:0][NrTCDMPortsPerCore-1:0] tile_remote_in_req; - tcdm_rsp_t [NumTiles-1:0][NrTCDMPortsPerCore-1:0] tile_remote_in_rsp; - tcdm_req_chan_t [NrTCDMPortsPerCore-1:0][NumTiles-1:0] tile_remote_in_req_chan; - logic [NrTCDMPortsPerCore-1:0][NumTiles-1:0] tile_remote_in_req_valid, tile_remote_in_req_ready; - tcdm_rsp_chan_t [NrTCDMPortsPerCore-1:0][NumTiles-1:0] tile_remote_in_rsp_chan; - logic [NrTCDMPortsPerCore-1:0][NumTiles-1:0] tile_remote_in_rsp_valid, tile_remote_in_rsp_ready; - - // Symmetric xbar, 
in/out select types are the same - remote_tile_sel_t [NumTiles-1:0][NrTCDMPortsPerCore-1:0] remote_out_sel_tile, remote_in_sel_tile; - remote_tile_sel_t [NrTCDMPortsPerCore-1:0][NumTiles-1:0] remote_out_sel_xbar, remote_in_sel_xbar; + // Tile-side flat layout: index = j + r*NrTCDMPortsPerCore (j=xbar idx, r=remote slot within xbar) + tcdm_req_t [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_out_req; + tcdm_rsp_t [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_out_rsp; + logic [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_in_ready, tile_remote_out_ready; + + tcdm_req_t [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_in_req; + tcdm_rsp_t [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_in_rsp; + + // Xbar-side: NrTCDMPortsPerCore xbars, each with NumTiles*NumRemotePortCore ports + // Xbar port index = t*NumRemotePortCore + r + tcdm_req_chan_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_out_req_chan; + logic [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_out_req_valid, tile_remote_out_req_ready; + tcdm_rsp_chan_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_out_rsp_chan; + logic [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_out_rsp_valid, tile_remote_out_rsp_ready; + + tcdm_req_chan_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_in_req_chan; + logic [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_in_req_valid, tile_remote_in_req_ready; + tcdm_rsp_chan_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_in_rsp_chan; + logic [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_in_rsp_valid, tile_remote_in_rsp_ready; + + // Tile-side selection: narrow type, only carries tile_id + remote_tile_sel_t [NumTiles-1:0][NumRemotePortTile-1:0] remote_out_sel_tile; + // Xbar-side selection: wider type, encodes tile_id*NumRemotePortCore + core_id%NumRemotePortCore + remote_xbar_sel_t 
[NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] remote_out_sel_xbar, remote_in_sel_xbar; for (genvar t = 0; t < NumTiles; t++) begin - for (genvar p = 0; p < NrTCDMPortsPerCore; p++) begin - assign tile_remote_out_req_chan [p][t] = tile_remote_out_req[t][p].q; - assign tile_remote_out_req_valid[p][t] = tile_remote_out_req[t][p].q_valid; - assign tile_remote_out_rsp_ready[p][t] = tile_remote_in_ready[t][p]; - - assign tile_remote_out_rsp[t][p].p = tile_remote_out_rsp_chan [p][t]; - assign tile_remote_out_rsp[t][p].p_valid = tile_remote_out_rsp_valid[p][t]; - assign tile_remote_out_rsp[t][p].q_ready = tile_remote_out_req_ready[p][t]; - - assign tile_remote_in_req[t][p].q = tile_remote_in_req_chan [p][t]; - assign tile_remote_in_req[t][p].q_valid = tile_remote_in_req_valid[p][t]; - assign tile_remote_out_ready[t][p] = tile_remote_in_rsp_ready[p][t]; - - assign tile_remote_in_rsp_chan [p][t] = tile_remote_in_rsp[t][p].p; - assign tile_remote_in_rsp_valid[p][t] = tile_remote_in_rsp[t][p].p_valid; - assign tile_remote_in_req_ready[p][t] = tile_remote_in_rsp[t][p].q_ready; - - // Selection signals - assign remote_out_sel_xbar[p][t] = remote_out_sel_tile[t][p]; - assign remote_in_sel_xbar [p][t] = tile_remote_in_rsp_chan[p][t].user.tile_id; + for (genvar j = 0; j < NrTCDMPortsPerCore; j++) begin + for (genvar r = 0; r < NumRemotePortCore; r++) begin + // tile flat index: j + r*NrTCDMPortsPerCore + // xbar port index: t*NumRemotePortCore + r + assign tile_remote_out_req_chan [j][t*NumRemotePortCore+r] = tile_remote_out_req[t][j+r*NrTCDMPortsPerCore].q; + assign tile_remote_out_req_valid[j][t*NumRemotePortCore+r] = tile_remote_out_req[t][j+r*NrTCDMPortsPerCore].q_valid; + assign tile_remote_out_rsp_ready[j][t*NumRemotePortCore+r] = tile_remote_in_ready[t][j+r*NrTCDMPortsPerCore]; + + assign tile_remote_out_rsp[t][j+r*NrTCDMPortsPerCore].p = tile_remote_out_rsp_chan [j][t*NumRemotePortCore+r]; + assign tile_remote_out_rsp[t][j+r*NrTCDMPortsPerCore].p_valid = 
tile_remote_out_rsp_valid[j][t*NumRemotePortCore+r]; + assign tile_remote_out_rsp[t][j+r*NrTCDMPortsPerCore].q_ready = tile_remote_out_req_ready[j][t*NumRemotePortCore+r]; + + assign tile_remote_in_req[t][j+r*NrTCDMPortsPerCore].q = tile_remote_in_req_chan [j][t*NumRemotePortCore+r]; + assign tile_remote_in_req[t][j+r*NrTCDMPortsPerCore].q_valid = tile_remote_in_req_valid[j][t*NumRemotePortCore+r]; + assign tile_remote_out_ready[t][j+r*NrTCDMPortsPerCore] = tile_remote_in_rsp_ready[j][t*NumRemotePortCore+r]; + + assign tile_remote_in_rsp_chan [j][t*NumRemotePortCore+r] = tile_remote_in_rsp[t][j+r*NrTCDMPortsPerCore].p; + assign tile_remote_in_rsp_valid[j][t*NumRemotePortCore+r] = tile_remote_in_rsp[t][j+r*NrTCDMPortsPerCore].p_valid; + assign tile_remote_in_req_ready[j][t*NumRemotePortCore+r] = tile_remote_in_rsp[t][j+r*NrTCDMPortsPerCore].q_ready; + + // Request selection: convert narrow tile_id to wide xbar index by appending + // core_id % NumRemotePortCore (available in the request channel user field) + assign remote_out_sel_xbar[j][t*NumRemotePortCore+r] = remote_xbar_sel_t'( + remote_out_sel_tile[t][j+r*NrTCDMPortsPerCore] * NumRemotePortCore + + tile_remote_out_req_chan[j][t*NumRemotePortCore+r].user.core_id % NumRemotePortCore); + + // Response selection: recover xbar port from tile_id and core_id in response user field + assign remote_in_sel_xbar[j][t*NumRemotePortCore+r] = remote_xbar_sel_t'( + tile_remote_in_rsp_chan[j][t*NumRemotePortCore+r].user.tile_id * NumRemotePortCore + + tile_remote_in_rsp_chan[j][t*NumRemotePortCore+r].user.core_id % NumRemotePortCore); + end end end @@ -361,8 +378,8 @@ module cachepool_group // Decide which tile to go reqrsp_xbar #( - .NumInp (NumTiles ), - .NumOut (NumTiles ), + .NumInp (NumTiles * NumRemotePortCore ), + .NumOut (NumTiles * NumRemotePortCore ), .PipeReg (1'b1 ), .RspReg (1'b1 ), .ExtReqPrio (1'b0 ), @@ -392,5 +409,4 @@ module cachepool_group ); end - endmodule diff --git a/hardware/src/cachepool_pkg.sv 
b/hardware/src/cachepool_pkg.sv index 3546b02..737bc70 100644 --- a/hardware/src/cachepool_pkg.sv +++ b/hardware/src/cachepool_pkg.sv @@ -54,15 +54,18 @@ package cachepool_pkg; // How many cores for each tile? localparam int unsigned NumCoresTile = NumCores / NumTiles; - // How many remote ports for each tile? Currently needs to be 0 or 1. - // localparam int unsigned NumRemotePortTile = `ifdef NumRemotePortTile `NumRemotePortTile `else 0 `endif; - localparam int unsigned NumRemotePortTile = 1; + // How many remote ports does each tile expose per core TCDM port? Must be >= 1 (used as array size and modulo divisor). + localparam int unsigned NumRemotePortCore = `ifdef REMOTE_PORT_PER_CORE `REMOTE_PORT_PER_CORE `else 1 `endif; // How many cores within a tile? This is used to select the ports within a tile. localparam int unsigned LogNumCoresTile = $clog2(NumCoresTile); + // 4 ports from Spatz + 1 shared port from Snitch/FPU localparam int unsigned NrTCDMPortsPerCore = 5; + // How many remote ports for each tile in total? + localparam int unsigned NumRemotePortTile = NumRemotePortCore * NrTCDMPortsPerCore; + //////////////////// // CLUSTER HW // //////////////////// @@ -215,6 +218,9 @@ package cachepool_pkg; // Wide Data ports localparam int unsigned GroupWideDataPorts = NumL1CtrlTile; + // Selection width for the group-level remote xbar, clamped to at least 1 bit so remote_xbar_sel_t stays a legal range + localparam int unsigned RemoteXbarSelWidth = (NumTiles * NumRemotePortCore > 1) ? $clog2(NumTiles * NumRemotePortCore) : 1; + /***** Cluster Ports *****/ // Narrow AXI ports: 1 In from SoC, 1 Out to UART localparam int unsigned ClusterNarrowInAxiPorts = 1; @@ -352,6 +358,13 @@ package cachepool_pkg; burst_req_t burst; } refill_user_t; + /////////////////// + // GROUP TYPES // + /////////////////// + + typedef logic [RemoteXbarSelWidth-1:0] remote_xbar_sel_t; + + ///////////////////// // CLUSTER TYPES // ///////////////////// diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv index 38aa7e8..e01c0ac 100644 --- a/hardware/src/cachepool_tile.sv +++ b/hardware/src/cachepool_tile.sv @@ -147,20 +147,20 @@ 
module cachepool_tile output axi_narrow_req_t [1:0] axi_out_req_o, input axi_narrow_resp_t [1:0] axi_out_resp_i, /// Cache Refill ports - output cache_trans_req_t [NumL1CtrlTile-1:0] cache_refill_req_o, - input cache_trans_rsp_t [NumL1CtrlTile-1:0] cache_refill_rsp_i, + output cache_trans_req_t [NumL1CtrlTile-1:0] cache_refill_req_o, + input cache_trans_rsp_t [NumL1CtrlTile-1:0] cache_refill_rsp_i, /// Wide AXI ports to cluster level output axi_out_req_t [TileNarrowAxiPorts-1:0] axi_wide_req_o, input axi_out_resp_t [TileNarrowAxiPorts-1:0] axi_wide_rsp_i, /// Remote Tile access ports (to remote tiles) - output tcdm_req_t [NrTCDMPortsPerCore*NumRemotePortTile-1:0] remote_req_o, - output remote_tile_sel_t [NrTCDMPortsPerCore*NumRemotePortTile-1:0] remote_req_dst_o, - input tcdm_rsp_t [NrTCDMPortsPerCore*NumRemotePortTile-1:0] remote_rsp_i, - input logic [NrTCDMPortsPerCore*NumRemotePortTile-1:0] remote_rsp_ready_i, + output tcdm_req_t [NumRemotePortTile-1:0] remote_req_o, + output remote_tile_sel_t [NumRemotePortTile-1:0] remote_req_dst_o, + input tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_i, + input logic [NumRemotePortTile-1:0] remote_rsp_ready_i, /// Remote Tile access ports (from remote tiles) - input tcdm_req_t [NrTCDMPortsPerCore*NumRemotePortTile-1:0] remote_req_i, - output tcdm_rsp_t [NrTCDMPortsPerCore*NumRemotePortTile-1:0] remote_rsp_o, - output logic [NrTCDMPortsPerCore*NumRemotePortTile-1:0] remote_rsp_ready_o, + input tcdm_req_t [NumRemotePortTile-1:0] remote_req_i, + output tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_o, + output logic [NumRemotePortTile-1:0] remote_rsp_ready_o, /// Peripheral signals output icache_events_t [NrCores-1:0] icache_events_o, input logic icache_prefetch_enable_i, @@ -547,7 +547,9 @@ module cachepool_tile // Set through CSR logic [$clog2(TCDMAddrWidth)-1:0] dynamic_offset; assign dynamic_offset = dynamic_offset_i; - logic [NrTCDMPortsPerCore-1:0] remote_out_pready, remote_in_pready; + // One entry per flat remote port: 
flat index = j + r*NrTCDMPortsPerCore + // where j is the xbar index and r is the remote slot within that xbar. + logic [NumRemotePortTile-1:0] remote_out_pready, remote_in_pready; // Flush protection for remote ports. // @@ -560,38 +562,63 @@ module cachepool_tile // - remote_in_pready gated : stops response-ready from propagating back, // preventing in-flight completions during the flush window - tcdm_req_t [NrTCDMPortsPerCore*NumRemotePortTile-1:0] remote_req_gated; + tcdm_req_t [NumRemotePortTile-1:0] remote_req_gated; // Intermediate response signals from the xbar before q_ready gating. - tcdm_rsp_t [NrTCDMPortsPerCore*NumRemotePortTile-1:0] remote_rsp_xbar; + tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_xbar; always_comb begin : remote_flush_protection for (int j = 0; j < NrTCDMPortsPerCore; j++) begin - // Gate q_valid: prevent new requests entering the xbar. - remote_req_gated[j].q = remote_req_i[j].q; - remote_req_gated[j].q_valid = remote_req_i[j].q_valid && !l1d_busy_i; - - // Pass the full xbar response through, then gate only q_ready so the - // remote tile cannot complete a handshake during a flush. - remote_rsp_o[j] = remote_rsp_xbar[j]; - remote_rsp_o[j].q_ready = remote_rsp_xbar[j].q_ready && !l1d_busy_i; - - // Gate response-ready back to us: prevent draining completions - // of requests that arrived just before the flush. - remote_in_pready[j] = remote_rsp_ready_i[j] && !l1d_busy_i; + for (int r = 0; r < NumRemotePortCore; r++) begin + automatic int unsigned flat = j + r * NrTCDMPortsPerCore; + + // Gate q_valid: prevent new requests entering the xbar. + remote_req_gated[flat].q = remote_req_i[flat].q; + remote_req_gated[flat].q_valid = remote_req_i[flat].q_valid && !l1d_busy_i; + + // Pass the full xbar response through, then gate only q_ready so the + // remote tile cannot complete a handshake during a flush. 
+ remote_rsp_o[flat] = remote_rsp_xbar[flat]; + remote_rsp_o[flat].q_ready = remote_rsp_xbar[flat].q_ready && !l1d_busy_i; + + // Gate response-ready back to us: prevent draining completions + // of requests that arrived just before the flush. + remote_in_pready[flat] = remote_rsp_ready_i[flat] && !l1d_busy_i; + end end end - // todo: multiple remote ports assign remote_rsp_ready_o = remote_out_pready; - /// Wire requests after strb handling to the cache controller + /// Wire requests after strb handling to the cache controller. + /// Each xbar j handles NumRemotePortCore remote slots at flat indices + /// j + r*NrTCDMPortsPerCore for r in [0, NumRemotePortCore). for (genvar j = 0; j < NrTCDMPortsPerCore; j++) begin : gen_cache_xbar + // Collect the NumRemotePortCore remote slots for this xbar. + tcdm_req_t [NumRemotePortCore-1:0] xbar_remote_req_gated; + tcdm_rsp_t [NumRemotePortCore-1:0] xbar_remote_rsp_xbar; + logic [NumRemotePortCore-1:0] xbar_remote_in_pready; + logic [NumRemotePortCore-1:0] xbar_remote_out_pready; + tcdm_rsp_t [NumRemotePortCore-1:0] xbar_remote_rsp_i; + remote_tile_sel_t [NumRemotePortCore-1:0] xbar_remote_req_dst; + tcdm_req_t [NumRemotePortCore-1:0] xbar_remote_req_o; + + for (genvar r = 0; r < NumRemotePortCore; r++) begin : gen_remote_slice + localparam int unsigned flat = j + r * NrTCDMPortsPerCore; + assign xbar_remote_req_gated [r] = remote_req_gated [flat]; + assign xbar_remote_in_pready [r] = remote_in_pready [flat]; + assign xbar_remote_rsp_i [r] = remote_rsp_i [flat]; + assign remote_rsp_xbar [flat] = xbar_remote_rsp_xbar [r]; + assign remote_out_pready [flat] = xbar_remote_out_pready[r]; + assign remote_req_dst_o [flat] = xbar_remote_req_dst [r]; + assign remote_req_o [flat] = xbar_remote_req_o [r]; + end + tcdm_cache_interco #( .NumTiles (NumTiles ), .NumCores (NrCores ), .NumCache (NumL1CtrlTile ), .NumTotCache (NumL1CacheCtrl ), - .NumRemotePort (NumRemotePortTile ), + .NumRemotePort (NumRemotePortCore ), .AddrWidth 
(TCDMAddrWidth ), .TileIDWidth (TileIDWidth ), .tcdm_req_t (tcdm_req_t ), @@ -599,19 +626,19 @@ module cachepool_tile .tcdm_req_chan_t (tcdm_req_chan_t ), .tcdm_rsp_chan_t (tcdm_rsp_chan_t ) ) i_cache_xbar ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .tile_id_i ( tile_id_i ), - .dynamic_offset_i ( dynamic_offset ), - .private_start_addr_i ( private_start_addr_i ), - .num_private_cache_i ( num_private_cache ), - .core_req_i ({remote_req_gated [j], cache_req [j]} ), - .core_rsp_ready_i ({remote_in_pready [j], cache_pready [j]} ), - .core_rsp_o ({remote_rsp_xbar [j], cache_rsp [j]} ), - .tile_sel_o ( remote_req_dst_o [j] ), - .mem_req_o ({remote_req_o [j], cache_xbar_req [j]} ), - .mem_rsp_ready_o ({remote_out_pready[j], cache_xbar_pready[j]} ), - .mem_rsp_i ({remote_rsp_i [j], cache_xbar_rsp [j]} ) + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .tile_id_i ( tile_id_i ), + .dynamic_offset_i ( dynamic_offset ), + .private_start_addr_i ( private_start_addr_i ), + .num_private_cache_i ( num_private_cache ), + .core_req_i ({xbar_remote_req_gated, cache_req [j]} ), + .core_rsp_ready_i ({xbar_remote_in_pready, cache_pready [j]} ), + .core_rsp_o ({xbar_remote_rsp_xbar, cache_rsp [j]} ), + .tile_sel_o ( xbar_remote_req_dst ), + .mem_req_o ({xbar_remote_req_o, cache_xbar_req [j]} ), + .mem_rsp_ready_o ({xbar_remote_out_pready, cache_xbar_pready[j]} ), + .mem_rsp_i ({xbar_remote_rsp_i, cache_xbar_rsp [j]} ) ); end diff --git a/hardware/src/tcdm_cache_interco.sv b/hardware/src/tcdm_cache_interco.sv index 397c2d5..b79a8b3 100644 --- a/hardware/src/tcdm_cache_interco.sv +++ b/hardware/src/tcdm_cache_interco.sv @@ -77,6 +77,7 @@ module tcdm_cache_interco #( /// Memory side ------------------------------------------------------- /// Which remote tile is targeted (one entry per remote output port). output tile_id_t [NumRemotePort-1:0] tile_sel_o, + // output logic remote_group_o, /// Requests to cache banks and remote output ports. 
output tcdm_req_t [NumCache+NumRemotePort-1:0] mem_req_o, /// Response ready out. @@ -239,7 +240,7 @@ module tcdm_cache_interco #( (core_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth] == tile_id_i); core_req_sel[port] = local_sel[port] ? core_sel_t'(addr_bank) - : core_sel_t'(NumCache); + : core_sel_t'(NumCache + (port % NumRemotePort)); end else begin // Mixed: fold addr_bank into the appropriate partition via modulo. @@ -255,7 +256,7 @@ module tcdm_cache_interco #( (core_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth] == tile_id_i); core_req_sel[port] = local_sel[port] ? core_sel_t'(num_private_cache_q + (addr_bank % num_shared_cache_q)) - : core_sel_t'(NumCache); + : core_sel_t'(NumCache + (port % NumRemotePort)); end end end @@ -270,7 +271,9 @@ module tcdm_cache_interco #( mem_rsp_sel[port] = mem_rsp[port].user.core_id; if (mem_rsp[port].user.tile_id != tile_id_i) begin // Response from a remote tile: forward to the remote interco port. - mem_rsp_sel[port] = mem_sel_t'(NumCores); + // Use core_id % NumRemotePort to select the correct remote-in channel, + // consistent with the request-side mapping (port % NumRemotePort). 
+ mem_rsp_sel[port] = mem_sel_t'(NumCores + (mem_rsp[port].user.core_id % NumRemotePort)); end end end diff --git a/hardware/tb/cachepool_cluster_wrapper.sv b/hardware/tb/cachepool_cluster_wrapper.sv index 2914e42..a9dba20 100644 --- a/hardware/tb/cachepool_cluster_wrapper.sv +++ b/hardware/tb/cachepool_cluster_wrapper.sv @@ -31,7 +31,7 @@ module cachepool_cluster_wrapper )( input logic clk_i, input logic rst_ni, - output logic eoc_o, + output logic [3:0] eoc_o, input logic debug_req_i, input logic meip_i, diff --git a/hardware/tb/tb_cachepool.sv b/hardware/tb/tb_cachepool.sv index 153a29d..7da3e78 100644 --- a/hardware/tb/tb_cachepool.sv +++ b/hardware/tb/tb_cachepool.sv @@ -42,7 +42,7 @@ module tb_cachepool; logic clk; logic rst_n; - logic eoc; + logic [3:0] eoc; // Toggling the clock always #(ClockPeriod/2) clk = !clk; @@ -217,8 +217,8 @@ module tb_cachepool; debug_req = '0; // Wait for end of computing signal - wait (eoc); - $display("[EOC] Simulation ended at %t (retval = WIP).", $time); + wait (eoc[0]); + $display("[EOC] Simulation ended at %t (retval = %0d).", $time, eoc[3:1]); $finish(0); end diff --git a/software/snRuntime/include/cachepool_peripheral.h b/software/snRuntime/include/cachepool_peripheral.h index ec3c021..aad79f5 100644 --- a/software/snRuntime/include/cachepool_peripheral.h +++ b/software/snRuntime/include/cachepool_peripheral.h @@ -70,7 +70,12 @@ extern "C" { // End of computation and exit status register #define CACHEPOOL_PERIPHERAL_CLUSTER_EOC_EXIT_REG_OFFSET 0x24 -#define CACHEPOOL_PERIPHERAL_CLUSTER_EOC_EXIT_EOC_EXIT_BIT 0 +#define CACHEPOOL_PERIPHERAL_CLUSTER_EOC_EXIT_EOC_EXIT_MASK 0xf +#define CACHEPOOL_PERIPHERAL_CLUSTER_EOC_EXIT_EOC_EXIT_OFFSET 0 +#define CACHEPOOL_PERIPHERAL_CLUSTER_EOC_EXIT_EOC_EXIT_FIELD \ + ((bitfield_field32_t){ \ + .mask = CACHEPOOL_PERIPHERAL_CLUSTER_EOC_EXIT_EOC_EXIT_MASK, \ + .index = CACHEPOOL_PERIPHERAL_CLUSTER_EOC_EXIT_EOC_EXIT_OFFSET}) // Controls the configurations of L1 DCache SPM size. 
#define CACHEPOOL_PERIPHERAL_CFG_L1D_SPM_REG_OFFSET 0x28 diff --git a/software/snRuntime/include/l1cache.h b/software/snRuntime/include/l1cache.h index 651c19f..ecde97b 100644 --- a/software/snRuntime/include/l1cache.h +++ b/software/snRuntime/include/l1cache.h @@ -20,5 +20,6 @@ void l1d_flush(); void l1d_wait(); void l1d_spm_config (uint32_t size); void l1d_part (uint32_t size); +void l1d_addr (uint32_t addr); -void set_eoc(); +void set_eoc (uint32_t eoc_value); diff --git a/software/snRuntime/src/l1cache.c b/software/snRuntime/src/l1cache.c index 7c69e27..79700d1 100644 --- a/software/snRuntime/src/l1cache.c +++ b/software/snRuntime/src/l1cache.c @@ -142,9 +142,10 @@ void l1d_addr (uint32_t addr) { l1d_commit(); } -void set_eoc () { +void set_eoc (uint32_t eoc_value) { volatile uint32_t *eoc_reg = - (uint32_t *)(_snrt_team_current->root->cluster_mem.end + - CACHEPOOL_PERIPHERAL_CLUSTER_EOC_EXIT_REG_OFFSET); - *eoc_reg = 1; -} + (uint32_t *)(_snrt_team_current->root->cluster_mem.end + + CACHEPOOL_PERIPHERAL_CLUSTER_EOC_EXIT_REG_OFFSET); + // Caller passes the encoded value: bit 0 = EOC flag, bits 3:1 = exit status; the register keeps bits [3:0] only + *eoc_reg = eoc_value; +} diff --git a/software/snRuntime/src/platforms/standalone/start_snitch.S b/software/snRuntime/src/platforms/standalone/start_snitch.S index 2d73bb3..1046ae2 100644 --- a/software/snRuntime/src/platforms/standalone/start_snitch.S +++ b/software/snRuntime/src/platforms/standalone/start_snitch.S @@ -29,7 +29,9 @@ _snrt_exit: ori t0, t0, 1 la t1, tohost sw t0, 0(t1) + mv s0, t0 call l1d_flush + mv a0, s0 call set_eoc 1: ret diff --git a/software/tests/byte-enable/main.c b/software/tests/byte-enable/main.c index e0e37b6..7266687 100644 --- a/software/tests/byte-enable/main.c +++ b/software/tests/byte-enable/main.c @@ -447,9 +447,5 @@ int main(void) { snrt_cluster_hw_barrier(); - if (core_id == 0) { - set_eoc(); - } - return 0; } diff --git a/software/tests/fmatmul-32b/main.c b/software/tests/fmatmul-32b/main.c index 
3a56ad5..abaf908 100644 --- a/software/tests/fmatmul-32b/main.c +++ b/software/tests/fmatmul-32b/main.c @@ -207,7 +207,6 @@ int main() { // Wait for all cores to finish snrt_cluster_hw_barrier(); - set_eoc(); return 0; } diff --git a/software/tests/mcs-lock/main.c b/software/tests/mcs-lock/main.c index 84ab030..b273d81 100644 --- a/software/tests/mcs-lock/main.c +++ b/software/tests/mcs-lock/main.c @@ -195,6 +195,5 @@ int main(void) { // Wait for all cores to finish snrt_cluster_hw_barrier(); // this can trigger Misaligned Load exception - set_eoc(); return 0; } diff --git a/software/tests/multi_producer_single_consumer_double_linked_list/main.c b/software/tests/multi_producer_single_consumer_double_linked_list/main.c index 6e3d7dc..31740e8 100644 --- a/software/tests/multi_producer_single_consumer_double_linked_list/main.c +++ b/software/tests/multi_producer_single_consumer_double_linked_list/main.c @@ -81,6 +81,5 @@ int main(void) { // Wait for all cores to finish snrt_cluster_hw_barrier(); // this can trigger Misaligned Load exception - set_eoc(); return 0; } diff --git a/software/tests/spin-lock/main.c b/software/tests/spin-lock/main.c index e4cfa6f..0c8059f 100644 --- a/software/tests/spin-lock/main.c +++ b/software/tests/spin-lock/main.c @@ -57,7 +57,6 @@ int main() { // Wait for core 0 to finish displaying results snrt_cluster_hw_barrier(); - set_eoc(); return 0; }