Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Bender.lock
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ packages:
- common_cells
- tech_cells_generic
spatz:
revision: 94ff5f6ca70e4dfef6168e0ac25b0bdd88e40132
revision: ed25c78dd72d839db8141287f9516d78ee399b93
version: null
source:
Git: https://github.com/pulp-platform/spatz.git
Expand Down
14 changes: 9 additions & 5 deletions hardware/src/cachepool_group.sv
Original file line number Diff line number Diff line change
Expand Up @@ -265,16 +265,20 @@ module cachepool_group
assign tile_remote_in_rsp_valid[j][t*NumRemotePortCore+r] = tile_remote_in_rsp[t][j+r*NrTCDMPortsPerCore].p_valid;
assign tile_remote_in_req_ready[j][t*NumRemotePortCore+r] = tile_remote_in_rsp[t][j+r*NrTCDMPortsPerCore].q_ready;

// Request selection: convert narrow tile_id to wide xbar index by appending
// core_id % NumRemotePortCore (available in the request channel user field)
// Request selection: route to target tile's remote-in slot based on
// target tile ID, so that all requests to the same destination tile
// travel through one pipeline — preserving write-before-read ordering.
assign remote_out_sel_xbar[j][t*NumRemotePortCore+r] = remote_xbar_sel_t'(
remote_out_sel_tile[t][j+r*NrTCDMPortsPerCore] * NumRemotePortCore
+ tile_remote_out_req_chan[j][t*NumRemotePortCore+r].user.core_id % NumRemotePortCore);
+ remote_out_sel_tile[t][j+r*NrTCDMPortsPerCore] % NumRemotePortCore);

// Response selection: recover xbar port from tile_id and core_id in response user field
// Response selection: route back to source tile's remote-out slot.
// The originator (tile_id in user field) sent on slot
// (target_tile % NumRemotePortCore). The responding tile is `t`
// (genvar), so target_tile = t.
assign remote_in_sel_xbar[j][t*NumRemotePortCore+r] = remote_xbar_sel_t'(
tile_remote_in_rsp_chan[j][t*NumRemotePortCore+r].user.tile_id * NumRemotePortCore
+ tile_remote_in_rsp_chan[j][t*NumRemotePortCore+r].user.core_id % NumRemotePortCore);
+ t % NumRemotePortCore);
end
end
end
Expand Down
25 changes: 15 additions & 10 deletions hardware/src/tcdm_cache_interco.sv
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ module tcdm_cache_interco #(

for (genvar port = 0; port < NumCores+NumRemotePort; port++) begin : gen_req_sel
logic [CacheBankBits-1:0] addr_bank;
logic [TileIDWidth-1:0] addr_tile;

always_comb begin
// Defaults.
Expand All @@ -226,6 +227,8 @@ module tcdm_cache_interco #(

// Extract the raw BankSel field from the address.
addr_bank = core_req[port].addr[dynamic_offset_i +: CacheBankBits];
// Extract the target TileID from the address (used for remote port selection).
addr_tile = core_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth];

if (num_private_cache_q == ($clog2(NumCache)+1)'(NumCache) || NumTiles == 1) begin
// All-private or single-tile: every request is local.
Expand All @@ -236,11 +239,13 @@ module tcdm_cache_interco #(
end else if (num_private_cache_q == '0) begin
// All-shared: check TileID to decide local vs. remote.
// Use the full BankSel field directly (no folding needed).
local_sel[port] =
(core_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth] == tile_id_i);
local_sel[port] = (addr_tile == tile_id_i);
// Route remote requests by target tile ID so that all accesses to the
// same tile share a single pipeline, preserving write-before-read
// ordering across barriers.
core_req_sel[port] = local_sel[port]
? core_sel_t'(addr_bank)
: core_sel_t'(NumCache + (port % NumRemotePort));
: core_sel_t'(NumCache + (addr_tile % NumRemotePort));

end else begin
// Mixed: fold addr_bank into the appropriate partition via modulo.
Expand All @@ -252,11 +257,10 @@ module tcdm_cache_interco #(
end else begin
// Shared request: check TileID to decide local vs. remote.
// bank = num_private_cache_q + (addr_bank % num_shared_cache_q).
local_sel[port] =
(core_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth] == tile_id_i);
local_sel[port] = (addr_tile == tile_id_i);
core_req_sel[port] = local_sel[port]
? core_sel_t'(num_private_cache_q + (addr_bank % num_shared_cache_q))
: core_sel_t'(NumCache + (port % NumRemotePort));
: core_sel_t'(NumCache + (addr_tile % NumRemotePort));
end
end
end
Expand All @@ -270,10 +274,11 @@ module tcdm_cache_interco #(
always_comb begin
mem_rsp_sel[port] = mem_rsp[port].user.core_id;
if (mem_rsp[port].user.tile_id != tile_id_i) begin
// Response from a remote tile: forward to the remote interco port.
// Use core_id % NumRemotePort to select the correct remote-in channel,
// consistent with the request-side mapping (port % NumRemotePort).
mem_rsp_sel[port] = mem_sel_t'(NumCores + (mem_rsp[port].user.core_id % NumRemotePort));
// Response destined for a remote tile: forward to the remote interco
// port that matches the incoming request path. The group-level xbar
// routes requests from source tile S to our remote-in slot
// (S % NumRemotePort), so responses must return via the same slot.
mem_rsp_sel[port] = mem_sel_t'(NumCores + (mem_rsp[port].user.tile_id % NumRemotePort));
end
end
end
Expand Down
3 changes: 3 additions & 0 deletions software/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ add_spatz_test_threeParam(fmatmul-32b fmatmul-32b/main.c 32 32 32)
add_spatz_test_threeParam(fmatmul-32b fmatmul-32b/main.c 64 64 64)
add_spatz_test_threeParam(fmatmul-32b fmatmul-32b/main.c 128 128 128)

add_spatz_test_twoParam(fft-32b fft-32b/main.c 256 4)
add_spatz_test_twoParam(fft-32b fft-32b/main.c 1024 16)

### Integer-Point
add_spatz_test_oneParam(idotp-32b idotp-32b/main.c 8192)
add_spatz_test_oneParam(idotp-32b idotp-32b/main.c 32768)
Expand Down
22 changes: 22 additions & 0 deletions software/tests/fft-32b/data/data_1024_16.h

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions software/tests/fft-32b/data/data_256_4.h

Large diffs are not rendered by default.

254 changes: 254 additions & 0 deletions software/tests/fft-32b/kernel/fft.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
// Copyright 2026 ETH Zurich and University of Bologna.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Author: Diyou Shen, Matteo Perotti, ETH Zurich

#include "fft.h"

/* Phase 1 of the multi-core vector FFT */
/* This function is invoked several times from main, with a barrier between calls. */

void fft_p1 (float *src, float *buf, const float *twi,
const uint32_t nfft, const uint32_t ntwi,
const uint32_t cid, const uint32_t num_cores,
const uint32_t stage, const uint32_t len) {

size_t avl = (size_t) len;
size_t vl;

const float *re_twi, *im_twi;
const float *re_u_i, *im_u_i, *re_l_i, *im_l_i;
float *re_u_o, *im_u_o, *re_l_o, *im_l_o;

const float *i_buf;
float *o_buf;
i_buf = src;
o_buf = buf;

// Is there a more efficient way to calculate these pointers?
for (uint32_t group = 0; group < (1 << stage); group ++) {
// divide cores into 2^(stage) groups
if (cid < ((num_cores >> stage)*(group+1))) {
// stage 0: 1 group, 0;
// stage 1: 2 group, 0 : nfft/2;
// stage 2: 4 group, 0 : nfft/4 : 2*nfft/4 : 3*nfft/4;
// ......

// offset for different groups
uint32_t offset = (nfft >> stage) * group;
uint32_t idx = cid - (num_cores >> stage) * group;

// inputs pointer
re_u_i = i_buf + offset + idx * len;
re_l_i = re_u_i + (nfft >> (stage + 1));
im_u_i = re_u_i + nfft;
im_l_i = re_l_i + nfft;

// output pointer
re_u_o = o_buf + offset + idx * len;
re_l_o = re_u_o + (nfft >> (stage + 1));
im_u_o = re_u_o + nfft;
im_l_o = re_l_o + nfft;

// twiddle pointer
// twiddle will not need add group offset
// main will jump to next twiddle
// each group will have the same twiddle
re_twi = twi + idx * len;
im_twi = re_twi + ntwi;

// Once the core gets the pointer, it needs to leave the loop
break;
}
}

// initial value of avl has been calculated earlier
for (; avl > 0; avl -= vl) {
// re_u_o = re_u_i + re_l_i;
// im_u_o = im_u_i + im_l_i;
// re_l_o = (re_u_i - re_l_i) * re_twi - (im_u_i - im_l_i) * im_twi;
// im_l_o = (re_u_i - re_l_i) * im_twi + (im_u_i - im_l_i) * re_twi;
asm volatile("vsetvli %0, %1, e32, m4, ta, ma" : "=r"(vl) : "r"(avl));

asm volatile("vle32.v v0, (%0);" ::"r"(re_u_i)); // v0: Re upper wing
re_u_i += vl;
asm volatile("vle32.v v4, (%0);" ::"r"(re_l_i)); // v4: Re lower wing
re_l_i += vl;
asm volatile("vfadd.vv v16, v0, v4"); // v16: Re butterfly output upper wing
asm volatile("vfsub.vv v0, v0, v4"); // v0: Re butterfly output upper wing

asm volatile("vle32.v v8, (%0);" ::"r"(im_u_i)); // v8: Im upper wing
im_u_i += vl;
asm volatile("vle32.v v12, (%0);" ::"r"(im_l_i)); // v12: Im lower wing
im_l_i += vl;

asm volatile("vfadd.vv v20, v8, v12"); // v20: Im butterfly output upper wing

asm volatile("vfsub.vv v4, v8, v12"); // v4: Im butterfly output upper wing

// Load the twiddle vector
asm volatile("vle32.v v8, (%0);" ::"r"(re_twi)); // v8: Re twi
re_twi += vl;
asm volatile("vle32.v v12, (%0);" ::"r"(im_twi)); // v12: Im twi
im_twi += vl;

// Twiddle the lower wing
// re_l_o = - v0 * v8 - v4 * v12
// im_l_o = v0 * v12 + v4 * v8
// Store 1:1 the output result
// Sequence do not need to shuffle in first phase
asm volatile("vfmul.vv v24, v0, v8");
asm volatile("vfnmsac.vv v24, v4, v12"); // v24: Re butterfly output
// twiddled lower wing
asm volatile("vse32.v v16, (%0)" ::"r"(re_u_o));
re_u_o += vl;
asm volatile("vse32.v v20, (%0)" ::"r"(im_u_o));
im_u_o += vl;
asm volatile("vfmul.vv v28, v0, v12");
asm volatile("vfmacc.vv v28, v4, v8"); // v28: Im butterfly output
// twiddled lower wing

asm volatile("vse32.v v24, (%0)" ::"r"(re_l_o));
re_l_o += vl;
asm volatile("vse32.v v28, (%0)" ::"r"(im_l_o));
im_l_o += vl;
}
}

/* Phase 2 of the multi-core vector FFT */
// DIF Cooley-Tukey algorithm.
// Every stage stores its results with indexed accesses (strided in the last stage).
void fft_p2(float *s, float *buf, const float *twi, float *out,
const uint16_t *seq_idx, const uint32_t nfft,
const uint32_t nfft_ori, const uint32_t log2_nfft,
const uint32_t stride, const uint32_t stride_e, const uint32_t ntwi) {

// Always run in dual-core mode

// Real part of the twiddles
const float *re_twi = twi;
// Img part of the twiddles
const float *im_twi = twi + ntwi;

// Keep half of the samples in a vector register
size_t avl;
size_t vl;

// Loop through the butterfly stages
for (uint32_t bf = 0; bf < log2_nfft; ++bf) {
// Keep half of the samples in a vector register
avl = nfft >> 1;
// Swap between the two buffers
const float *i_buf;
float *o_buf;
i_buf = !(bf & 1) ? buf : s;
o_buf = !(bf & 1) ? s : buf;

// Last iteration
if (bf == log2_nfft - 1)
o_buf = buf;

// Update pointers
const float *re_u_i = i_buf;
const float *im_u_i = i_buf + nfft_ori;
const float *re_l_i = re_u_i + (nfft >> 1);
const float *im_l_i = im_u_i + (nfft >> 1);
float *re_u_o = o_buf;
float *im_u_o = o_buf + nfft_ori;
float *re_l_o = re_u_o + (nfft >> 1);
float *im_l_o = im_u_o + (nfft >> 1);

float *re_u_s = out;
float *im_u_s = out + nfft_ori;
float *re_l_s = re_u_s + (nfft_ori >> 1);
float *im_l_s = im_u_s + (nfft_ori >> 1);

// Stripmine the whole vector for this butterfly stage
for (; avl > 0; avl -= vl) {
// re_u_o = re_u_i + re_l_i;
// im_u_o = im_u_i + im_l_i;
// re_l_o = (re_u_i - re_l_i) * re_twi - (im_u_i - im_l_i) * im_twi;
// im_l_o = (re_u_i - re_l_i) * im_twi + (im_u_i - im_l_i) * re_twi;
// Stripmine
// Group 4 registers as a larger register to improve performance
asm volatile("vsetvli %0, %1, e32, m4, ta, ma" : "=r"(vl) : "r"(avl));

// 2 load/store with 2 calc sometimes gives a better performance (window of 4 insn)
asm volatile("vle32.v v0, (%0);" ::"r"(re_u_i)); // v0: Re upper wing
re_u_i += vl;
asm volatile("vle32.v v4, (%0);" ::"r"(re_l_i)); // v4: Re lower wing
re_l_i += vl;

asm volatile("vfadd.vv v16, v0, v4"); // v16: Re butterfly output upper wing
asm volatile("vfsub.vv v0, v0, v4"); // v0: Re butterfly output upper wing

asm volatile("vle32.v v8, (%0);" ::"r"(im_u_i)); // v8: Im upper wing
im_u_i += vl;
asm volatile("vle32.v v12, (%0);" ::"r"(im_l_i)); // v12: Im lower wing
im_l_i += vl;

asm volatile("vfadd.vv v20, v8, v12"); // v20: Im butterfly output upper wing
asm volatile("vfsub.vv v4, v8, v12"); // v4: Im butterfly output upper wing

// Load the index vector. If last step, do strided store
// Otherwise, it's the helper index for the permutations (this is a mask
// vector)
if (bf == log2_nfft - 1) {
// Last store is a strided pattern
// use strided store instead of index store to improve performance
asm volatile("vsse32.v v16, (%0), %1" ::"r"(re_u_s),"r"(stride));
asm volatile("vsse32.v v20, (%0), %1" ::"r"(im_u_s),"r"(stride));
asm volatile("vsse32.v v0, (%0), %1" ::"r"(re_l_s),"r"(stride));
asm volatile("vsse32.v v4, (%0), %1" ::"r"(im_l_s),"r"(stride));
// update the store pointer
re_u_s += (vl << stride_e);
im_u_s += (vl << stride_e);
re_l_s += (vl << stride_e);
im_l_s += (vl << stride_e);
} else {
// TODO: Actually, there is no need to st then ld
// Can we add an insn to shuffle the elem position?
// Load the twiddle vector
asm volatile("vle32.v v8, (%0);" ::"r"(re_twi)); // v8: Re twi
re_twi += vl;
asm volatile("vle32.v v12, (%0);" ::"r"(im_twi)); // v12: Im twi
im_twi += vl;

// Twiddle the lower wing
// re_l_o = - v0 * v8 - v4 * v12
// im_l_o = v0 * v12 + v4 * v8
asm volatile("vfmul.vv v24, v0, v8");
asm volatile("vfnmsac.vv v24, v4, v12"); // v24: Re butterfly output
// twiddled lower wing
asm volatile("vfmul.vv v28, v0, v12");
asm volatile("vfmacc.vv v28, v4, v8"); // v28: Im butterfly output
// twiddled lower wing
// Load the sequential indices dirctly
asm volatile("vle16.v v12, (%0)" ::"r"(seq_idx)); // v24: index vector
seq_idx += vl;
re_u_o = o_buf;
im_u_o = o_buf + nfft_ori;
re_l_o = re_u_o + (nfft >> 2);
im_l_o = im_u_o + (nfft >> 2);

asm volatile("vsuxei16.v v16, (%0), v12" ::"r"(re_u_o));
asm volatile("vsuxei16.v v20, (%0), v12" ::"r"(im_u_o));
asm volatile("vsuxei16.v v24, (%0), v12" ::"r"(re_l_o));
asm volatile("vsuxei16.v v28, (%0), v12" ::"r"(im_l_o));
}
}
}
}
Loading
Loading