Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Bender.lock
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ packages:
- common_cells
- tech_cells_generic
spatz:
revision: 94ff5f6ca70e4dfef6168e0ac25b0bdd88e40132
revision: ed25c78dd72d839db8141287f9516d78ee399b93
version: null
source:
Git: https://github.com/pulp-platform/spatz.git
Expand Down
14 changes: 9 additions & 5 deletions hardware/src/cachepool_group.sv
Original file line number Diff line number Diff line change
Expand Up @@ -265,16 +265,20 @@ module cachepool_group
assign tile_remote_in_rsp_valid[j][t*NumRemotePortCore+r] = tile_remote_in_rsp[t][j+r*NrTCDMPortsPerCore].p_valid;
assign tile_remote_in_req_ready[j][t*NumRemotePortCore+r] = tile_remote_in_rsp[t][j+r*NrTCDMPortsPerCore].q_ready;

// Request selection: convert narrow tile_id to wide xbar index by appending
// core_id % NumRemotePortCore (available in the request channel user field)
// Request selection: route to target tile's remote-in slot based on
// target tile ID, so that all requests to the same destination tile
// travel through one pipeline — preserving write-before-read ordering.
assign remote_out_sel_xbar[j][t*NumRemotePortCore+r] = remote_xbar_sel_t'(
remote_out_sel_tile[t][j+r*NrTCDMPortsPerCore] * NumRemotePortCore
+ tile_remote_out_req_chan[j][t*NumRemotePortCore+r].user.core_id % NumRemotePortCore);
+ remote_out_sel_tile[t][j+r*NrTCDMPortsPerCore] % NumRemotePortCore);

// Response selection: recover xbar port from tile_id and core_id in response user field
// Response selection: route back to source tile's remote-out slot.
// The originator (tile_id in user field) sent on slot
// (target_tile % NumRemotePortCore). The responding tile is `t`
// (genvar), so target_tile = t.
assign remote_in_sel_xbar[j][t*NumRemotePortCore+r] = remote_xbar_sel_t'(
tile_remote_in_rsp_chan[j][t*NumRemotePortCore+r].user.tile_id * NumRemotePortCore
+ tile_remote_in_rsp_chan[j][t*NumRemotePortCore+r].user.core_id % NumRemotePortCore);
+ t % NumRemotePortCore);
end
end
end
Expand Down
25 changes: 15 additions & 10 deletions hardware/src/tcdm_cache_interco.sv
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ module tcdm_cache_interco #(

for (genvar port = 0; port < NumCores+NumRemotePort; port++) begin : gen_req_sel
logic [CacheBankBits-1:0] addr_bank;
logic [TileIDWidth-1:0] addr_tile;

always_comb begin
// Defaults.
Expand All @@ -226,6 +227,8 @@ module tcdm_cache_interco #(

// Extract the raw BankSel field from the address.
addr_bank = core_req[port].addr[dynamic_offset_i +: CacheBankBits];
// Extract the target TileID from the address (used for remote port selection).
addr_tile = core_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth];

if (num_private_cache_q == ($clog2(NumCache)+1)'(NumCache) || NumTiles == 1) begin
// All-private or single-tile: every request is local.
Expand All @@ -236,11 +239,13 @@ module tcdm_cache_interco #(
end else if (num_private_cache_q == '0) begin
// All-shared: check TileID to decide local vs. remote.
// Use the full BankSel field directly (no folding needed).
local_sel[port] =
(core_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth] == tile_id_i);
local_sel[port] = (addr_tile == tile_id_i);
// Route remote requests by target tile ID so that all accesses to the
// same tile share a single pipeline, preserving write-before-read
// ordering across barriers.
core_req_sel[port] = local_sel[port]
? core_sel_t'(addr_bank)
: core_sel_t'(NumCache + (port % NumRemotePort));
: core_sel_t'(NumCache + (addr_tile % NumRemotePort));

end else begin
// Mixed: fold addr_bank into the appropriate partition via modulo.
Expand All @@ -252,11 +257,10 @@ module tcdm_cache_interco #(
end else begin
// Shared request: check TileID to decide local vs. remote.
// bank = num_private_cache_q + (addr_bank % num_shared_cache_q).
local_sel[port] =
(core_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth] == tile_id_i);
local_sel[port] = (addr_tile == tile_id_i);
core_req_sel[port] = local_sel[port]
? core_sel_t'(num_private_cache_q + (addr_bank % num_shared_cache_q))
: core_sel_t'(NumCache + (port % NumRemotePort));
: core_sel_t'(NumCache + (addr_tile % NumRemotePort));
end
end
end
Expand All @@ -270,10 +274,11 @@ module tcdm_cache_interco #(
always_comb begin
mem_rsp_sel[port] = mem_rsp[port].user.core_id;
if (mem_rsp[port].user.tile_id != tile_id_i) begin
// Response from a remote tile: forward to the remote interco port.
// Use core_id % NumRemotePort to select the correct remote-in channel,
// consistent with the request-side mapping (port % NumRemotePort).
mem_rsp_sel[port] = mem_sel_t'(NumCores + (mem_rsp[port].user.core_id % NumRemotePort));
// Response destined for a remote tile: forward to the remote interco
// port that matches the incoming request path. The group-level xbar
// routes requests from source tile S to our remote-in slot
// (S % NumRemotePort), so responses must return via the same slot.
mem_rsp_sel[port] = mem_sel_t'(NumCores + (mem_rsp[port].user.tile_id % NumRemotePort));
end
end
end
Expand Down
3 changes: 3 additions & 0 deletions software/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ add_spatz_test_threeParam(fmatmul-32b fmatmul-32b/main.c 32 32 32)
add_spatz_test_threeParam(fmatmul-32b fmatmul-32b/main.c 64 64 64)
add_spatz_test_threeParam(fmatmul-32b fmatmul-32b/main.c 128 128 128)

add_spatz_test_twoParam(fft-32b fft-32b/main.c 256 4)
add_spatz_test_twoParam(fft-32b fft-32b/main.c 1024 16)

### Integer-Point
add_spatz_test_oneParam(idotp-32b idotp-32b/main.c 8192)
add_spatz_test_oneParam(idotp-32b idotp-32b/main.c 32768)
Expand Down
22 changes: 22 additions & 0 deletions software/tests/fft-32b/data/data_1024_16.h

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions software/tests/fft-32b/data/data_256_4.h

Large diffs are not rendered by default.

254 changes: 254 additions & 0 deletions software/tests/fft-32b/kernel/fft.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
// Copyright 2026 ETH Zurich and University of Bologna.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Author: Diyou Shen, Matteo Perotti, ETH Zurich

#include "fft.h"

/* Phase 1 of the multi-core vector FFT */
/* This function is invoked several times from main, with a barrier between calls. */

void fft_p1 (float *src, float *buf, const float *twi,
const uint32_t nfft, const uint32_t ntwi,
const uint32_t cid, const uint32_t num_cores,
const uint32_t stage, const uint32_t len) {

size_t avl = (size_t) len;
size_t vl;

const float *re_twi, *im_twi;
const float *re_u_i, *im_u_i, *re_l_i, *im_l_i;
float *re_u_o, *im_u_o, *re_l_o, *im_l_o;

const float *i_buf;
float *o_buf;
i_buf = src;
o_buf = buf;

// Is there a more efficient way to calculate these pointers?
for (uint32_t group = 0; group < (1 << stage); group ++) {
// divide cores into 2^(stage) groups
if (cid < ((num_cores >> stage)*(group+1))) {
// stage 0: 1 group, 0;
// stage 1: 2 group, 0 : nfft/2;
// stage 2: 4 group, 0 : nfft/4 : 2*nfft/4 : 3*nfft/4;
// ......

// offset for different groups
uint32_t offset = (nfft >> stage) * group;
uint32_t idx = cid - (num_cores >> stage) * group;

// inputs pointer
re_u_i = i_buf + offset + idx * len;
re_l_i = re_u_i + (nfft >> (stage + 1));
im_u_i = re_u_i + nfft;
im_l_i = re_l_i + nfft;

// output pointer
re_u_o = o_buf + offset + idx * len;
re_l_o = re_u_o + (nfft >> (stage + 1));
im_u_o = re_u_o + nfft;
im_l_o = re_l_o + nfft;

// twiddle pointer
// twiddle will not need add group offset
// main will jump to next twiddle
// each group will have the same twiddle
re_twi = twi + idx * len;
im_twi = re_twi + ntwi;

// Once the core gets the pointer, it needs to leave the loop
break;
}
}

// initial value of avl has been calculated earlier
for (; avl > 0; avl -= vl) {
// re_u_o = re_u_i + re_l_i;
// im_u_o = im_u_i + im_l_i;
// re_l_o = (re_u_i - re_l_i) * re_twi - (im_u_i - im_l_i) * im_twi;
// im_l_o = (re_u_i - re_l_i) * im_twi + (im_u_i - im_l_i) * re_twi;
asm volatile("vsetvli %0, %1, e32, m4, ta, ma" : "=r"(vl) : "r"(avl));

asm volatile("vle32.v v0, (%0);" ::"r"(re_u_i)); // v0: Re upper wing
re_u_i += vl;
asm volatile("vle32.v v4, (%0);" ::"r"(re_l_i)); // v4: Re lower wing
re_l_i += vl;
asm volatile("vfadd.vv v16, v0, v4"); // v16: Re butterfly output upper wing
asm volatile("vfsub.vv v0, v0, v4"); // v0: Re butterfly output upper wing

asm volatile("vle32.v v8, (%0);" ::"r"(im_u_i)); // v8: Im upper wing
im_u_i += vl;
asm volatile("vle32.v v12, (%0);" ::"r"(im_l_i)); // v12: Im lower wing
im_l_i += vl;

asm volatile("vfadd.vv v20, v8, v12"); // v20: Im butterfly output upper wing

asm volatile("vfsub.vv v4, v8, v12"); // v4: Im butterfly output upper wing

// Load the twiddle vector
asm volatile("vle32.v v8, (%0);" ::"r"(re_twi)); // v8: Re twi
re_twi += vl;
asm volatile("vle32.v v12, (%0);" ::"r"(im_twi)); // v12: Im twi
im_twi += vl;

// Twiddle the lower wing
// re_l_o = - v0 * v8 - v4 * v12
// im_l_o = v0 * v12 + v4 * v8
// Store 1:1 the output result
// Sequence do not need to shuffle in first phase
asm volatile("vfmul.vv v24, v0, v8");
asm volatile("vfnmsac.vv v24, v4, v12"); // v24: Re butterfly output
// twiddled lower wing
asm volatile("vse32.v v16, (%0)" ::"r"(re_u_o));
re_u_o += vl;
asm volatile("vse32.v v20, (%0)" ::"r"(im_u_o));
im_u_o += vl;
asm volatile("vfmul.vv v28, v0, v12");
asm volatile("vfmacc.vv v28, v4, v8"); // v28: Im butterfly output
// twiddled lower wing

asm volatile("vse32.v v24, (%0)" ::"r"(re_l_o));
re_l_o += vl;
asm volatile("vse32.v v28, (%0)" ::"r"(im_l_o));
im_l_o += vl;
}
}

/* Phase 2 of the multi-core vector FFT */
// DIF Cooley-Tukey algorithm.
// Every stage stores its results with indexed accesses (strided in the last stage).
void fft_p2(float *s, float *buf, const float *twi, float *out,
const uint16_t *seq_idx, const uint32_t nfft,
const uint32_t nfft_ori, const uint32_t log2_nfft,
const uint32_t stride, const uint32_t stride_e, const uint32_t ntwi) {

// Always run in dual-core mode

// Real part of the twiddles
const float *re_twi = twi;
// Img part of the twiddles
const float *im_twi = twi + ntwi;

// Keep half of the samples in a vector register
size_t avl;
size_t vl;

// Loop through the butterfly stages
for (uint32_t bf = 0; bf < log2_nfft; ++bf) {
// Keep half of the samples in a vector register
avl = nfft >> 1;
// Swap between the two buffers
const float *i_buf;
float *o_buf;
i_buf = !(bf & 1) ? buf : s;
o_buf = !(bf & 1) ? s : buf;

// Last iteration
if (bf == log2_nfft - 1)
o_buf = buf;

// Update pointers
const float *re_u_i = i_buf;
const float *im_u_i = i_buf + nfft_ori;
const float *re_l_i = re_u_i + (nfft >> 1);
const float *im_l_i = im_u_i + (nfft >> 1);
float *re_u_o = o_buf;
float *im_u_o = o_buf + nfft_ori;
float *re_l_o = re_u_o + (nfft >> 1);
float *im_l_o = im_u_o + (nfft >> 1);

float *re_u_s = out;
float *im_u_s = out + nfft_ori;
float *re_l_s = re_u_s + (nfft_ori >> 1);
float *im_l_s = im_u_s + (nfft_ori >> 1);

// Stripmine the whole vector for this butterfly stage
for (; avl > 0; avl -= vl) {
// re_u_o = re_u_i + re_l_i;
// im_u_o = im_u_i + im_l_i;
// re_l_o = (re_u_i - re_l_i) * re_twi - (im_u_i - im_l_i) * im_twi;
// im_l_o = (re_u_i - re_l_i) * im_twi + (im_u_i - im_l_i) * re_twi;
// Stripmine
// Group 4 registers as a larger register to improve performance
asm volatile("vsetvli %0, %1, e32, m4, ta, ma" : "=r"(vl) : "r"(avl));

// 2 load/store with 2 calc sometimes gives a better performance (window of 4 insn)
asm volatile("vle32.v v0, (%0);" ::"r"(re_u_i)); // v0: Re upper wing
re_u_i += vl;
asm volatile("vle32.v v4, (%0);" ::"r"(re_l_i)); // v4: Re lower wing
re_l_i += vl;

asm volatile("vfadd.vv v16, v0, v4"); // v16: Re butterfly output upper wing
asm volatile("vfsub.vv v0, v0, v4"); // v0: Re butterfly output upper wing

asm volatile("vle32.v v8, (%0);" ::"r"(im_u_i)); // v8: Im upper wing
im_u_i += vl;
asm volatile("vle32.v v12, (%0);" ::"r"(im_l_i)); // v12: Im lower wing
im_l_i += vl;

asm volatile("vfadd.vv v20, v8, v12"); // v20: Im butterfly output upper wing
asm volatile("vfsub.vv v4, v8, v12"); // v4: Im butterfly output upper wing

// Load the index vector. If last step, do strided store
// Otherwise, it's the helper index for the permutations (this is a mask
// vector)
if (bf == log2_nfft - 1) {
// Last store is a strided pattern
// use strided store instead of index store to improve performance
asm volatile("vsse32.v v16, (%0), %1" ::"r"(re_u_s),"r"(stride));
asm volatile("vsse32.v v20, (%0), %1" ::"r"(im_u_s),"r"(stride));
asm volatile("vsse32.v v0, (%0), %1" ::"r"(re_l_s),"r"(stride));
asm volatile("vsse32.v v4, (%0), %1" ::"r"(im_l_s),"r"(stride));
// update the store pointer
re_u_s += (vl << stride_e);
im_u_s += (vl << stride_e);
re_l_s += (vl << stride_e);
im_l_s += (vl << stride_e);
} else {
// TODO: Actually, there is no need to st then ld
// Can we add an insn to shuffle the elem position?
// Load the twiddle vector
asm volatile("vle32.v v8, (%0);" ::"r"(re_twi)); // v8: Re twi
re_twi += vl;
asm volatile("vle32.v v12, (%0);" ::"r"(im_twi)); // v12: Im twi
im_twi += vl;

// Twiddle the lower wing
// re_l_o = - v0 * v8 - v4 * v12
// im_l_o = v0 * v12 + v4 * v8
asm volatile("vfmul.vv v24, v0, v8");
asm volatile("vfnmsac.vv v24, v4, v12"); // v24: Re butterfly output
// twiddled lower wing
asm volatile("vfmul.vv v28, v0, v12");
asm volatile("vfmacc.vv v28, v4, v8"); // v28: Im butterfly output
// twiddled lower wing
// Load the sequential indices dirctly
asm volatile("vle16.v v12, (%0)" ::"r"(seq_idx)); // v24: index vector
seq_idx += vl;
re_u_o = o_buf;
im_u_o = o_buf + nfft_ori;
re_l_o = re_u_o + (nfft >> 2);
im_l_o = im_u_o + (nfft >> 2);

asm volatile("vsuxei16.v v16, (%0), v12" ::"r"(re_u_o));
asm volatile("vsuxei16.v v20, (%0), v12" ::"r"(im_u_o));
asm volatile("vsuxei16.v v24, (%0), v12" ::"r"(re_l_o));
asm volatile("vsuxei16.v v28, (%0), v12" ::"r"(im_l_o));
}
}
}
}
Loading
Loading