@@ -90,11 +90,11 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35moe::build_qkvz(
9090 const int64_t n_seqs = ubatch.n_seqs ;
9191 const int64_t n_seq_tokens = ubatch.n_seq_tokens ;
9292
93- ggml_tensor * qkv_mixed = build_lora_mm (model.layers [il].wqkv , input);
93+ ggml_tensor * qkv_mixed = build_lora_mm (model.layers [il].wqkv , input, model. layers [il]. wqkv_s );
9494 qkv_mixed = ggml_reshape_3d (ctx0, qkv_mixed, qkv_mixed->ne [0 ], n_seq_tokens, n_seqs);
9595 cb (qkv_mixed, " linear_attn_qkv_mixed" , il);
9696
97- ggml_tensor * z = build_lora_mm (model.layers [il].wqkv_gate , input);
97+ ggml_tensor * z = build_lora_mm (model.layers [il].wqkv_gate , input, model. layers [il]. wqkv_gate_s );
9898 cb (z, " z" , il);
9999
100100 return { qkv_mixed, z };
@@ -123,7 +123,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn(
123123 // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
124124
125125 // Qwen3Next uses a single Q projection that outputs query + gate
126- ggml_tensor * Qcur_full = build_lora_mm (model.layers [il].wq , cur); // [ (n_embd_head * 2) * n_head, n_tokens ]
126+ ggml_tensor * Qcur_full = build_lora_mm (model.layers [il].wq , cur, model. layers [il]. wq_s ); // [ (n_embd_head * 2) * n_head, n_tokens ]
127127 cb (Qcur_full, " Qcur_full" , il);
128128
129129 ggml_tensor * Qcur = ggml_view_3d (ctx0, Qcur_full, n_embd_head, n_head, n_tokens,
@@ -135,10 +135,10 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn(
135135 Qcur = build_norm (Qcur, model.layers [il].attn_q_norm , nullptr , LLM_NORM_RMS, il);
136136 cb (Qcur, " Qcur_normed" , il);
137137
138- ggml_tensor * Kcur = build_lora_mm (model.layers [il].wk , cur);
138+ ggml_tensor * Kcur = build_lora_mm (model.layers [il].wk , cur, model. layers [il]. wk_s );
139139 cb (Kcur, " Kcur" , il);
140140
141- ggml_tensor * Vcur = build_lora_mm (model.layers [il].wv , cur);
141+ ggml_tensor * Vcur = build_lora_mm (model.layers [il].wv , cur, model. layers [il]. wv_s );
142142 cb (Vcur, " Vcur" , il);
143143
144144 // Apply K normalization
@@ -186,7 +186,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn(
186186 cur = ggml_mul (ctx0, cur, gate_sigmoid);
187187 cb (cur, " attn_gated" , il);
188188
189- cur = build_lora_mm (model.layers [il].wo , cur);
189+ cur = build_lora_mm (model.layers [il].wo , cur, model. layers [il]. wo_s );
190190 cb (cur, " attn_output" , il);
191191
192192 return cur;
@@ -217,13 +217,13 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
217217 ggml_tensor * qkv_mixed = qkvz.first ;
218218 ggml_tensor * z = qkvz.second ;
219219
220- ggml_tensor * beta = build_lora_mm (model.layers [il].ssm_beta , cur);
220+ ggml_tensor * beta = build_lora_mm (model.layers [il].ssm_beta , cur, model. layers [il]. ssm_beta_s );
221221 beta = ggml_reshape_4d (ctx0, beta, 1 , num_v_heads, n_seq_tokens, n_seqs);
222222 cb (beta, " beta" , il);
223223
224224 beta = ggml_sigmoid (ctx0, beta);
225225
226- ggml_tensor * alpha = build_lora_mm (model.layers [il].ssm_alpha , cur);
226+ ggml_tensor * alpha = build_lora_mm (model.layers [il].ssm_alpha , cur, model. layers [il]. ssm_alpha_s );
227227 alpha = ggml_cont_3d (ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
228228 cb (alpha, " alpha" , il);
229229
@@ -356,7 +356,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
356356 cb (final_output, " final_output" , il);
357357
358358 // Output projection
359- cur = build_lora_mm (model.layers [il].ssm_out , final_output);
359+ cur = build_lora_mm (model.layers [il].ssm_out , final_output, model. layers [il]. ssm_out_s );
360360 cb (cur, " linear_attn_out" , il);
361361
362362 // Reshape back to original dimensions
@@ -380,16 +380,19 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_ffn(ggml_tensor * cur, const int
380380 LLM_FFN_SILU, true ,
381381 hparams.expert_weights_scale ,
382382 LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
383- nullptr , model.layers [il].ffn_gate_up_exps );
383+ nullptr , model.layers [il].ffn_gate_up_exps ,
384+ model.layers [il].ffn_up_exps_s ,
385+ model.layers [il].ffn_gate_exps_s ,
386+ model.layers [il].ffn_down_exps_s );
384387 cb (moe_out, " ffn_moe_out" , il);
385388
386389 // Add shared experts if present - following Qwen3Next reference implementation
387390 if (model.layers [il].ffn_up_shexp != nullptr ) {
388391 ggml_tensor * ffn_shexp =
389392 build_ffn (cur,
390- model.layers [il].ffn_up_shexp , NULL , NULL ,
391- model.layers [il].ffn_gate_shexp , NULL , NULL ,
392- model.layers [il].ffn_down_shexp , NULL , NULL ,
393+ model.layers [il].ffn_up_shexp , NULL , model. layers [il]. ffn_up_shexp_s ,
394+ model.layers [il].ffn_gate_shexp , NULL , model. layers [il]. ffn_gate_shexp_s ,
395+ model.layers [il].ffn_down_shexp , NULL , model. layers [il]. ffn_down_shexp_s ,
393396 NULL ,
394397 LLM_FFN_SILU, LLM_FFN_PAR, il);
395398 cb (ffn_shexp, " ffn_shexp" , il);
0 commit comments