
Commit de8f01c

model : wire up Nemotron-H tensors for NVFP4 support (ggml-org#20561)
* wire up Nemotron-H tensors for NVFP4 support
* add ssm tensors
* alignment
1 parent 079e5a4 commit de8f01c

4 files changed: 18 additions & 11 deletions


src/llama-model.cpp

Lines changed: 3 additions & 0 deletions
@@ -7501,6 +7501,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             }
 
             // recurrent / linear-attention weight scales (per-tensor, shape {1})
+            if (!layer.ssm_in_s && layer.ssm_in) {
+                layer.ssm_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
             if (!layer.ssm_out_s && layer.ssm_out) {
                 layer.ssm_out_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
             }

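For context, the tensors created above are optional, single-element ({1}) scale tensors for the recurrent/linear-attention projections: they are loaded with TENSOR_NOT_REQUIRED only when the corresponding base weight exists, and, when present, the projection result is rescaled by that scalar. Below is a minimal standalone sketch of the idea in plain C++, not the ggml graph API; the function name and types are illustrative only.

#include <cstdio>
#include <optional>
#include <vector>

// Illustrative only: y = W * x, optionally rescaled by a per-tensor scale,
// mirroring how an optional "*_s" tensor would be consumed if it was loaded.
static std::vector<float> matmul_scaled(const std::vector<std::vector<float>> & W,
                                        const std::vector<float> & x,
                                        std::optional<float> scale) {
    std::vector<float> y(W.size(), 0.0f);
    for (size_t r = 0; r < W.size(); ++r) {
        for (size_t c = 0; c < x.size(); ++c) {
            y[r] += W[r][c] * x[c];
        }
        if (scale) {      // analogue of the scale tensor being present
            y[r] *= *scale;
        }
    }
    return y;
}

int main() {
    const std::vector<std::vector<float>> W = {{1, 2}, {3, 4}};
    const std::vector<float> x = {1, 1};
    for (float v : matmul_scaled(W, x, 0.5f)) {
        printf("%g\n", v); // prints 1.5 and 3.5
    }
}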
src/llama-model.h

Lines changed: 2 additions & 1 deletion
@@ -409,7 +409,8 @@ struct llama_layer {
     struct ggml_tensor * ffn_gate_shexp_s = nullptr;
     struct ggml_tensor * ffn_up_shexp_s = nullptr;
     struct ggml_tensor * ffn_down_shexp_s = nullptr;
-    struct ggml_tensor * ssm_out_s = nullptr;
+    struct ggml_tensor * ssm_in_s = nullptr;
+    struct ggml_tensor * ssm_out_s = nullptr;
     struct ggml_tensor * ssm_alpha_s = nullptr;
     struct ggml_tensor * ssm_beta_s = nullptr;

src/models/mamba-base.cpp

Lines changed: 4 additions & 4 deletions
@@ -42,7 +42,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba_layer(llm_graph_input_rs * inp,
     cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
 
     // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
-    ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
+    ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur, layer.ssm_in_s);
     // split the above in two
     // => {d_inner, n_seq_tokens, n_seqs}
     ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
@@ -137,7 +137,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba_layer(llm_graph_input_rs * inp,
         y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
 
         // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
-        cur = build_lora_mm(layer.ssm_out, y);
+        cur = build_lora_mm(layer.ssm_out, y, layer.ssm_out_s);
     }
 
     // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
@@ -184,7 +184,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
     // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
 
     // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
-    ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
+    ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur, model.layers[il].ssm_in_s);
 
     // split the above in three
     ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim * zxBCdt->nb[0],
@@ -278,7 +278,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
         y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
 
         // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
-        cur = build_lora_mm(model.layers[il].ssm_out, y);
+        cur = build_lora_mm(model.layers[il].ssm_out, y, model.layers[il].ssm_out_s);
     }
 
     // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}

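The hunks above pass the new scale tensor as a third argument to build_lora_mm. A hedged sketch of that calling convention, assuming (as the unchanged two-argument call sites elsewhere in the tree suggest) that the scale parameter is optional and defaults to null; the helper below is hypothetical and only illustrates the pattern, not the real build_lora_mm signature.

#include <cstdio>

struct tensor { float value; };

// Hypothetical stand-in: the third parameter is optional, so existing
// two-argument call sites keep compiling, and the per-tensor scale is
// applied only when the corresponding "*_s" tensor was actually loaded.
static float mm_with_optional_scale(const tensor & w, const tensor & x,
                                    const tensor * scale = nullptr) {
    float y = w.value * x.value;   // stands in for the matmul
    if (scale != nullptr) {
        y *= scale->value;         // per-tensor weight scale
    }
    return y;
}

int main() {
    tensor w{2.0f}, x{3.0f}, s{0.25f};
    printf("%g\n", mm_with_optional_scale(w, x));     // 6   (no scale tensor)
    printf("%g\n", mm_with_optional_scale(w, x, &s)); // 1.5 (scale applied)
}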
src/models/nemotron-h.cpp

Lines changed: 9 additions & 6 deletions
@@ -107,9 +107,9 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
 ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il) {
     if (model.layers[il].ffn_gate_inp == nullptr) {
         cur = build_ffn(cur,
-                model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up_s,
                 NULL, NULL, NULL,
-                model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down_s,
                 NULL,
                 LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
         cb(cur, "ffn_out", il);
@@ -136,17 +136,20 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il) {
                 hparams.expert_weights_scale,
                 LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
                 il,
-                router_logits);
+                router_logits, nullptr,
+                model.layers[il].ffn_up_exps_s,
+                nullptr, // no gate
+                model.layers[il].ffn_down_exps_s);
         cb(moe_out, "ffn_moe_out", il);
 
         if (model.layers[il].ffn_latent_up) {
             moe_out = ggml_mul_mat(ctx0, model.layers[il].ffn_latent_up, moe_out);
         }
 
         ggml_tensor * ffn_shexp = build_ffn(inp_emb,
-                model.layers[il].ffn_up_shexp, NULL, NULL,
-                NULL /* no gate */ , NULL, NULL,
-                model.layers[il].ffn_down_shexp, NULL, NULL,
+                model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp_s,
+                NULL /* no gate */ , NULL, NULL,
+                model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp_s,
                 NULL,
                 LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
         cb(ffn_shexp, "ffn_shexp", il);

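In the FFN and MoE paths above, the NULL placeholders in build_ffn / build_moe_ffn are replaced with the per-tensor scale tensors, while the gate slot stays nullptr because this FFN uses LLM_FFN_RELU_SQR and has no gate projection. A rough standalone sketch of that convention, using a hypothetical helper in plain C++ rather than the ggml graph API; only the pattern (optional up/down scales, no gate) is taken from the diff.

#include <cstdio>

// Hypothetical illustration: per-projection scales are passed individually
// and applied only when provided; the gate slot is simply absent for a
// gate-less ReLU^2 FFN such as Nemotron-H's.
static float apply_scale(float v, const float * s) {
    return s ? v * *s : v;
}

static float ffn_relu_sqr(float x,
                          float w_up,   const float * up_s,
                          float w_down, const float * down_s) {
    float h = apply_scale(w_up * x, up_s);   // up projection (+ optional scale)
    h = h > 0.0f ? h * h : 0.0f;             // ReLU^2 activation, no gate branch
    return apply_scale(w_down * h, down_s);  // down projection (+ optional scale)
}

int main() {
    const float up_s = 0.5f, down_s = 2.0f;
    printf("%g\n", ffn_relu_sqr(3.0f, 1.0f, &up_s, 1.0f, &down_s));   // 4.5
    printf("%g\n", ffn_relu_sqr(3.0f, 1.0f, nullptr, 1.0f, nullptr)); // 9
}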