@@ -7354,11 +7354,11 @@ struct llm_build_gemma2 : public llm_graph_context {
 };

 struct llm_build_gemma3 : public llm_graph_context {
-    llm_build_gemma3(const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+    llm_build_gemma3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head_k = hparams.n_embd_head_k;

-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
+        ggml_tensor * cur;
+        ggml_tensor * inpL;

         inpL = build_inp_embd(model.tok_embd);

@@ -7369,10 +7369,10 @@ struct llm_build_gemma3 : public llm_graph_context {
         }

         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = build_inp_pos();
+        ggml_tensor * inp_pos = build_inp_pos();

         // TODO: is causal == true correct? might need some changes
-        auto inp_attn = build_attn_inp_kv_self(true, true);
+        auto * inp_attn = build_attn_inp_kv_unified(true, true);

         // "5-to-1 interleaved attention"
         // 5 layers of local attention followed by 1 layer of global attention
@@ -7381,8 +7381,8 @@ struct llm_build_gemma3 : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);

-            const float freq_base_l = is_sliding ? 10000.0f : freq_base;
-            const float freq_scale_l = is_sliding ? 1.0f : freq_scale;
+            const float freq_base_l  = is_sliding ? 10000.0f : freq_base;
+            const float freq_scale_l = is_sliding ? 1.0f     : freq_scale;

             // norm
             cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
@@ -7391,13 +7391,13 @@ struct llm_build_gemma3 : public llm_graph_context {
             // self-attention
             {
                 // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);

-                struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);

-                struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);

                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens);
@@ -7420,7 +7420,7 @@ struct llm_build_gemma3 : public llm_graph_context {
                         ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);

-                cur = build_attn(inp_attn.get(), gf,
+                cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, NULL,
                         Qcur, Kcur, Vcur, nullptr, hparams.f_attention_scale, il);
             }
@@ -7432,12 +7432,12 @@ struct llm_build_gemma3 : public llm_graph_context {

             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
-                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
                 cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }

-            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
             cb(sa_out, "sa_out", il);

             cur = build_norm(sa_out,
@@ -11017,7 +11017,7 @@ llm_graph_result_ptr llama_model::build_graph(
             } break;
         case LLM_ARCH_GEMMA3:
             {
-                llm = std::make_unique<llm_build_gemma3>(params, gf);
+                llm = std::make_unique<llm_build_gemma3>(*this, params, gf);
             } break;
         case LLM_ARCH_STARCODER2:
             {
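For reference, a small standalone sketch (not part of the diff) of the layer-selection logic used in the Gemma3 builder above: with sliding_window_pattern = 6, the first five layers of every block of six use local (sliding-window) attention with a fixed RoPE base of 10000, and every sixth layer uses global attention with the model's configured freq_base. The n_layer and freq_base values below are illustrative assumptions, not taken from a real model.

// Standalone illustration of the "5-to-1 interleaved attention" selection shown in the diff.
// n_layer and freq_base are assumed demo values.
#include <cstdio>

int main() {
    const int   n_layer   = 12;         // assumed layer count for the demo
    const float freq_base = 1000000.0f; // assumed global RoPE base for the demo

    // 5 layers of local (sliding-window) attention followed by 1 layer of global attention
    const int sliding_window_pattern = 6;

    for (int il = 0; il < n_layer; ++il) {
        const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);

        // local layers use a fixed RoPE base of 10000, global layers use the model's freq_base
        const float freq_base_l = is_sliding ? 10000.0f : freq_base;

        std::printf("layer %2d: %s attention, rope freq_base = %.1f\n",
                il, is_sliding ? "local " : "global", freq_base_l);
    }
    return 0;
}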