@@ -4666,126 +4666,6 @@ struct llm_build_context {
4666
4666
ctx0 = nullptr;
4667
4667
}
4668
4668
}
4669
// build_orion: builds and returns the ggml compute graph for the Orion architecture.
// Per layer: LLM_NORM (weight + bias) -> Q/K/V projections -> RoPE on Q and K ->
// llm_build_kv with wo -> residual add -> LLM_NORM (weight + bias) ->
// llm_build_ffn (LLM_FFN_SILU, LLM_FFN_PAR) -> residual add.
// After the last layer a final LLM_NORM and a multiplication with model.output
// produce the tensor tagged result_output, which is expanded into the graph.
- struct ggml_cgraph * build_orion() {
4670
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4671
-
4672
- const int64_t n_embd_head = hparams.n_embd_head_v; // per-head size, taken from the V head dimension
4673
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V heads must have the same size
4674
- GGML_ASSERT(n_embd_head == hparams.n_rot); // rotary dimension count must equal the head size
4675
-
4676
- struct ggml_tensor * cur;
4677
- struct ggml_tensor * inpL;
4678
-
4679
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); // input to the first layer
4680
- cb(inpL, "inp_embd", -1);
4681
-
4682
- // inp_pos - view over the first n_tokens entries of lctx.inp_pos (the positions)
4683
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
4684
- cb(inp_pos, "inp_pos", -1);
4685
-
4686
- // KQ_mask - n_kv x n_tokens view of lctx.inp_KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4687
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
4688
- cb(KQ_mask, "KQ_mask", -1);
4689
-
4690
- // shift the entire K-cache if needed (only when do_rope_shift is set)
4691
- if (do_rope_shift) {
4692
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
4693
- }
4694
-
4695
- for (int il = 0; il < n_layer; ++il) {
4696
- struct ggml_tensor * inpSA = inpL; // kept for the residual add after attention
4697
-
4698
- // pre-attention norm: LLM_NORM with weight attn_norm and bias attn_norm_b
4699
- cur = llm_build_norm(ctx0, inpL, hparams,
4700
- model.layers[il].attn_norm, model.layers[il].attn_norm_b,
4701
- LLM_NORM, cb, il);
4702
- cb(cur, "attn_norm", il);
4703
-
4704
- // self-attention
4705
- {
4706
- // project to Q, K and V, then apply RoPE to Q and K
4707
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
4708
- cb(Qcur, "Qcur", il); // NOTE(review): the bias adds below are commented out, so bq/bk/bv are never applied in this function
4709
- // if (model.layers[il].bq) {
4710
- // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
4711
- // cb(Qcur, "Qcur", il);
4712
- // }
4713
-
4714
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
4715
- cb(Kcur, "Kcur", il);
4716
- // if (model.layers[il].bk) {
4717
- // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
4718
- // cb(Kcur, "Kcur", il);
4719
- // }
4720
-
4721
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
4722
- cb(Vcur, "Vcur", il);
4723
- // if (model.layers[il].bv) {
4724
- // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
4725
- // cb(Vcur, "Vcur", il);
4726
- // }
4727
-
4728
- Qcur = ggml_rope_custom( // Q is reshaped to (n_embd_head, n_head, n_tokens) before rotation
4729
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
4730
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, // NOTE(review): mode argument is 2 - presumably NeoX-style rotation; confirm against ggml_rope_custom
4731
- ext_factor, attn_factor, beta_fast, beta_slow
4732
- );
4733
- cb(Qcur, "Qcur", il);
4734
-
4735
- Kcur = ggml_rope_custom( // same rotation as Q, but K is reshaped with n_head_kv heads
4736
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
4737
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
4738
- ext_factor, attn_factor, beta_fast, beta_slow
4739
- );
4740
- cb(Kcur, "Kcur", il);
4741
-
4742
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
4743
- model.layers[il].wo, NULL, // NOTE(review): NULL is presumably the absent output-projection bias; confirm against llm_build_kv
4744
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); // last float is 1/sqrt(n_embd_head); meaning of -1.0f not visible here - TODO confirm
4745
- cb(cur, "kqv_out", il);
4746
- }
4747
-
4748
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual add: attention output + layer input
4749
- cb(ffn_inp, "ffn_inp", il);
4750
-
4751
- // feed-forward network, preceded by LLM_NORM with weight ffn_norm and bias ffn_norm_b
4752
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
4753
- model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
4754
- LLM_NORM, cb, il);
4755
- cb(cur, "ffn_norm", il);
4756
-
4757
- cur = llm_build_ffn(ctx0, cur,
4758
- model.layers[il].ffn_up, NULL,
4759
- model.layers[il].ffn_gate, NULL,
4760
- model.layers[il].ffn_down, NULL,
4761
- NULL,
4762
- LLM_FFN_SILU, LLM_FFN_PAR, cb, il); // NOTE(review): the NULL slots are presumably unused optional tensors (biases); confirm against llm_build_ffn
4763
- cb(cur, "ffn_out", il);
4764
-
4765
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual add: FFN output + FFN input
4766
- cb(cur, "l_out", il);
4767
-
4768
- // input for next layer
4769
- inpL = cur;
4770
- }
4771
-
4772
- cur = inpL;
4773
-
4774
- cur = llm_build_norm(ctx0, cur, hparams, // final LLM_NORM with weight output_norm and bias output_norm_b
4775
- model.output_norm, model.output_norm_b,
4776
- LLM_NORM, cb, -1);
4777
- cb(cur, "result_norm", -1);
4778
-
4779
- // lm_head: multiply the normalized output with model.output
4780
- cur = ggml_mul_mat(ctx0, model.output, cur);
4781
- cb(cur, "result_output", -1);
4782
-
4783
- ggml_build_forward_expand(gf, cur);
4784
-
4785
- return gf;
4786
- }
4787
-
4788
-
4789
4669
4790
4670
struct ggml_cgraph * build_llama() {
4791
4671
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -6589,6 +6469,125 @@ struct llm_build_context {
6589
6469
6590
6470
return gf;
6591
6471
}
6472
+
6473
// build_orion: builds and returns the ggml compute graph for the Orion architecture.
// Per layer: LLM_NORM (weight + bias) -> Q/K/V projections -> RoPE on Q and K ->
// llm_build_kv with wo -> residual add -> LLM_NORM (weight + bias) ->
// llm_build_ffn (LLM_FFN_SILU, LLM_FFN_PAR) -> residual add.
// After the last layer a final LLM_NORM and a multiplication with model.output
// produce the tensor tagged result_output, which is expanded into the graph.
+ struct ggml_cgraph * build_orion() {
6474
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6475
+
6476
+ const int64_t n_embd_head = hparams.n_embd_head_v; // per-head size, taken from the V head dimension
6477
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V heads must have the same size
6478
+ GGML_ASSERT(n_embd_head == hparams.n_rot); // rotary dimension count must equal the head size
6479
+
6480
+ struct ggml_tensor * cur;
6481
+ struct ggml_tensor * inpL;
6482
+
6483
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); // input to the first layer
6484
+ cb(inpL, "inp_embd", -1);
6485
+
6486
+ // inp_pos - view over the first n_tokens entries of lctx.inp_pos (the positions)
6487
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6488
+ cb(inp_pos, "inp_pos", -1);
6489
+
6490
+ // KQ_mask - n_kv x n_tokens view of lctx.inp_KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6491
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6492
+ cb(KQ_mask, "KQ_mask", -1);
6493
+
6494
+ // shift the entire K-cache if needed (only when do_rope_shift is set)
6495
+ if (do_rope_shift) {
6496
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6497
+ }
6498
+
6499
+ for (int il = 0; il < n_layer; ++il) {
6500
+ struct ggml_tensor * inpSA = inpL; // kept for the residual add after attention
6501
+
6502
+ // pre-attention norm: LLM_NORM with weight attn_norm and bias attn_norm_b
6503
+ cur = llm_build_norm(ctx0, inpL, hparams,
6504
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
6505
+ LLM_NORM, cb, il);
6506
+ cb(cur, "attn_norm", il);
6507
+
6508
+ // self-attention
6509
+ {
6510
+ // project to Q, K and V, then apply RoPE to Q and K
6511
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6512
+ cb(Qcur, "Qcur", il); // NOTE(review): the bias adds below are commented out, so bq/bk/bv are never applied in this function
6513
+ // if (model.layers[il].bq) {
6514
+ // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6515
+ // cb(Qcur, "Qcur", il);
6516
+ // }
6517
+
6518
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6519
+ cb(Kcur, "Kcur", il);
6520
+ // if (model.layers[il].bk) {
6521
+ // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6522
+ // cb(Kcur, "Kcur", il);
6523
+ // }
6524
+
6525
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6526
+ cb(Vcur, "Vcur", il);
6527
+ // if (model.layers[il].bv) {
6528
+ // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6529
+ // cb(Vcur, "Vcur", il);
6530
+ // }
6531
+
6532
+ Qcur = ggml_rope_custom( // Q is reshaped to (n_embd_head, n_head, n_tokens) before rotation
6533
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6534
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, // NOTE(review): mode argument is 2 - presumably NeoX-style rotation; confirm against ggml_rope_custom
6535
+ ext_factor, attn_factor, beta_fast, beta_slow
6536
+ );
6537
+ cb(Qcur, "Qcur", il);
6538
+
6539
+ Kcur = ggml_rope_custom( // same rotation as Q, but K is reshaped with n_head_kv heads
6540
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6541
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6542
+ ext_factor, attn_factor, beta_fast, beta_slow
6543
+ );
6544
+ cb(Kcur, "Kcur", il);
6545
+
6546
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6547
+ model.layers[il].wo, NULL, // NOTE(review): NULL is presumably the absent output-projection bias; confirm against llm_build_kv
6548
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); // last float is 1/sqrt(n_embd_head); meaning of -1.0f not visible here - TODO confirm
6549
+ cb(cur, "kqv_out", il);
6550
+ }
6551
+
6552
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual add: attention output + layer input
6553
+ cb(ffn_inp, "ffn_inp", il);
6554
+
6555
+ // feed-forward network, preceded by LLM_NORM with weight ffn_norm and bias ffn_norm_b
6556
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6557
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
6558
+ LLM_NORM, cb, il);
6559
+ cb(cur, "ffn_norm", il);
6560
+
6561
+ cur = llm_build_ffn(ctx0, cur,
6562
+ model.layers[il].ffn_up, NULL,
6563
+ model.layers[il].ffn_gate, NULL,
6564
+ model.layers[il].ffn_down, NULL,
6565
+ NULL,
6566
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il); // NOTE(review): the NULL slots are presumably unused optional tensors (biases); confirm against llm_build_ffn
6567
+ cb(cur, "ffn_out", il);
6568
+
6569
+ cur = ggml_add(ctx0, cur, ffn_inp); // second residual add: FFN output + FFN input
6570
+ cb(cur, "l_out", il);
6571
+
6572
+ // input for next layer
6573
+ inpL = cur;
6574
+ }
6575
+
6576
+ cur = inpL;
6577
+
6578
+ cur = llm_build_norm(ctx0, cur, hparams, // final LLM_NORM with weight output_norm and bias output_norm_b
6579
+ model.output_norm, model.output_norm_b,
6580
+ LLM_NORM, cb, -1);
6581
+ cb(cur, "result_norm", -1);
6582
+
6583
+ // lm_head: multiply the normalized output with model.output
6584
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6585
+ cb(cur, "result_output", -1);
6586
+
6587
+ ggml_build_forward_expand(gf, cur);
6588
+
6589
+ return gf;
6590
+ }
6592
6591
};
6593
6592
6594
6593
static struct ggml_cgraph * llama_build_graph(
0 commit comments