 #include "common.h"
 #include "json.hpp"
 #include "llama.h"
-#include "grammar-parser.h"
 #include "backend.pb.h"
 #include "backend.grpc.pb.h"
 #include "utils.hpp"
@@ -203,8 +202,8 @@ struct llama_client_slot
     std::string stopping_word;
 
     // sampling
-    struct llama_sampling_params sparams;
-    llama_sampling_context *ctx_sampling = nullptr;
+    struct gpt_sampler_params sparams;
+    gpt_sampler *ctx_sampling = nullptr;
 
     int32_t ga_i = 0;   // group-attention state
     int32_t ga_n = 1;   // group-attention factor
@@ -619,7 +618,7 @@ struct llama_server_context
 
     bool launch_slot_with_data(llama_client_slot* &slot, json data) {
         slot_params default_params;
-        llama_sampling_params default_sparams;
+        gpt_sampler_params default_sparams;
 
         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);
@@ -628,7 +627,7 @@ struct llama_server_context
         slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
         slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
         slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
-        slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
+        slot->sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
         slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
         slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
         slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
@@ -641,7 +640,7 @@ struct llama_server_context
         slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
         slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
         slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
-        slot->params.seed = json_value(data, "seed", default_params.seed);
+        slot->sparams.seed = json_value(data, "seed", default_sparams.seed);
         slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
         slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
@@ -665,6 +664,7 @@ struct llama_server_context
             slot->params.input_prefix = "";
         }
 
+
         if (data.count("input_suffix") != 0)
         {
             slot->params.input_suffix = data["input_suffix"];
@@ -683,6 +683,10 @@ struct llama_server_context
             slot->prompt = "";
         }
 
+        if (json_value(data, "ignore_eos", false)) {
+            slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
+        }
+        /*
         slot->sparams.penalty_prompt_tokens.clear();
         slot->sparams.use_penalty_prompt_tokens = false;
         const auto &penalty_prompt = data.find("penalty_prompt");
@@ -718,14 +722,10 @@ struct llama_server_context
                     slot->sparams.use_penalty_prompt_tokens = true;
                 }
             }
+        */
 
         slot->sparams.logit_bias.clear();
 
-        if (json_value(data, "ignore_eos", false))
-        {
-            slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
-        }
-
         const auto &logit_bias = data.find("logit_bias");
         if (logit_bias != data.end() && logit_bias->is_array())
         {
@@ -753,21 +753,21 @@ struct llama_server_context
                         llama_token tok = el[0].get<llama_token>();
                         if (tok >= 0 && tok < n_vocab)
                         {
-                            slot->sparams.logit_bias[tok] = bias;
+                            slot->sparams.logit_bias.push_back({tok, bias});
                         }
                     }
                     else if (el[0].is_string())
                     {
                         auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
                         for (auto tok : toks)
                         {
-                            slot->sparams.logit_bias[tok] = bias;
+                            slot->sparams.logit_bias.push_back({tok, bias});
                         }
                     }
                 }
             }
         }
-
+
         slot->params.antiprompt.clear();
 
         const auto &stop = data.find("stop");
@@ -781,24 +781,22 @@ struct llama_server_context
                 }
             }
         }
-
-        const auto &samplers_sequence = data.find("samplers");
-        if (samplers_sequence != data.end() && samplers_sequence->is_array())
-        {
+
+        const auto & samplers = data.find("samplers");
+        if (samplers != data.end() && samplers->is_array()) {
             std::vector<std::string> sampler_names;
-            for (const auto &sampler_name : *samplers_sequence)
-            {
-                if (sampler_name.is_string())
-                {
-                    sampler_names.emplace_back(sampler_name);
+            for (const auto & name : *samplers) {
+                if (name.is_string()) {
+                    sampler_names.emplace_back(name);
+                }
             }
-            }
-            slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
+            slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
         }
         else
         {
-            slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
+            slot->sparams.samplers = default_sparams.samplers;
         }
+
 
         if (multimodal)
         {
@@ -875,10 +873,10 @@ struct llama_server_context
 
         if (slot->ctx_sampling != nullptr)
         {
-            llama_sampling_free(slot->ctx_sampling);
+            gpt_sampler_free(slot->ctx_sampling);
         }
-        slot->ctx_sampling = llama_sampling_init(slot->sparams);
-        llama_set_rng_seed(ctx, slot->params.seed);
+        slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
+        // llama_set_rng_seed(ctx, slot->params.seed);
         slot->command = LOAD_PROMPT;
 
         all_slots_are_idle = false;
@@ -888,7 +886,7 @@ struct llama_server_context
             {"task_id", slot->task_id},
         });
 
-        LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
+        // LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
 
         return true;
     }
@@ -1006,11 +1004,13 @@ struct llama_server_context
         slot.generated_text += token_str;
         slot.has_next_token = true;
 
+        /*
         if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
         {
             // we can change penalty_prompt_tokens because it is always created from scratch each request
             slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
         }
+        */
 
         // check if there is incomplete UTF-8 character at the end
         bool incomplete = false;
@@ -1144,13 +1144,11 @@ struct llama_server_context
 
     json get_formated_generation(llama_client_slot &slot)
     {
-        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
-        const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
-                                eos_bias->second < 0.0f && std::isinf(eos_bias->second);
-        std::vector<std::string> samplers_sequence;
-        for (const auto &sampler_type : slot.sparams.samplers_sequence)
+        std::vector<std::string> samplers;
+        samplers.reserve(slot.sparams.samplers.size());
+        for (const auto & sampler : slot.sparams.samplers)
         {
-            samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
+            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
         }
 
         return json {
@@ -1165,27 +1163,25 @@ struct llama_server_context
             {"top_p", slot.sparams.top_p},
             {"min_p", slot.sparams.min_p},
             {"tfs_z", slot.sparams.tfs_z},
-            {"typical_p", slot.sparams.typical_p},
+            {"typical_p", slot.sparams.typ_p},
             {"repeat_last_n", slot.sparams.penalty_last_n},
             {"repeat_penalty", slot.sparams.penalty_repeat},
             {"presence_penalty", slot.sparams.penalty_present},
             {"frequency_penalty", slot.sparams.penalty_freq},
-            {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
-            {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
             {"mirostat", slot.sparams.mirostat},
             {"mirostat_tau", slot.sparams.mirostat_tau},
             {"mirostat_eta", slot.sparams.mirostat_eta},
             {"penalize_nl", slot.sparams.penalize_nl},
             {"stop", slot.params.antiprompt},
             {"n_predict", slot.params.n_predict},
             {"n_keep", params.n_keep},
-            {"ignore_eos", ignore_eos},
+            {"ignore_eos", slot.sparams.ignore_eos},
             {"stream", slot.params.stream},
-            {"logit_bias", slot.sparams.logit_bias},
+            // {"logit_bias", slot.sparams.logit_bias},
             {"n_probs", slot.sparams.n_probs},
             {"min_keep", slot.sparams.min_keep},
             {"grammar", slot.sparams.grammar},
-            {"samplers", samplers_sequence}
+            {"samplers", samplers}
         };
     }
 
@@ -1714,7 +1710,7 @@ struct llama_server_context
 
                     if (!slot.params.cache_prompt)
                     {
-                        llama_sampling_reset(slot.ctx_sampling);
+                        gpt_sampler_reset(slot.ctx_sampling);
 
                         slot.n_past = 0;
                         slot.n_past_se = 0;
@@ -1726,7 +1722,7 @@ struct llama_server_context
                         // push the prompt into the sampling context (do not apply grammar)
                         for (auto &token : prompt_tokens)
                         {
-                            llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
+                            gpt_sampler_accept(slot.ctx_sampling, token, false);
                         }
 
                         slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
@@ -1934,9 +1930,9 @@ struct llama_server_context
             }
 
             completion_token_output result;
-            const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
+            const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
 
-            llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
+            gpt_sampler_accept(slot.ctx_sampling, id, true);
 
             slot.n_decoded += 1;
             if (slot.n_decoded == 1)
@@ -1946,19 +1942,14 @@ struct llama_server_context
                 metrics.on_prompt_eval(slot);
             }
 
-            llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
             result.tok = id;
+            const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);
 
-            const int32_t n_probs = slot.sparams.n_probs;
-            if (slot.sparams.temp <= 0 && n_probs > 0)
-            {
-                // for llama_sample_token_greedy we need to sort candidates
-                llama_sample_softmax(ctx, &cur_p);
-            }
-
-            for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
-            {
-                result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
+            for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
+                result.probs.push_back({
+                    cur_p->data[i].id,
+                    i >= cur_p->size ? 0.0f : cur_p->data[i].p,
+                });
             }
 
             if (!process_token(result, slot))