Commit bb76578

chore(deps): update llama.cpp
Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent 5139dad commit bb76578

2 files changed: +49 −58 lines changed


Makefile

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=815b1fb20a53e439882171757825bacb1350de04
+CPPLLAMA_VERSION?=daa9623ab051a8162ae750b150b9522571b55f21
 
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp

backend/cpp/llama/grpc-server.cpp

Lines changed: 48 additions & 57 deletions
@@ -17,7 +17,6 @@
 #include "common.h"
 #include "json.hpp"
 #include "llama.h"
-#include "grammar-parser.h"
 #include "backend.pb.h"
 #include "backend.grpc.pb.h"
 #include "utils.hpp"
@@ -203,8 +202,8 @@ struct llama_client_slot
     std::string stopping_word;
 
     // sampling
-    struct llama_sampling_params sparams;
-    llama_sampling_context *ctx_sampling = nullptr;
+    struct gpt_sampler_params sparams;
+    gpt_sampler *ctx_sampling = nullptr;
 
     int32_t ga_i = 0; // group-attention state
     int32_t ga_n = 1; // group-attention factor
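
For context, a minimal sketch (not part of this commit) of the per-slot sampling state after the switch: the old llama_sampling_params / llama_sampling_context pair becomes the gpt_sampler_params struct plus an opaque gpt_sampler handle from llama.cpp's common code. The header name below is an assumption about this llama.cpp revision.

```cpp
// Sketch only: per-slot sampling state under the new llama.cpp sampling API.
#include "common.h"
#include "sampling.h" // assumption: gpt_sampler / gpt_sampler_params are declared here

struct slot_sampling_state {
    struct gpt_sampler_params sparams;                // plain parameter struct, filled per request
    gpt_sampler              *ctx_sampling = nullptr; // opaque handle owned by the slot,
                                                      // created/freed via gpt_sampler_init/_free
};
```
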
@@ -619,7 +618,7 @@ struct llama_server_context
 
     bool launch_slot_with_data(llama_client_slot* &slot, json data) {
         slot_params default_params;
-        llama_sampling_params default_sparams;
+        gpt_sampler_params default_sparams;
 
         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);
@@ -628,7 +627,7 @@ struct llama_server_context
         slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
         slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
         slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
-        slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
+        slot->sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
         slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
         slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
         slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
@@ -641,7 +640,7 @@ struct llama_server_context
         slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
         slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
         slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
-        slot->params.seed = json_value(data, "seed", default_params.seed);
+        slot->sparams.seed = json_value(data, "seed", default_sparams.seed);
         slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
         slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
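
The request-to-parameters mapping above stays JSON-key compatible; only the struct fields change. Two renames are worth calling out: typical_p becomes typ_p, and the RNG seed moves from slot->params onto the sampler params. A condensed sketch (hypothetical helper, assuming the json_value() utility from utils.hpp and the same json.hpp include as grpc-server.cpp):

```cpp
// Sketch only: how a request body maps onto gpt_sampler_params after this change.
static gpt_sampler_params sampler_params_from_request(const json & data) {
    gpt_sampler_params p;                                   // defaults come from the struct itself
    p.top_p   = json_value(data, "top_p",       p.top_p);
    p.typ_p   = json_value(data, "typical_p",   p.typ_p);   // wire name unchanged, field renamed
    p.temp    = json_value(data, "temperature", p.temp);
    p.seed    = json_value(data, "seed",        p.seed);    // previously slot->params.seed
    p.grammar = json_value(data, "grammar",     p.grammar);
    return p;
}
```
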
@@ -665,6 +664,7 @@ struct llama_server_context
             slot->params.input_prefix = "";
         }
 
+
         if (data.count("input_suffix") != 0)
         {
             slot->params.input_suffix = data["input_suffix"];
@@ -683,6 +683,10 @@ struct llama_server_context
             slot->prompt = "";
         }
 
+        if (json_value(data, "ignore_eos", false)) {
+            slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
+        }
+        /*
         slot->sparams.penalty_prompt_tokens.clear();
         slot->sparams.use_penalty_prompt_tokens = false;
         const auto &penalty_prompt = data.find("penalty_prompt");
@@ -718,14 +722,10 @@ struct llama_server_context
                 slot->sparams.use_penalty_prompt_tokens = true;
             }
         }
+        */
 
         slot->sparams.logit_bias.clear();
 
-        if (json_value(data, "ignore_eos", false))
-        {
-            slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
-        }
-
         const auto &logit_bias = data.find("logit_bias");
         if (logit_bias != data.end() && logit_bias->is_array())
         {
@@ -753,21 +753,21 @@ struct llama_server_context
                     llama_token tok = el[0].get<llama_token>();
                     if (tok >= 0 && tok < n_vocab)
                     {
-                        slot->sparams.logit_bias[tok] = bias;
+                        slot->sparams.logit_bias.push_back({tok, bias});
                     }
                 }
                 else if (el[0].is_string())
                 {
                     auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
                     for (auto tok : toks)
                     {
-                        slot->sparams.logit_bias[tok] = bias;
+                        slot->sparams.logit_bias.push_back({tok, bias});
                     }
                 }
             }
         }
-
+
         slot->params.antiprompt.clear();
 
         const auto &stop = data.find("stop");
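
The hunks above reflect that logit_bias is no longer a token-indexed map: entries are appended as {token, bias} pairs, and ignore_eos is expressed by pushing a -INFINITY bias for the EOS token alongside the request's own biases. A hedged sketch (hypothetical helper; the pair-like element type of logit_bias is an assumption about this llama.cpp revision):

```cpp
// Sketch only: filling the new list-style logit_bias.
#include <cmath> // INFINITY

static void fill_logit_bias(gpt_sampler_params & sp, const llama_model * model,
                            bool ignore_eos, llama_token tok, float bias) {
    sp.logit_bias.clear();
    if (ignore_eos) {
        // suppress EOS by giving it an effectively impossible logit
        sp.logit_bias.push_back({llama_token_eos(model), -INFINITY});
    }
    sp.logit_bias.push_back({tok, bias}); // per-request biases are simply appended
}
```
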
@@ -781,24 +781,22 @@ struct llama_server_context
                 }
             }
         }
-
-        const auto &samplers_sequence = data.find("samplers");
-        if (samplers_sequence != data.end() && samplers_sequence->is_array())
-        {
+
+        const auto & samplers = data.find("samplers");
+        if (samplers != data.end() && samplers->is_array()) {
             std::vector<std::string> sampler_names;
-            for (const auto &sampler_name : *samplers_sequence)
-            {
-                if (sampler_name.is_string())
-                {
-                    sampler_names.emplace_back(sampler_name);
+            for (const auto & name : *samplers) {
+                if (name.is_string()) {
+                    sampler_names.emplace_back(name);
+                }
             }
-            }
-            slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
+            slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
         }
         else
         {
-            slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
+            slot->sparams.samplers = default_sparams.samplers;
         }
+
 
         if (multimodal)
         {
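
The sampler chain itself is now configured by name: the "samplers" array from the request is resolved with gpt_sampler_types_from_names() and stored on sparams.samplers (previously samplers_sequence / llama_sampling_types_from_names). The same lookup in isolation (hypothetical helper; the meaning of the second boolean argument is assumed to match the call above):

```cpp
// Sketch only: resolving requested sampler names to llama.cpp sampler types.
#include <string>
#include <vector>

static void set_sampler_chain(gpt_sampler_params & sp, const json & data) {
    const auto it = data.find("samplers");
    if (it != data.end() && it->is_array()) {
        std::vector<std::string> names;
        for (const auto & name : *it) {
            if (name.is_string()) {
                names.emplace_back(name);
            }
        }
        sp.samplers = gpt_sampler_types_from_names(names, false); // false mirrors the call above
    } // otherwise sp.samplers keeps its default chain
}
```
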
@@ -875,10 +873,10 @@ struct llama_server_context
 
         if (slot->ctx_sampling != nullptr)
         {
-            llama_sampling_free(slot->ctx_sampling);
+            gpt_sampler_free(slot->ctx_sampling);
         }
-        slot->ctx_sampling = llama_sampling_init(slot->sparams);
-        llama_set_rng_seed(ctx, slot->params.seed);
+        slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
+        //llama_set_rng_seed(ctx, slot->params.seed);
         slot->command = LOAD_PROMPT;
 
         all_slots_are_idle = false;
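
Per request, the slot's sampler is rebuilt: any existing handle is freed and a new one is created from the model plus the slot's sparams. The explicit llama_set_rng_seed() call is commented out, presumably because the seed now travels in gpt_sampler_params (see the "seed" hunk above). A minimal lifecycle sketch (hypothetical helper):

```cpp
// Sketch only: per-request sampler lifecycle under the new API.
static gpt_sampler * rebuild_sampler(gpt_sampler * prev, const llama_model * model,
                                     const gpt_sampler_params & sparams) {
    if (prev != nullptr) {
        gpt_sampler_free(prev);                // release the sampler from the previous request
    }
    return gpt_sampler_init(model, sparams);   // seeding is assumed to be driven by sparams.seed
}
```
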
@@ -888,7 +886,7 @@ struct llama_server_context
             {"task_id", slot->task_id},
         });
 
-        LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
+        // LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
 
         return true;
     }
@@ -1006,11 +1004,13 @@ struct llama_server_context
         slot.generated_text += token_str;
         slot.has_next_token = true;
 
+        /*
         if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
         {
             // we can change penalty_prompt_tokens because it is always created from scratch each request
             slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
         }
+        */
 
         // check if there is incomplete UTF-8 character at the end
         bool incomplete = false;
@@ -1144,13 +1144,11 @@ struct llama_server_context
 
     json get_formated_generation(llama_client_slot &slot)
     {
-        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
-        const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
-                                eos_bias->second < 0.0f && std::isinf(eos_bias->second);
-        std::vector<std::string> samplers_sequence;
-        for (const auto &sampler_type : slot.sparams.samplers_sequence)
+        std::vector<std::string> samplers;
+        samplers.reserve(slot.sparams.samplers.size());
+        for (const auto & sampler : slot.sparams.samplers)
         {
-            samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
+            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
         }
 
         return json {
@@ -1165,27 +1163,25 @@ struct llama_server_context
             {"top_p", slot.sparams.top_p},
             {"min_p", slot.sparams.min_p},
             {"tfs_z", slot.sparams.tfs_z},
-            {"typical_p", slot.sparams.typical_p},
+            {"typical_p", slot.sparams.typ_p},
             {"repeat_last_n", slot.sparams.penalty_last_n},
             {"repeat_penalty", slot.sparams.penalty_repeat},
             {"presence_penalty", slot.sparams.penalty_present},
             {"frequency_penalty", slot.sparams.penalty_freq},
-            {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
-            {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
             {"mirostat", slot.sparams.mirostat},
             {"mirostat_tau", slot.sparams.mirostat_tau},
             {"mirostat_eta", slot.sparams.mirostat_eta},
             {"penalize_nl", slot.sparams.penalize_nl},
             {"stop", slot.params.antiprompt},
             {"n_predict", slot.params.n_predict},
             {"n_keep", params.n_keep},
-            {"ignore_eos", ignore_eos},
+            {"ignore_eos", slot.sparams.ignore_eos},
            {"stream", slot.params.stream},
-            {"logit_bias", slot.sparams.logit_bias},
+            // {"logit_bias", slot.sparams.logit_bias},
             {"n_probs", slot.sparams.n_probs},
             {"min_keep", slot.sparams.min_keep},
             {"grammar", slot.sparams.grammar},
-            {"samplers", samplers_sequence}
+            {"samplers", samplers}
         };
     }
 
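
On the reporting side, get_formated_generation() now derives its fields directly from the new params: sampler types are stringified with gpt_sampler_type_to_str(), ignore_eos is a plain flag rather than being inferred from a -inf EOS bias, and the raw logit_bias list is no longer serialized. A condensed sketch (hypothetical helper, not part of the patch):

```cpp
// Sketch only: reporting sampling settings from gpt_sampler_params.
#include <string>
#include <vector>

static json format_sampling(const gpt_sampler_params & sp) {
    std::vector<std::string> samplers;
    samplers.reserve(sp.samplers.size());
    for (const auto & sampler : sp.samplers) {
        samplers.emplace_back(gpt_sampler_type_to_str(sampler));
    }
    return json {
        {"typical_p",  sp.typ_p},        // still reported under the old JSON key
        {"ignore_eos", sp.ignore_eos},   // now a first-class flag on the params
        {"samplers",   samplers},
    };
}
```
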

@@ -1714,7 +1710,7 @@ struct llama_server_context
 
         if (!slot.params.cache_prompt)
         {
-            llama_sampling_reset(slot.ctx_sampling);
+            gpt_sampler_reset(slot.ctx_sampling);
 
             slot.n_past = 0;
             slot.n_past_se = 0;
@@ -1726,7 +1722,7 @@ struct llama_server_context
             // push the prompt into the sampling context (do not apply grammar)
             for (auto &token : prompt_tokens)
             {
-                llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
+                gpt_sampler_accept(slot.ctx_sampling, token, false);
             }
 
             slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
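
When a cached prompt is reused, each prompt token is fed into the sampler so penalties see the prompt history; the final boolean stays false so the grammar is not advanced (matching the comment in the original code), and the llama_context argument is no longer needed. Sketch (hypothetical helper):

```cpp
// Sketch only: re-priming the sampler with the prompt tokens.
#include <vector>

static void prime_sampler(gpt_sampler * smpl, const std::vector<llama_token> & prompt_tokens) {
    gpt_sampler_reset(smpl);                  // start from a clean sampling state
    for (const llama_token tok : prompt_tokens) {
        gpt_sampler_accept(smpl, tok, false); // false: do not apply/advance the grammar
    }
}
```
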
@@ -1934,9 +1930,9 @@ struct llama_server_context
             }
 
             completion_token_output result;
-            const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
+            const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
 
-            llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
+            gpt_sampler_accept(slot.ctx_sampling, id, true);
 
             slot.n_decoded += 1;
             if (slot.n_decoded == 1)
@@ -1946,19 +1942,14 @@ struct llama_server_context
                 metrics.on_prompt_eval(slot);
             }
 
-            llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
             result.tok = id;
+            const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);
 
-            const int32_t n_probs = slot.sparams.n_probs;
-            if (slot.sparams.temp <= 0 && n_probs > 0)
-            {
-                // for llama_sample_token_greedy we need to sort candidates
-                llama_sample_softmax(ctx, &cur_p);
-            }
-
-            for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
-            {
-                result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
+            for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
+                result.probs.push_back({
+                    cur_p->data[i].id,
+                    i >= cur_p->size ? 0.0f : cur_p->data[i].p,
+                });
             }
 
             if (!process_token(result, slot))
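
The per-token path becomes: sample with gpt_sampler_sample(), accept the chosen token (this time letting the grammar advance), then read the candidate list through gpt_sampler_get_candidates() to fill the reported top-n probabilities; indices past cur_p->size are reported with probability 0.0f. A sketch that mirrors the hunk above (hypothetical helper; completion_token_output is the struct already used by grpc-server.cpp):

```cpp
// Sketch only: sampling one token and collecting n_probs candidate probabilities.
static completion_token_output sample_token(gpt_sampler * smpl, llama_context * ctx,
                                            int idx, int32_t n_probs) {
    completion_token_output result;

    const llama_token id = gpt_sampler_sample(smpl, ctx, idx); // idx = slot.i_batch - i above
    gpt_sampler_accept(smpl, id, true);                        // true: grammar is advanced here

    result.tok = id;
    const auto * cur_p = gpt_sampler_get_candidates(smpl);     // candidate array after sampling

    for (size_t i = 0; i < (size_t) n_probs; ++i) {
        result.probs.push_back({
            cur_p->data[i].id,
            i >= cur_p->size ? 0.0f : cur_p->data[i].p,        // probability reported as 0 past the array
        });
    }
    return result;
}
```
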
