feat: add llamacpp params #221
Changes from 1 commit
@@ -480,6 +480,10 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
   if (!params.use_mmap) {
     LOG_DEBUG << "Disabled mmap";
   }
+  params.n_predict = json_body->get("n_predict", -1).asInt();
+  params.prompt = json_body->get("prompt", "").asString();
+  params.conversation = json_body->get("conversation", false).asBool();
+  params.special = json_body->get("special", false).asBool();

   server_map_[model_id].caching_enabled =
       json_body->get("caching_enabled", true).asBool();
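For reference, a model-load request body that exercises the new fields might look like the sketch below; the JSON keys and their defaults are taken from the diff above, while the standalone `main` and the parsing scaffolding are illustrative assumptions rather than code from this PR.

```cpp
// Hypothetical request body for loading a model with the new llama.cpp params.
// Only the JSON keys and their defaults are taken from the diff above.
#include <json/json.h>

#include <iostream>
#include <memory>
#include <sstream>
#include <string>

int main() {
  const std::string body = R"({
    "n_predict": 128,
    "prompt": "You are a helpful assistant.",
    "conversation": true,
    "special": false,
    "caching_enabled": true
  })";

  auto json_body = std::make_shared<Json::Value>();
  Json::CharReaderBuilder builder;
  std::string errs;
  std::istringstream is(body);
  if (!Json::parseFromStream(builder, is, json_body.get(), &errs)) {
    std::cerr << "parse error: " << errs << "\n";
    return 1;
  }

  // Mirrors the defaults used in LoadModelImpl: -1 predictions, empty prompt,
  // conversation/special disabled, caching enabled.
  std::cout << json_body->get("n_predict", -1).asInt() << " "
            << json_body->get("conversation", false).asBool() << "\n";
  return 0;
}
```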
@@ -599,6 +603,24 @@ void LlamaEngine::HandleInferenceImpl(
   data["temperature"] = completion.temperature;
   data["frequency_penalty"] = completion.frequency_penalty;
   data["presence_penalty"] = completion.presence_penalty;
+  data["seed"] = completion.seed;
+  data["dynatemp_range"] = completion.dynatemp_range;
+  data["dynatemp_exponent"] = completion.dynatemp_exponent;
+  data["top_k"] = completion.top_k;
+  data["min_p"] = completion.min_p;
+  data["tfs_z"] = completion.tfs_z;
+  data["typical_p"] = completion.typ_p;
+  data["repeat_last_n"] = completion.repeat_last_n;
+  data["repeat_penalty"] = completion.penalty_repeat;
Review comment: Woah, is there a way for us to align our …
Reply: I think it's impossible because the data is a …
Reply: about the …
+  data["mirostat"] = completion.mirostat;
+  data["mirostat_tau"] = completion.mirostat_tau;
+  data["mirostat_eta"] = completion.mirostat_eta;
+  data["penalize_nl"] = completion.penalize_nl;
+  data["ignore_eos"] = completion.ignore_eos;
+  data["n_probs"] = completion.n_probs;
+  data["min_keep"] = completion.min_keep;
+  data["grammar"] = completion.grammar;
+  int n_probs = completion.n_probs;
   const Json::Value& messages = completion.messages;

   if (!si.grammar_file_content.empty()) {
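To make the field-name mapping above easier to see (for example, `completion.typ_p` is forwarded as `"typical_p"` and `completion.penalty_repeat` as `"repeat_penalty"`), here is a small, self-contained sketch of the same forwarding pattern. The `CompletionRequest` struct and its default values are hypothetical stand-ins for illustration, not the project's real type.

```cpp
// Sketch of the forwarding pattern used above: copy parsed request fields into
// the json payload handed to the llama.cpp server slot. The CompletionRequest
// struct here is a hypothetical stand-in, not the real one in the codebase.
#include <json/json.h>

#include <iostream>
#include <string>

struct CompletionRequest {       // illustrative subset with assumed defaults
  int seed = -1;
  int top_k = 40;
  float min_p = 0.05f;
  float typ_p = 1.0f;            // forwarded as "typical_p"
  float penalty_repeat = 1.1f;   // forwarded as "repeat_penalty"
  int n_probs = 0;
  std::string grammar;
};

int main() {
  CompletionRequest completion;
  completion.n_probs = 5;

  Json::Value data;
  data["seed"] = completion.seed;
  data["top_k"] = completion.top_k;
  data["min_p"] = completion.min_p;
  data["typical_p"] = completion.typ_p;             // note the name mapping
  data["repeat_penalty"] = completion.penalty_repeat;
  data["n_probs"] = completion.n_probs;
  data["grammar"] = completion.grammar;

  std::cout << data.toStyledString() << "\n";
  return 0;
}
```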
@@ -717,12 +739,17 @@ void LlamaEngine::HandleInferenceImpl(
   auto state = CreateInferenceState(si.ctx);

   // Queued task
-  si.q->runTaskInQueue([cb = std::move(callback), state, data, request_id]() {
+  si.q->runTaskInQueue([cb = std::move(callback), state, data, request_id, n_probs]() {
     state->task_id = state->llama.RequestCompletion(data, false, false, -1);
     while (state->llama.model_loaded_external) {
       TaskResult result = state->llama.NextResult(state->task_id);
       if (!result.error) {
-        std::string to_send = result.result_json["content"];
+        std::string to_send;
+        if (n_probs > 0){
Review comment: Can I verify my understanding about … We should align with the conventions in llama.cpp's server as much as possible.
Reply: Our implementation can return this form, ggml-org/llama.cpp#4088 (comment): both the content and a list of (token, prob) pairs for each token. (See the sketch after this hunk.)
+          to_send = result.result_json["completion_probabilities"].dump();
+        }else{
+          to_send = result.result_json["content"];
+        }
         // trim the leading space if it is the first token
         if (std::exchange(state->is_first_token, false)) {
           llama_utils::ltrim(to_send);
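The reply above references the response shape discussed in ggml-org/llama.cpp#4088. As a rough illustration of what the queued task serializes when `n_probs > 0`, here is a sketch built with nlohmann::json (the library used on the `llama_server_context` side); the exact field names follow that llama.cpp discussion and should be read as an assumption, not a guarantee of this PR's output.

```cpp
// Illustrative shape of a result_json carrying completion_probabilities,
// following the format discussed in ggml-org/llama.cpp#4088 (assumed, not
// verified against this PR's exact output).
#include <nlohmann/json.hpp>

#include <iostream>
#include <string>

using json = nlohmann::json;

int main() {
  // One generated token with its top alternatives.
  json token_probs;
  token_probs["content"] = " Paris";
  token_probs["probs"] = json::array({
      {{"tok_str", " Paris"}, {"prob", 0.82}},
      {{"tok_str", " the"}, {"prob", 0.07}},
  });

  json result_json;
  result_json["content"] = " Paris";
  result_json["completion_probabilities"] = json::array({token_probs});

  // What the queued task streams when n_probs > 0: the serialized
  // probabilities instead of the plain content string.
  std::string to_send = result_json["completion_probabilities"].dump();
  std::cout << to_send << "\n";
  return 0;
}
```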
@@ -1,5 +1,5 @@
 #include "llama_server_context.h"

 #include "sampling.h"
 namespace {
 const std::string base64_chars =
     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
@@ -458,6 +458,15 @@ bool LlamaServerContext::LaunchSlotWithData(LlamaClientSlot*& slot, json data) {
   slot->params.seed = json_value(data, "seed", default_params.seed);
   slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
   slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
+  slot->sparams.min_keep =
+      json_value(data, "min_keep", default_sparams.min_keep);
+  slot->sparams.seed = json_value(data, "seed", default_sparams.seed);
+  slot->sparams.dynatemp_range =
+      json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
+  slot->sparams.dynatemp_exponent =
+      json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
+  slot->sparams.ignore_eos =
+      json_value(data, "ignore_eos", default_sparams.ignore_eos);
Review comment: Can I check my understanding: …
Reply: Actually, returning n_probs happens in several places inside the codebase. With …

   // infill
   if (data.count("input_prefix") != 0) {
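For readers unfamiliar with `json_value`, it is the helper llama.cpp's server code uses to read a key with a typed fallback. Below is a minimal sketch of that pattern, assuming nlohmann::json; it is not the project's exact implementation, just the idea that a key present in the request overrides the default taken from `default_params` / `default_sparams`.

```cpp
// Minimal sketch of the json_value(data, key, default) pattern, assuming
// nlohmann::json. Not the project's exact helper: return the typed value if
// the key is present, otherwise fall back to the provided default.
#include <nlohmann/json.hpp>

#include <iostream>
#include <string>

using json = nlohmann::json;

template <typename T>
static T json_value(const json& body, const std::string& key, const T& default_value) {
  if (body.contains(key) && !body.at(key).is_null()) {
    return body.at(key).get<T>();
  }
  return default_value;
}

int main() {
  json data = {{"n_probs", 5}, {"dynatemp_range", 0.25}};

  // Keys present in the request come from the client; missing keys keep the
  // defaults (here just illustrative literals).
  int n_probs = json_value(data, "n_probs", 0);
  float dynatemp_range = json_value(data, "dynatemp_range", 0.0f);
  bool ignore_eos = json_value(data, "ignore_eos", false);

  std::cout << n_probs << " " << dynatemp_range << " " << ignore_eos << "\n";
  return 0;
}
```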
@@ -969,8 +978,13 @@ void LlamaServerContext::SendFinalResponse(LlamaClientSlot& slot) {
           slot.generated_token_probs.begin(),
           slot.generated_token_probs.begin() + slot.sent_token_probs_index);
     }
-    res.result_json["completion_probabilities"] =
-        probs_vector_to_json(ctx, probs);
+    if(!slot.params.stream ){
+      res.result_json["completion_probabilities"] =
+          probs_vector_to_json(ctx, probs);
+    }
+    else{
+      res.result_json["completion_probabilities"] = std::move(json());
+    }
   }

   if (slot.oaicompat) {
Review comment: I notice this PR defines default values twice. Are we able to define them once? DRY principle: https://en.wikipedia.org/wiki/Don%27t_repeat_yourself
Reply: I followed the previous implementation, like this one: https://github.com/janhq/cortex.llamacpp/blob/main/src/chat_completion_request.h#L8. Maybe some weird bug in the past forced us to do it. It is like the PR where I fixed the race condition: even though we checked everything in the code (mutex, only return a slot if available, ...), the error still popped up, so I had to add another check for `if (slot == null)` before the issue was resolved. We are using a third-party lib for `json`, so I think there is no harm in double-checking to make sure it works well. If it is necessary, I'll change it, but we need to test more to make sure it won't break anything.
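On the DRY question, one possible direction (purely a sketch with hypothetical names, not something this PR or the repository implements) is to keep the shared defaults in a single header that both the request-parsing code and `LaunchSlotWithData` reference:

```cpp
// sampling_defaults.h -- hypothetical single source of truth for defaults.
// Names and values are illustrative only; this sketches the DRY suggestion,
// it is not code from the PR or the repository.
#pragma once

namespace llamacpp_defaults {
inline constexpr int kNPredict = -1;
inline constexpr int kNProbs = 0;
inline constexpr int kMinKeep = 0;
inline constexpr float kDynatempRange = 0.0f;
inline constexpr float kDynatempExponent = 1.0f;
inline constexpr bool kIgnoreEos = false;
}  // namespace llamacpp_defaults

// Usage sketch: both sides read the same constants, so a changed default only
// has to be edited in one place.
//
//   // request parsing (engine side)
//   params.n_predict =
//       json_body->get("n_predict", llamacpp_defaults::kNPredict).asInt();
//
//   // slot launch (server context side)
//   slot->sparams.min_keep =
//       json_value(data, "min_keep", llamacpp_defaults::kMinKeep);
```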