Skip to content

Add DRY and fix the server to use other new samplers. #504

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 25 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
4920f2f
Skeleton
Jun 6, 2025
1a275ed
Non working
Jun 7, 2025
ce0b5f0
Well this is something.. it compiles
Jun 7, 2025
e8a1152
Debug and fixes
Jun 7, 2025
6d5c0a2
Cordon off dry debug
Jun 7, 2025
bac93da
Add new samplers to server, xtc or sigma did nothing before either
Jun 7, 2025
e05d780
Merge branch 'main' of https://github.com/ikawrakow/ik_llama.cpp
Jun 7, 2025
4edd4bc
Comments and attributions (approx)
Ph0rk0z Jun 8, 2025
cad53af
Merge branch 'main' into main
Ph0rk0z Jun 8, 2025
ed1388b
Doubled float_max.
Ph0rk0z Jun 8, 2025
401b88e
Remove LLM jank.
Ph0rk0z Jun 8, 2025
b5d7da3
Merge branch 'main' of https://github.com/Ph0rk0z/ik_llama.cpp
Jun 8, 2025
32dab5e
Take out ring buffer
Ph0rk0z Jun 8, 2025
758a92b
Merge branch 'main' of https://github.com/Ph0rk0z/ik_llama.cpp
Jun 8, 2025
9e70498
the impl function only does Z-algorithm, while the main function han…
Jun 8, 2025
ad4c815
Merge branch 'main' of https://github.com/ikawrakow/ik_llama.cpp
Jun 9, 2025
98f1eb2
Merge branch 'main' of https://github.com/ikawrakow/ik_llama.cpp
Jun 10, 2025
89cf632
Typos
Jun 10, 2025
b47fddc
Fix buffer ambiguity
Jun 10, 2025
d3da7a7
Merge branch 'main' of https://github.com/ikawrakow/ik_llama.cpp
Jun 11, 2025
c25be41
Merge branch 'main' of https://github.com/ikawrakow/ik_llama.cpp
Jun 11, 2025
6741c08
Merge branch 'main' into main
Ph0rk0z Jun 12, 2025
8e737ba
Merge branch 'main' of https://github.com/ikawrakow/ik_llama.cpp
Jun 13, 2025
486d4f3
Double declaration
Jun 13, 2025
ff76ae0
Merge branch 'main' of https://github.com/Ph0rk0z/ik_llama.cpp
Jun 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 57 additions & 4 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,46 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
sparams.top_n_sigma = std::stof(argv[i]);
return true;
}
// --- DRY sampler CLI flags (defaults live in common/sampling.h) ---
if (arg == "--dry-multiplier") {
CHECK_ARG
// 0.0 disables the DRY sampler entirely.
sparams.dry_multiplier = std::stof(argv[i]);
return true;
}
if (arg == "--dry-base") {
CHECK_ARG
float potential_base = std::stof(argv[i]);
// Bases below 1.0 would make the penalty shrink as repetitions grow, so
// such values are silently ignored and the previous/default base is kept.
if (potential_base >= 1.0f) {
sparams.dry_base = potential_base;
}
return true;
}
if (arg == "--dry-allowed-length") {
CHECK_ARG
sparams.dry_allowed_length = std::stoi(argv[i]);
return true;
}
if (arg == "--dry-penalty-last-n") {
CHECK_ARG
// 0 = disable the penalty window, -1 = scan the whole context.
sparams.dry_penalty_last_n = std::stoi(argv[i]);
return true;
}
if (arg == "--dry-sequence-breaker") {
CHECK_ARG
// The flag may be passed multiple times, once per breaker string. The
// first occurrence replaces the built-in defaults; the function-local
// static ensures that clearing happens only once per process, which is
// fine for one-shot command-line parsing.
static bool defaults_cleared = false;

if (!defaults_cleared) {
sparams.dry_sequence_breakers.clear();
defaults_cleared = true;
}

// The literal value "none" removes all breakers, including any supplied
// by earlier occurrences of this flag.
if (std::string(argv[i]) == "none") {
sparams.dry_sequence_breakers.clear();
} else {
sparams.dry_sequence_breakers.emplace_back(argv[i]);
}
return true;
// TODO: validate input (e.g. reject empty breaker strings)
}
if (arg == "--cfg-negative-prompt") {
CHECK_ARG
sparams.cfg_negative_prompt = argv[i];
Expand Down Expand Up @@ -1668,12 +1708,21 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
// Sampler-related usage entries: (group, flag, help format string, printf-style args).
// Each help string's format specifiers must line up with the trailing arguments of
// the SAME entry — the DRY entries previously had their arguments shifted by one.
options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
options.push_back({ "*", " --xtc-probability p", "xtc probability (default: %.1f, 0.0 = disabled)", (double)sparams.xtc_probability });
options.push_back({ "*", " --xtc-threshold t", "xtc threshold (default: %.1f, >0.5 = disabled)", (double)sparams.xtc_threshold });
options.push_back({ "*", " --top-n-sigma t", "top-n-sigma parameter (default: %.1f, 0.0 = disabled)", (double)sparams.top_n_sigma });
options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
"i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
"or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
options.push_back({ "*", " --dry-multiplier t", "DRY repetition penalty multiplier (default: %.1f, 0.0 = disabled)", (double)sparams.dry_multiplier });
options.push_back({ "*", " --dry-base t", "DRY base (default: %.2f, must be >= 1.0)", (double)sparams.dry_base });
options.push_back({ "*", " --dry-allowed-length N", "DRY allowed repetition length (default: %d)", sparams.dry_allowed_length });
options.push_back({ "*", " --dry-penalty-last-n N", "DRY penalty range (default: %d, 0 = disable, -1 = context size)", sparams.dry_penalty_last_n });

options.push_back({ "main", " --cfg-negative-prompt PROMPT",
"negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() });
options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
"negative prompt file to use for guidance" });
Expand Down Expand Up @@ -3434,6 +3483,10 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
fprintf(stream, "xtc_threshold: %f # default: 0.0\n", sparams.xtc_threshold);
fprintf(stream, "top_n_sigma: %f # default: 0.0\n", sparams.top_n_sigma);
// DRY sampler settings mirrored into the YAML dump (-1 penalty range = context size).
fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
Expand Down
57 changes: 37 additions & 20 deletions common/sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,15 +118,17 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama
std::string llama_sampling_print(const llama_sampling_params & params) {
char result[1024];

snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f\n"
"\txtc_probability = %.3f, xtc_threshold = %.3f, top_n_sigma = %.3f",
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
params.mirostat, params.mirostat_eta, params.mirostat_tau,
params.xtc_probability, params.xtc_threshold, params.top_n_sigma);
snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f\n"
"\txtc_probability = %.3f, xtc_threshold = %.3f, top_n_sigma = %.3f\n"
"\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d",
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
params.mirostat, params.mirostat_eta, params.mirostat_tau,
params.xtc_probability, params.xtc_threshold, params.top_n_sigma,
params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n);

return std::string(result);
}
Expand Down Expand Up @@ -157,6 +159,7 @@ std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
case llama_sampler_type::TEMPERATURE: return "temperature";
case llama_sampler_type::XTC : return "xtc";
case llama_sampler_type::TOP_N_SIGMA: return "top_n_sigma";
case llama_sampler_type::DRY : return "dry";
default : return "";
}
}
Expand All @@ -170,6 +173,7 @@ std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vecto
{"tfs_z", llama_sampler_type::TFS_Z},
{"xtc", llama_sampler_type::XTC},
{"top_n_sigma", llama_sampler_type::TOP_N_SIGMA},
{"dry", llama_sampler_type::DRY},
{"temperature", llama_sampler_type::TEMPERATURE}
};

Expand All @@ -186,6 +190,7 @@ std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vecto
{"tfs", llama_sampler_type::TFS_Z},
{"xtc", llama_sampler_type::XTC},
{"top-n-sigma", llama_sampler_type::TOP_N_SIGMA},
{"dry", llama_sampler_type::DRY},
{"temp", llama_sampler_type::TEMPERATURE}
};

Expand Down Expand Up @@ -222,6 +227,7 @@ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::strin
{'f', llama_sampler_type::TFS_Z},
{'x', llama_sampler_type::XTC},
{'n', llama_sampler_type::TOP_N_SIGMA},
{'d', llama_sampler_type::DRY},
{'t', llama_sampler_type::TEMPERATURE}
};

Expand All @@ -242,17 +248,22 @@ static void sampler_queue(
const llama_sampling_params & params,
llama_token_data_array & cur_p,
size_t min_keep) {
const float temp = params.temp;
const float dynatemp_range = params.dynatemp_range;
const float dynatemp_exponent = params.dynatemp_exponent;
const int32_t top_k = params.top_k;
const float top_p = params.top_p;
const float min_p = params.min_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const float xtc_probability = params.xtc_probability;
const float xtc_threshold = params.xtc_threshold;
const float top_n_sigma = params.top_n_sigma;
const float temp = params.temp;
const float dynatemp_range = params.dynatemp_range;
const float dynatemp_exponent = params.dynatemp_exponent;
const int32_t top_k = params.top_k;
const float top_p = params.top_p;
const float min_p = params.min_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const float xtc_probability = params.xtc_probability;
const float xtc_threshold = params.xtc_threshold;
const float top_n_sigma = params.top_n_sigma;
const float dry_multiplier = params.dry_multiplier;
const float dry_base = params.dry_base;
const int32_t dry_allowed_length = params.dry_allowed_length;
const int32_t dry_penalty_last_n = params.dry_penalty_last_n;

const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;

for (auto sampler_type : samplers_sequence) {
Expand All @@ -263,6 +274,10 @@ static void sampler_queue(
case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
case llama_sampler_type::XTC : llama_sample_xtc (ctx_main, &cur_p, xtc_probability, xtc_threshold, min_keep); break;
case llama_sampler_type::DRY : llama_sample_dry (ctx_main, &cur_p, dry_multiplier, dry_base,
dry_allowed_length, dry_penalty_last_n,
params.dry_sequence_breakers);
break;
case llama_sampler_type::TOP_N_SIGMA: llama_sample_top_n_sigma(ctx_main, &cur_p, top_n_sigma); break;
case llama_sampler_type::TEMPERATURE:
if (dynatemp_range > 0) {
Expand Down Expand Up @@ -469,6 +484,8 @@ void llama_sampling_accept(
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
ctx_sampling->prev.push_back(id);

llama_sample_dry_accept_token(ctx_main, id);

if (ctx_sampling->grammar != NULL && apply_grammar) {
llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
}
Expand Down
8 changes: 7 additions & 1 deletion common/sampling.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@

// sampler types
// Sampler identifiers; the underlying char doubles as the one-letter CLI shorthand
// accepted by llama_sampling_types_from_chars. Fix: the DRY enumerator was declared
// twice (once at the top and once after TOP_N_SIGMA), which is a compile error.
enum class llama_sampler_type : char {
    DRY         = 'd',
    TOP_K       = 'k',
    TOP_P       = 'p',
    MIN_P       = 'm',
    TFS_Z       = 'f',
    XTC         = 'x',
    TOP_N_SIGMA = 'n',
    TYPICAL_P   = 'y',
    TEMPERATURE = 't'
};
Expand Down Expand Up @@ -45,6 +45,10 @@ typedef struct llama_sampling_params {
float xtc_probability = 0.0f; // xtc probability
float xtc_threshold = 1.0f; // xtc threshold, disabled if > 0.5
float top_n_sigma = 0.0f; // top-n-sigma
float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
bool penalize_nl = false; // consider newlines as a repeatable token
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context

Expand All @@ -58,6 +62,8 @@ typedef struct llama_sampling_params {
llama_sampler_type::TEMPERATURE
};

std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY

std::string grammar; // optional BNF-like grammar to constrain sampling

// Classifier-Free Guidance
Expand Down
Loading