@@ -2228,6 +2228,35 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
 // }
 // }
 
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+static std::string get_all_kv_cache_types() {
+    std::ostringstream msg;
+    for (const auto & type : kv_cache_types) {
+        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
 static void params_parse(const backend::ModelOptions* request,
                          common_params & params) {
 
@@ -2242,10 +2271,10 @@ static void params_parse(const backend::ModelOptions* request,
     // params.model_alias ??
     params.model_alias = request->modelfile();
     if (!request->cachetypekey().empty()) {
-        params.cache_type_k = request->cachetypekey();
+        params.cache_type_k = kv_cache_type_from_str(request->cachetypekey());
     }
     if (!request->cachetypevalue().empty()) {
-        params.cache_type_v = request->cachetypevalue();
+        params.cache_type_v = kv_cache_type_from_str(request->cachetypevalue());
     }
     params.n_ctx = request->contextsize();
     // params.memory_f16 = request->f16memory();
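
For context, a minimal standalone sketch of the lookup the new helper performs, assuming only the ggml headers are on the include path; the main() driver and the example strings "q8_0" / "q9_9" are illustrative and not part of this change:

// Sketch: resolve a user-supplied cache-type string to a ggml_type,
// mirroring kv_cache_type_from_str from the diff above.
#include "ggml.h"

#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

static const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F32,  GGML_TYPE_F16,  GGML_TYPE_BF16,
    GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
    GGML_TYPE_IQ4_NL, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
};

static ggml_type kv_cache_type_from_str(const std::string & s) {
    for (const auto & type : kv_cache_types) {
        if (ggml_type_name(type) == s) { // ggml_type_name() yields e.g. "q8_0"
            return type;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
}

int main() {
    // A valid name round-trips back to its canonical string.
    printf("resolved: %s\n", ggml_type_name(kv_cache_type_from_str("q8_0")));
    // An unknown name now throws, so the backend fails fast at load time.
    try {
        kv_cache_type_from_str("q9_9");
    } catch (const std::exception & e) {
        printf("error: %s\n", e.what());
    }
    return 0;
}

The practical effect of the params_parse hunk is that a typo in cachetypekey or cachetypevalue is rejected with a clear "Unsupported cache type" error when the model loads, instead of an unvalidated string being passed through into common_params.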