@@ -618,6 +618,11 @@ struct llama_server_context
 
     std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
     {
+        // TODO: currently, we tokenize using special tokens by default
+        //       this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
+        //       but it's better compared to completely ignoring ChatML and other chat templates
+        const bool TMP_FORCE_SPECIAL = true;
+
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;
@@ -633,12 +638,12 @@ struct llama_server_context
                     std::vector<llama_token> p;
                     if (first)
                     {
-                        p = ::llama_tokenize(ctx, s, add_bos);
+                        p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                         first = false;
                     }
                     else
                     {
-                        p = ::llama_tokenize(ctx, s, false);
+                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                     }
                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
                 }
@@ -655,7 +660,7 @@ struct llama_server_context
         else
         {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
         }
 
         return prompt_tokens;
@@ -2235,7 +2240,7 @@ std::string format_chatml(std::vector<json> messages)
 
     for (auto it = messages.begin(); it != messages.end(); ++it) {
         chatml_msgs << "<|im_start|>"
-            << json_value(*it, "role", std::string("user")) << '\n';
+                    << json_value(*it, "role", std::string("user")) << '\n';
         chatml_msgs << json_value(*it, "content", std::string(""))
                     << "<|im_end|>\n";
     }
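
For context, the new fourth argument of `::llama_tokenize` toggles parsing of special tokens. Below is a minimal sketch, not part of this commit, of how the flag changes tokenization of a ChatML prompt; it assumes an already-initialized `llama_context` and the `common.h` helper `::llama_tokenize(ctx, text, add_bos, special)` that the patch calls.

```cpp
// Hypothetical illustration, not part of the patch: compare tokenization of a
// ChatML prompt with and without special-token parsing. Assumes `ctx` is an
// already-initialized llama_context and that common.h provides
// ::llama_tokenize(ctx, text, add_bos, special) as used in the diff above.
#include <cstdio>
#include <string>
#include <vector>

#include "common.h"

static void compare_special_tokenization(llama_context * ctx) {
    const std::string prompt = "<|im_start|>user\nHello<|im_end|>\n";

    // special = false: "<|im_start|>" / "<|im_end|>" are treated as plain text
    // and get split into several ordinary tokens.
    const std::vector<llama_token> plain   = ::llama_tokenize(ctx, prompt, true, false);

    // special = true (what TMP_FORCE_SPECIAL forces): the ChatML markers map to
    // their single special-token ids, which is what ChatML-tuned models expect.
    const std::vector<llama_token> special = ::llama_tokenize(ctx, prompt, true, true);

    printf("plain: %zu tokens, special: %zu tokens\n", plain.size(), special.size());
}
```

Without special-token parsing the ChatML markers would leak into the prompt as literal text, which is why the server forces it here despite the caveat noted in the TODO.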