@@ -618,6 +618,11 @@ struct llama_server_context
 
     std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
     {
+        // TODO: currently, we tokenize using special tokens by default
+        //       this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
+        //       but it's better compared to completely ignoring ChatML and other chat templates
+        const bool TMP_FORCE_SPECIAL = true;
+
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;
@@ -633,12 +638,12 @@ struct llama_server_context
                     std::vector<llama_token> p;
                     if (first)
                     {
-                        p = ::llama_tokenize(ctx, s, add_bos);
+                        p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                         first = false;
                     }
                     else
                     {
-                        p = ::llama_tokenize(ctx, s, false);
+                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                     }
                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
                 }
@@ -655,7 +660,7 @@ struct llama_server_context
         else
         {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
         }
 
         return prompt_tokens;
@@ -2235,7 +2240,7 @@ std::string format_chatml(std::vector<json> messages)
 
     for (auto it = messages.begin(); it != messages.end(); ++it) {
         chatml_msgs << "<|im_start|>"
-            << json_value(*it, "role", std::string("user")) << '\n';
+                    << json_value(*it, "role", std::string("user")) << '\n';
         chatml_msgs << json_value(*it, "content", std::string(""))
                     << "<|im_end|>\n";
     }
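
For context, the new fourth argument of `::llama_tokenize` toggles parsing of special tokens. Below is a minimal sketch, not part of this commit, of how the flag changes tokenization of a ChatML prompt; it assumes an already-initialized `llama_context` and the `common.h` helper `::llama_tokenize(ctx, text, add_bos, special)` that the patch calls.

```cpp
// Hypothetical illustration, not part of the patch: compare tokenization of a
// ChatML prompt with and without special-token parsing. Assumes `ctx` is an
// already-initialized llama_context and that common.h provides
// ::llama_tokenize(ctx, text, add_bos, special) as used in the diff above.
#include <cstdio>
#include <string>
#include <vector>

#include "common.h"

static void compare_special_tokenization(llama_context * ctx) {
    const std::string prompt = "<|im_start|>user\nHello<|im_end|>\n";

    // special = false: "<|im_start|>" / "<|im_end|>" are treated as plain text
    // and get split into several ordinary tokens.
    const std::vector<llama_token> plain   = ::llama_tokenize(ctx, prompt, true, false);

    // special = true (what TMP_FORCE_SPECIAL forces): the ChatML markers map to
    // their single special-token ids, which is what ChatML-tuned models expect.
    const std::vector<llama_token> special = ::llama_tokenize(ctx, prompt, true, true);

    printf("plain: %zu tokens, special: %zu tokens\n", plain.size(), special.size());
}
```

Without special-token parsing the ChatML markers would leak into the prompt as literal text, which is why the server forces it here despite the caveat noted in the TODO.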