How exactly to set up the end-of-generation while directly using llama.dll API #14409

s7023369667 · 2025-06-27T03:55:56Z

s7023369667
Jun 27, 2025

I am trying to build a simple chat console application on .Net Framework.
I follow the sample code in llama.cpp -> examples -> simple and llama.cpp -> examples -> simple-chat to build.
Everything work great, but the problem is the model generation never stop generating until exceed the max token size. Model loading log showing the EOG maybe not setup right!

When I try to use the API llama_vocab_is_eog, it return true all the time !

if (llama_vocab_is_eog(vocab, nextToken))
{
    break;
}

Code:

static int Main(string[] args)
 {
     string model_path = string.Empty;
     // prompt to generate text from
     string prompt = "Hello my name is";
     // number of layers to offload to the GPU
     int ngl = 99;
     // number of tokens to predict
     int n_predict = 32;

     int i = 0;
     while (i < args.Length)
     {
         if (args[i] == "-m")
         {
             if (i + 1 < args.Length)
             {
                 model_path = args[++i];
             }
             else
             {
                 return 1;
             }
         }
         else if (args[i] == "-n")
         {
             if (i + 1 < args.Length && int.TryParse(args[++i], out n_predict)) { }
             else
             {
                 return 1;
             }
         }
         else if (args[i] == "-ngl")
         {
             if (i + 1 < args.Length && int.TryParse(args[++i], out ngl)) { }
             else
             {
                 return 1;
             }
         }
         else
         {
             // prompt starts here
             break;
         }
         i++;
     }

     if (string.IsNullOrWhiteSpace(model_path))
     {
         return 1;
     }

     if (i < args.Length)
     {
         prompt = args[i++];
         while (i < args.Length)
         {
             prompt += " " + args[i++];
         }
     }

     // Output results (for testing)
     Console.WriteLine($"Model Path: {model_path}");
     Console.WriteLine($"n_predict: {n_predict}");
     Console.WriteLine($"ngl: {ngl}");
     Console.WriteLine($"Prompt: {prompt}");

     ggml_backend_load_all();
     llama_model_params model_params = llama_model_default_params();
     model_params.n_gpu_layers = ngl;

     IntPtr model = llama_model_load_from_file(model_path, model_params);
     if (model == IntPtr.Zero)
     {
         throw new Exception("Failed to load model - check file path and format");
     }

     IntPtr vocab = llama_model_get_vocab(model);

     // find the number of tokens in the prompt
     byte[] promptBytes = Encoding.UTF8.GetBytes(prompt);

     int n_prompt = -llama_tokenize(vocab, promptBytes, promptBytes.Length, null, 0, 1, 1);
     int[] prompt_tokens = new int[n_prompt];
     if (llama_tokenize(vocab, promptBytes, promptBytes.Length, prompt_tokens, prompt_tokens.Length, 1, 1) < 0)
     {
         Console.WriteLine("Error: failed to tokenize the prompt");
     }

     llama_context_params ctx_params = llama_context_default_params();
     ctx_params.n_ctx = n_prompt + n_predict - 1;
     ctx_params.n_batch = n_prompt;
     ctx_params.no_perf = (byte)0;


     IntPtr ctx = llama_init_from_model(model, ctx_params);
     if (ctx == IntPtr.Zero)
     {
         throw new Exception("Failed to create context");
     }

     var sparams = new llama_sampler_chain_params();
     sparams.no_perf = (byte)0;
     IntPtr smpl = llama_sampler_chain_init(sparams);
     llama_sampler sampler_init = new llama_sampler();
     llama_sampler_chain_add(smpl, llama_sampler_init_dist(GetRngSeed(sampler_init.seed)));

     // print the prompt token - by - token
     foreach (int id in prompt_tokens)
     {
         byte[] buffer = new byte[128];
         int n = llama_token_to_piece(vocab, id, buffer, buffer.Length, 0, 1);
         if (n < 0)
         {
             Console.WriteLine("error: failed to convert token to piece");
             return 1;
         }
         string s = Encoding.UTF8.GetString(buffer, 0, n);
         Console.WriteLine($"{s}");
     }

     // prepare a batch for the prompt
     llama_batch batch = llama_batch_get_one(prompt_tokens, prompt_tokens.Length);
     //int t_main_start = ggml_time_us();
     int n_decode = 0;
     int new_token_id;

     int eosToken = llama_vocab_eos(vocab);
     int eotToken = llama_vocab_eot(vocab);

     for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict;)
     {
         int decodeResult = llama_decode(ctx, batch);
         if (decodeResult != 0)
         {
             Console.WriteLine($"failed to eval, return code {decodeResult}");
             return 1;
         }

         n_pos += batch.n_tokens;

         new_token_id = llama_sampler_sample(smpl, ctx, -1);

         if (new_token_id == eosToken || new_token_id == eotToken)
         {
             Console.WriteLine("End of sequence token encountered.");
             break;
         }

         byte[] buffer = new byte[128];
         int n = llama_token_to_piece(vocab, new_token_id, buffer, buffer.Length, 0, 1);
         if (n < 0)
         {
             Console.WriteLine("error: failed to convert token to piece");
             return 1;
         }
         string s = Encoding.UTF8.GetString(buffer, 0, n);
         Console.Write($"{s} ");

         int[] nextTokenIdArr = new int[] { new_token_id };
         batch = llama_batch_get_one(nextTokenIdArr, 1);

         n_decode += 1;
     }


     //var t_main_end = ggml_time_us();

     //Console.WriteLine($"decoded {n_decode} tokens in{(t_main_end - t_main_start) / 1000000.0f} s, speed: {n_decode / ((t_main_end - t_main_start) / 1000000.0f)} t/s");

     Console.WriteLine();
     llama_perf_sampler_print(smpl);
     llama_perf_context_print(ctx);
     Console.WriteLine();

     llama_sampler_free(smpl);
     llama_free(ctx);
     llama_model_free(model);
     return 0;

 }

Model Loading Log Output:

ngl: 20
Prompt: Hello my name is
ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3050 Ti Laptop GPU, compute capability 8.6, VMM: yes
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3050 Ti Laptop GPU) - 3289 MiB free
llama_model_loader: loaded meta data with 37 key-value pairs and 399 tensors from C:\Users\posheng_cheng\.asusstudio\DeepSeek-R1-0528-Qwen3-8B-IQ4_NL\DeepSeek-R1-0528-Qwen3-8B-IQ4_NL.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen3
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Deepseek-R1-0528-Qwen3-8B
llama_model_loader: - kv   3:                           general.basename str              = Deepseek-R1-0528-Qwen3-8B
llama_model_loader: - kv   4:                       general.quantized_by str              = Unsloth
llama_model_loader: - kv   5:                         general.size_label str              = 8B
llama_model_loader: - kv   6:                           general.repo_url str              = https://huggingface.co/unsloth
llama_model_loader: - kv   7:                          qwen3.block_count u32              = 36
llama_model_loader: - kv   8:                       qwen3.context_length u32              = 131072
llama_model_loader: - kv   9:                     qwen3.embedding_length u32              = 4096
llama_model_loader: - kv  10:                  qwen3.feed_forward_length u32              = 12288
llama_model_loader: - kv  11:                 qwen3.attention.head_count u32              = 32
llama_model_loader: - kv  12:              qwen3.attention.head_count_kv u32              = 8
llama_model_loader: - kv  13:                       qwen3.rope.freq_base f32              = 1000000.000000
llama_model_loader: - kv  14:     qwen3.attention.layer_norm_rms_epsilon f32              = 0.000001
llama_model_loader: - kv  15:                 qwen3.attention.key_length u32              = 128
llama_model_loader: - kv  16:               qwen3.attention.value_length u32              = 128
llama_model_loader: - kv  17:                    qwen3.rope.scaling.type str              = yarn
llama_model_loader: - kv  18:                  qwen3.rope.scaling.factor f32              = 4.000000
llama_model_loader: - kv  19: qwen3.rope.scaling.original_context_length u32              = 32768
llama_model_loader: - kv  20:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  21:                         tokenizer.ggml.pre str              = qwen2
llama_model_loader: - kv  22:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  23:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  24:                      tokenizer.ggml.merges arr[str,151387]  = ["? ?", "?? ??", "i n", "? t",...
llama_model_loader: - kv  25:                tokenizer.ggml.bos_token_id u32              = 151643
llama_model_loader: - kv  26:                tokenizer.ggml.eos_token_id u32              = 151645
llama_model_loader: - kv  27:            tokenizer.ggml.padding_token_id u32              = 151654
llama_model_loader: - kv  28:               tokenizer.ggml.add_bos_token bool             = false
llama_model_loader: - kv  29:               tokenizer.ggml.add_eos_token bool             = false
llama_model_loader: - kv  30:                    tokenizer.chat_template str              = {%- if not add_generation_prompt is d...
llama_model_loader: - kv  31:               general.quantization_version u32              = 2
llama_model_loader: - kv  32:                          general.file_type u32              = 25
llama_model_loader: - kv  33:                      quantize.imatrix.file str              = DeepSeek-R1-0528-Qwen3-8B-GGUF/imatri...
llama_model_loader: - kv  34:                   quantize.imatrix.dataset str              = unsloth_calibration_DeepSeek-R1-0528-...
llama_model_loader: - kv  35:             quantize.imatrix.entries_count u32              = 252
llama_model_loader: - kv  36:              quantize.imatrix.chunks_count u32              = 713
llama_model_loader: - type  f32:  145 tensors
llama_model_loader: - type q4_K:    1 tensors
llama_model_loader: - type q5_K:   36 tensors
llama_model_loader: - type q6_K:    1 tensors
llama_model_loader: - type iq4_nl:  216 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type   = IQ4_NL - 4.5 bpw
print_info: file size   = 4.46 GiB (4.68 BPW)
init_tokenizer: initializing tokenizer for type 2
load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
load: control token: 151649 '<|box_end|>' is not marked as EOG
load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
load: control token: 151654 '<|vision_pad|>' is not marked as EOG
load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
load: control token: 151643 '<嚚egin?f?entence嚚?' is not marked as EOG
load: control token: 151648 '<|box_start|>' is not marked as EOG
load: control token: 151644 '<|im_start|>' is not marked as EOG
load: control token: 151645 '<嚚nd?f?entence嚚?' is not marked as EOG
load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
load: control token: 151650 '<|quad_start|>' is not marked as EOG
load: control token: 151651 '<|quad_end|>' is not marked as EOG
load: control token: 151652 '<|vision_start|>' is not marked as EOG
load: control token: 151653 '<|vision_end|>' is not marked as EOG
load: control token: 151655 '<|image_pad|>' is not marked as EOG
load: control token: 151656 '<|video_pad|>' is not marked as EOG
load: control token: 151660 '<|fim_middle|>' is not marked as EOG
load: control token: 151669 '<嚚ser嚚?' is not marked as EOG
load: control token: 151670 '<嚚ssistant嚚?' is not marked as EOG
load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
load: special tokens cache size = 28
load: token to piece cache size = 0.9311 MB
print_info: arch             = qwen3
print_info: vocab_only       = 0
print_info: n_ctx_train      = 131072
print_info: n_embd           = 4096
print_info: n_layer          = 36
print_info: n_head           = 32
print_info: n_head_kv        = 8
print_info: n_rot            = 128
print_info: n_swa            = 0
print_info: n_swa_pattern    = 1
print_info: n_embd_head_k    = 128
print_info: n_embd_head_v    = 128
print_info: n_gqa            = 4
print_info: n_embd_k_gqa     = 1024
print_info: n_embd_v_gqa     = 1024
print_info: f_norm_eps       = 0.0e+00
print_info: f_norm_rms_eps   = 1.0e-06
print_info: f_clamp_kqv      = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale    = 0.0e+00
print_info: f_attn_scale     = 0.0e+00
print_info: n_ff             = 12288
print_info: n_expert         = 0
print_info: n_expert_used    = 0
print_info: causal attn      = 1
print_info: pooling type     = 0
print_info: rope type        = 2
print_info: rope scaling     = yarn
print_info: freq_base_train  = 1000000.0
print_info: freq_scale_train = 0.25
print_info: n_ctx_orig_yarn  = 32768
print_info: rope_finetuned   = unknown
print_info: ssm_d_conv       = 0
print_info: ssm_d_inner      = 0
print_info: ssm_d_state      = 0
print_info: ssm_dt_rank      = 0
print_info: ssm_dt_b_c_rms   = 0
print_info: model type       = 8B
print_info: model params     = 8.19 B
print_info: general.name     = Deepseek-R1-0528-Qwen3-8B
print_info: vocab type       = BPE
print_info: n_vocab          = 151936
print_info: n_merges         = 151387
print_info: BOS token        = 151643 '<嚚egin?f?entence嚚?'
print_info: EOS token        = 151645 '<嚚nd?f?entence嚚?'
print_info: EOT token        = 151645 '<嚚nd?f?entence嚚?'
print_info: PAD token        = 151654 '<|vision_pad|>'
print_info: LF token         = 198 '?'
print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
print_info: FIM MID token    = 151660 '<|fim_middle|>'
print_info: FIM PAD token    = 151662 '<|fim_pad|>'
print_info: FIM REP token    = 151663 '<|repo_name|>'
print_info: FIM SEP token    = 151664 '<|file_sep|>'
print_info: EOG token        = 151645 '<嚚nd?f?entence嚚?'
print_info: EOG token        = 151662 '<|fim_pad|>'
print_info: EOG token        = 151663 '<|repo_name|>'
print_info: EOG token        = 151664 '<|file_sep|>'
print_info: max token length = 256
load_tensors: loading model tensors, this can take a while... (mmap = true)
load_tensors: layer   0 assigned to device CPU, is_swa = 0
load_tensors: layer   1 assigned to device CPU, is_swa = 0
load_tensors: layer   2 assigned to device CPU, is_swa = 0
load_tensors: layer   3 assigned to device CPU, is_swa = 0
load_tensors: layer   4 assigned to device CPU, is_swa = 0
load_tensors: layer   5 assigned to device CPU, is_swa = 0
load_tensors: layer   6 assigned to device CPU, is_swa = 0
load_tensors: layer   7 assigned to device CPU, is_swa = 0
load_tensors: layer   8 assigned to device CPU, is_swa = 0
load_tensors: layer   9 assigned to device CPU, is_swa = 0
load_tensors: layer  10 assigned to device CPU, is_swa = 0
load_tensors: layer  11 assigned to device CPU, is_swa = 0
load_tensors: layer  12 assigned to device CPU, is_swa = 0
load_tensors: layer  13 assigned to device CPU, is_swa = 0
load_tensors: layer  14 assigned to device CPU, is_swa = 0
load_tensors: layer  15 assigned to device CPU, is_swa = 0
load_tensors: layer  16 assigned to device CUDA0, is_swa = 0
load_tensors: layer  17 assigned to device CUDA0, is_swa = 0
load_tensors: layer  18 assigned to device CUDA0, is_swa = 0
load_tensors: layer  19 assigned to device CUDA0, is_swa = 0
load_tensors: layer  20 assigned to device CUDA0, is_swa = 0
load_tensors: layer  21 assigned to device CUDA0, is_swa = 0
load_tensors: layer  22 assigned to device CUDA0, is_swa = 0
load_tensors: layer  23 assigned to device CUDA0, is_swa = 0
load_tensors: layer  24 assigned to device CUDA0, is_swa = 0
load_tensors: layer  25 assigned to device CUDA0, is_swa = 0
load_tensors: layer  26 assigned to device CUDA0, is_swa = 0
load_tensors: layer  27 assigned to device CUDA0, is_swa = 0
load_tensors: layer  28 assigned to device CUDA0, is_swa = 0
load_tensors: layer  29 assigned to device CUDA0, is_swa = 0
load_tensors: layer  30 assigned to device CUDA0, is_swa = 0
load_tensors: layer  31 assigned to device CUDA0, is_swa = 0
load_tensors: layer  32 assigned to device CUDA0, is_swa = 0
load_tensors: layer  33 assigned to device CUDA0, is_swa = 0
load_tensors: layer  34 assigned to device CUDA0, is_swa = 0
load_tensors: layer  35 assigned to device CUDA0, is_swa = 0
load_tensors: layer  36 assigned to device CPU, is_swa = 0
load_tensors: tensor 'token_embd.weight' (q4_K) (and 178 others) cannot be used with preferred buffer type CUDA_Host, using CPU instead
load_tensors: offloading 20 repeating layers to GPU
load_tensors: offloaded 20/37 layers to GPU
load_tensors:        CUDA0 model buffer size =  2080.64 MiB
load_tensors:   CPU_Mapped model buffer size =  2485.23 MiB
....................................................................................
llama_context: constructing llama_context
llama_context: n_batch is less than GGML_KQ_MASK_PAD - increasing to 64
llama_context: n_seq_max     = 1
llama_context: n_ctx         = 4099
llama_context: n_ctx_per_seq = 4099
llama_context: n_batch       = 64
llama_context: n_ubatch      = 64
llama_context: causal_attn   = 1
llama_context: flash_attn    = 0
llama_context: freq_base     = 1000000.0
llama_context: freq_scale    = 0.25
llama_context: n_ctx_per_seq (4099) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
set_abort_callback: call
llama_context:        CPU  output buffer size =     0.58 MiB
create_memory: n_ctx = 4128 (padded)
llama_kv_cache_unified: layer   0: dev = CPU
llama_kv_cache_unified: layer   1: dev = CPU
llama_kv_cache_unified: layer   2: dev = CPU
llama_kv_cache_unified: layer   3: dev = CPU
llama_kv_cache_unified: layer   4: dev = CPU
llama_kv_cache_unified: layer   5: dev = CPU
llama_kv_cache_unified: layer   6: dev = CPU
llama_kv_cache_unified: layer   7: dev = CPU
llama_kv_cache_unified: layer   8: dev = CPU
llama_kv_cache_unified: layer   9: dev = CPU
llama_kv_cache_unified: layer  10: dev = CPU
llama_kv_cache_unified: layer  11: dev = CPU
llama_kv_cache_unified: layer  12: dev = CPU
llama_kv_cache_unified: layer  13: dev = CPU
llama_kv_cache_unified: layer  14: dev = CPU
llama_kv_cache_unified: layer  15: dev = CPU
llama_kv_cache_unified: layer  16: dev = CUDA0
llama_kv_cache_unified: layer  17: dev = CUDA0
llama_kv_cache_unified: layer  18: dev = CUDA0
llama_kv_cache_unified: layer  19: dev = CUDA0
llama_kv_cache_unified: layer  20: dev = CUDA0
llama_kv_cache_unified: layer  21: dev = CUDA0
llama_kv_cache_unified: layer  22: dev = CUDA0
llama_kv_cache_unified: layer  23: dev = CUDA0
llama_kv_cache_unified: layer  24: dev = CUDA0
llama_kv_cache_unified: layer  25: dev = CUDA0
llama_kv_cache_unified: layer  26: dev = CUDA0
llama_kv_cache_unified: layer  27: dev = CUDA0
llama_kv_cache_unified: layer  28: dev = CUDA0
llama_kv_cache_unified: layer  29: dev = CUDA0
llama_kv_cache_unified: layer  30: dev = CUDA0
llama_kv_cache_unified: layer  31: dev = CUDA0
llama_kv_cache_unified: layer  32: dev = CUDA0
llama_kv_cache_unified: layer  33: dev = CUDA0
llama_kv_cache_unified: layer  34: dev = CUDA0
llama_kv_cache_unified: layer  35: dev = CUDA0
llama_kv_cache_unified:      CUDA0 KV buffer size =   322.50 MiB
llama_kv_cache_unified:        CPU KV buffer size =   258.00 MiB
llama_kv_cache_unified: size =  580.50 MiB (  4128 cells,  36 layers,  1 seqs), K (f16):  290.25 MiB, V (f16):  290.25 MiB
llama_context: enumerating backends
llama_context: backend_ptrs.size() = 2
llama_context: max_nodes = 65536
llama_context: worst-case: n_tokens = 64, n_seqs = 1, n_outputs = 0
llama_context: reserving graph for n_tokens = 64, n_seqs = 1
llama_context: reserving graph for n_tokens = 1, n_seqs = 1
llama_context: reserving graph for n_tokens = 64, n_seqs = 1
llama_context:      CUDA0 compute buffer size =   524.95 MiB
llama_context:  CUDA_Host compute buffer size =     2.01 MiB
llama_context: graph nodes  = 1446
llama_context: graph splits = 212 (with bs=64), 35 (with bs=1)
        repeat_last_n = 64, repeat_penalty = 1, frequency_penalty = 0, presence_penalty = 0
        dry_multiplier = 0, dry_base = 1.75, dry_allowed_length = 2, dry_penalty_last_n = -1
        top_k = 40, top_p = 0.95, min_p = 0.05, xtc_probability = 0, xtc_threshold = 0.1, typical_p = 1, top_n_sigma = -1, temp = 0.8
        mirostat = 0, mirostat_lr = 0.1, mirostat_ent = 5

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

How exactly to set up the end-of-generation while directly using llama.dll API #14409

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Replies: 0 comments

Select a reply

Uh oh!

How exactly to set up the end-of-generation while directly using llama.dll API #14409

Uh oh!

Uh oh!

s7023369667 Jun 27, 2025

Replies: 0 comments

s7023369667
Jun 27, 2025