Skip to content

Commit 6598121

Browse files
authored
July Update v2 (#843)
* Update to llama.cpp 368645698ab648e390dcd7c00a2bf60efa654f57.
* Added some extra logging to the test case to debug a failure on macOS.
* Fixed a signature mismatch.
1 parent 6929336 commit 6598121

16 files changed

+213
-78
lines changed

LLama.Unittest/TemplateTests.cs

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,20 @@
11
using System.Text;
22
using LLama.Common;
3+
using LLama.Extensions;
4+
using LLama.Native;
5+
using Xunit.Abstractions;
36

47
namespace LLama.Unittest;
58

69
public sealed class TemplateTests
710
: IDisposable
811
{
12+
private readonly ITestOutputHelper _output;
913
private readonly LLamaWeights _model;
1014

11-
public TemplateTests()
15+
public TemplateTests(ITestOutputHelper output)
1216
{
17+
_output = output;
1318
var @params = new ModelParams(Constants.GenerativeModelPath)
1419
{
1520
ContextSize = 1,
@@ -260,6 +265,37 @@ public void EndOTurnToken_ReturnsExpected()
260265
[Fact]
public void EndOSpeechToken_ReturnsExpected()
{
    // Log the model's special tokens to help diagnose platform-specific failures.
    _output.WriteLine($"EOS: {_model.Tokens.EOS}");
    _output.WriteLine($"EOT: {_model.Tokens.EOT}");
    _output.WriteLine($"BOS: {_model.Tokens.BOS}");

    var eosText = ConvertTokenToString(_model.Tokens.EOS!.Value);
    _output.WriteLine(eosText is null ? "null" : eosText);

    Assert.Equal("</s>", _model.Tokens.EndOfSpeechToken);
}
277+
278+
/// <summary>
/// Decode a single token into a UTF-8 string, logging intermediate state
/// for test diagnostics. Returns null when the token decodes to nothing.
/// </summary>
private string? ConvertTokenToString(LLamaToken token)
{
    _output.WriteLine($"ConvertTokenToString: {token}");

    // Decode into a small stack buffer first; most tokens fit.
    const int buffSize = 32;
    Span<byte> bytes = stackalloc byte[buffSize];
    var length = _model.NativeHandle.TokenToSpan(token, bytes, 0, true);

    _output.WriteLine($"tokenLength = {length}");
    if (length <= 0)
        return null;

    // if the original buffer wasn't large enough, create a new one
    _output.WriteLine($"tokenLength = {length}, buffSize = {buffSize}");
    if (length > buffSize)
    {
        bytes = stackalloc byte[(int)length];
        _ = _model.NativeHandle.TokenToSpan(token, bytes, 0, true);
    }

    return Encoding.UTF8.GetStringFromSpan(bytes.Slice(0, (int)length));
}
265301
}

LLama.Web/Common/ModelOptions.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,5 +118,8 @@ public class ModelOptions
118118

119119
/// <inheritdoc />
120120
public LLamaPoolingType PoolingType { get; set; }
121+
122+
/// <inheritdoc />
123+
public LLamaAttentionType AttentionType { get; set; } = LLamaAttentionType.Unspecified;
121124
}
122125
}

LLama/Abstractions/IContextParams.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,4 +123,9 @@ public interface IContextParams
123123
/// How to pool (sum) embedding results by sequence id (ignored if no pooling layer)
124124
/// </summary>
125125
LLamaPoolingType PoolingType { get; }
126+
127+
/// <summary>
128+
/// Attention type to use for embeddings
129+
/// </summary>
130+
LLamaAttentionType AttentionType { get; }
126131
}

LLama/Common/ModelParams.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,9 @@ public record ModelParams
109109
/// <inheritdoc />
110110
public LLamaPoolingType PoolingType { get; set; } = LLamaPoolingType.Unspecified;
111111

112+
/// <inheritdoc />
113+
public LLamaAttentionType AttentionType { get; set; } = LLamaAttentionType.Unspecified;
114+
112115
/// <inheritdoc />
113116
public bool VocabOnly { get; set; }
114117

LLama/Extensions/IContextParamsExtensions.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
5252
result.offload_kqv = !@params.NoKqvOffload;
5353
result.flash_attention = @params.FlashAttention;
5454
result.llama_pooling_type = @params.PoolingType;
55+
result.attention_type = @params.AttentionType;
5556

5657
result.n_threads = Threads(@params.Threads);
5758
result.n_threads_batch = Threads(@params.BatchThreads);

LLama/LLamaQuantizer.cs

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
using LLama.Native;
1+
using LLama.Native;
22
using System;
33
using System.Collections.Generic;
44

@@ -66,49 +66,49 @@ private static bool ValidateFtype(LLamaFtype ftype)
6666

6767
switch (ftype)
6868
{
69-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0:
70-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1:
71-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0:
72-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1:
73-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0:
74-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_F16:
75-
case LLamaFtype.LLAMA_FTYPE_ALL_F32:
69+
case LLamaFtype.MOSTLY_Q4_0:
70+
case LLamaFtype.MOSTLY_Q4_1:
71+
case LLamaFtype.MOSTLY_Q5_0:
72+
case LLamaFtype.MOSTLY_Q5_1:
73+
case LLamaFtype.MOSTLY_Q8_0:
74+
case LLamaFtype.MOSTLY_F16:
75+
case LLamaFtype.ALL_F32:
7676

77-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q2_K_S:
78-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q2_K:
77+
case LLamaFtype.MOSTLY_Q2_K_S:
78+
case LLamaFtype.MOSTLY_Q2_K:
7979

80-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_K_XS:
81-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_S:
82-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_M:
83-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_L:
80+
case LLamaFtype.MOSTLY_IQ3_K_XS:
81+
case LLamaFtype.MOSTLY_Q3_K_S:
82+
case LLamaFtype.MOSTLY_Q3_K_M:
83+
case LLamaFtype.MOSTLY_Q3_K_L:
8484

85-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_S:
86-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_M:
85+
case LLamaFtype.MOSTLY_Q4_K_S:
86+
case LLamaFtype.MOSTLY_Q4_K_M:
8787

88-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_S:
89-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_M:
88+
case LLamaFtype.MOSTLY_Q5_K_S:
89+
case LLamaFtype.MOSTLY_Q5_K_M:
9090

91-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q6_K:
91+
case LLamaFtype.MOSTLY_Q6_K:
9292

93-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_XXS:
94-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_XS:
95-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_S:
96-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_M:
93+
case LLamaFtype.MOSTLY_IQ2_XXS:
94+
case LLamaFtype.MOSTLY_IQ2_XS:
95+
case LLamaFtype.MOSTLY_IQ2_S:
96+
case LLamaFtype.MOSTLY_IQ2_M:
9797

98-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_XXS:
98+
case LLamaFtype.MOSTLY_IQ3_XXS:
9999

100-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ1_S:
101-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ1_M:
100+
case LLamaFtype.MOSTLY_IQ1_S:
101+
case LLamaFtype.MOSTLY_IQ1_M:
102102

103-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ4_NL:
104-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ4_XS:
103+
case LLamaFtype.MOSTLY_IQ4_NL:
104+
case LLamaFtype.MOSTLY_IQ4_XS:
105105

106-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_S:
107-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_M:
106+
case LLamaFtype.MOSTLY_IQ3_S:
107+
case LLamaFtype.MOSTLY_IQ3_M:
108108
return true;
109109

110-
case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
111-
case LLamaFtype.LLAMA_FTYPE_GUESSED:
110+
case LLamaFtype.MOSTLY_Q4_1_SOME_F16:
111+
case LLamaFtype.GUESSED:
112112
default:
113113
return false;
114114
}

LLama/LLamaSharp.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353
</ItemGroup>
5454

5555
<PropertyGroup>
56-
<BinaryReleaseId>1c5eba6f8e62</BinaryReleaseId>
56+
<BinaryReleaseId>368645698ab648e390dc</BinaryReleaseId>
5757
</PropertyGroup>
5858

5959
<PropertyGroup>

LLama/Native/LLamaAttentionType.cs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
namespace LLama.Native;

/// <summary>
/// Attention type to use when evaluating a context
/// (mirrors llama.cpp's <c>llama_attention_type</c> — values must stay in sync).
/// </summary>
public enum LLamaAttentionType
{
    /// <summary>
    /// Let the runtime pick the attention type (typically from the model).
    /// </summary>
    Unspecified = -1,

    /// <summary>
    /// Causal attention.
    /// </summary>
    Causal = 0,

    /// <summary>
    /// Non-causal attention (used for embeddings, per <c>IContextParams.AttentionType</c>).
    /// </summary>
    NonCausal = 1,
}

LLama/Native/LLamaContextParams.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@ public struct LLamaContextParams
6565
/// whether to pool (sum) embedding results by sequence id
6666
/// </summary>
6767
public LLamaPoolingType llama_pooling_type;
68+
69+
/// <summary>
70+
/// Attention type to use for embeddings
71+
/// </summary>
72+
public LLamaAttentionType attention_type;
6873

6974
/// <summary>
7075
/// RoPE base frequency, 0 = from model

0 commit comments

Comments
 (0)