Skip to content

Commit bb2f3ad

Browse files
authored
Merge pull request #874 from martindevans/llama_3.1_update_binaries
Llama 3.1 update binaries
2 parents 2816c58 + d1dbb21 commit bb2f3ad

17 files changed

+163
-157
lines changed

LLama.Unittest/BasicTest.cs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
using System.Text;
21
using LLama.Common;
3-
using LLama.Native;
42
using Xunit.Abstractions;
53

64
namespace LLama.Unittest

LLama.Web/Common/ModelOptions.cs

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,6 @@ public class ModelOptions
4747
/// <inheritdoc />
4848
public string ModelPath { get; set; }
4949

50-
/// <inheritdoc />
51-
public AdapterCollection LoraAdapters { get; set; } = new();
52-
53-
/// <inheritdoc />
54-
public string LoraBase { get; set; } = string.Empty;
55-
5650
/// <inheritdoc />
5751
public uint? Threads { get; set; }
5852

LLama/Abstractions/IModelParams.cs

Lines changed: 0 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
using System.Collections;
33
using System.Collections.Generic;
44
using System.ComponentModel;
5-
using System.Linq;
65
using System.Text;
76
using System.Text.Json;
87
using System.Text.Json.Serialization;
@@ -69,67 +68,12 @@ public interface IModelParams
6968
/// </summary>
7069
bool VocabOnly { get; }
7170

72-
/// <summary>
73-
/// List of LoRA adapters to apply
74-
/// </summary>
75-
AdapterCollection LoraAdapters { get; }
76-
77-
/// <summary>
78-
/// base model path for the lora adapter (lora_base)
79-
/// </summary>
80-
string LoraBase { get; }
81-
8271
/// <summary>
8372
/// Override specific metadata items in the model
8473
/// </summary>
8574
List<MetadataOverride> MetadataOverrides { get; }
8675
}
8776

88-
/// <summary>
89-
/// A LoRA adapter to apply to a model
90-
/// </summary>
91-
/// <param name="Path">Path to the LoRA file</param>
92-
/// <param name="Scale">Strength of this LoRA</param>
93-
public readonly record struct LoraAdapter(string Path, float Scale);
94-
95-
/// <summary>
96-
/// A list of LoraAdapter objects
97-
/// </summary>
98-
public sealed class AdapterCollection
99-
: List<LoraAdapter>, IEquatable<AdapterCollection>
100-
{
101-
/// <inheritdoc />
102-
public bool Equals(AdapterCollection? other)
103-
{
104-
if (other == null)
105-
return false;
106-
107-
return this.SequenceEqual(other);
108-
}
109-
110-
/// <inheritdoc/>
111-
public override bool Equals(object? obj)
112-
{
113-
return Equals(obj as AdapterCollection);
114-
}
115-
116-
/// <inheritdoc/>
117-
public override int GetHashCode()
118-
{
119-
unchecked
120-
{
121-
var hash = 17;
122-
for (var i = 0; i < Count; i++)
123-
{
124-
hash += this[i].GetHashCode();
125-
hash *= 7823;
126-
}
127-
return hash;
128-
}
129-
}
130-
}
131-
132-
13377
/// <summary>
13478
/// A fixed size array to set the tensor splits across multiple GPUs
13579
/// </summary>

LLama/Common/ModelParams.cs

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,6 @@ public record ModelParams
3939
/// <inheritdoc />
4040
public string ModelPath { get; set; }
4141

42-
/// <inheritdoc />
43-
public AdapterCollection LoraAdapters { get; set; } = new();
44-
45-
/// <inheritdoc />
46-
public string LoraBase { get; set; } = string.Empty;
47-
4842
/// <inheritdoc />
4943
public uint? Threads { get; set; }
5044

LLama/LLamaQuantizer.cs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ public static bool Quantize(string srcFileName, string dstFilename, string ftype
6262
private static bool ValidateFtype(LLamaFtype ftype)
6363
{
6464
// Validation copies from here:
65-
// https://github.com/ggerganov/llama.cpp/blob/f7001ccc5aa359fcf41bba19d1c99c3d25c9bcc7/llama.cpp#L13450
65+
// https://github.com/ggerganov/llama.cpp/blob/345c8c0c87a97c1595f9c8b14833d531c8c7d8df/src/llama.cpp#L15624
6666

6767
switch (ftype)
6868
{
@@ -105,9 +105,12 @@ private static bool ValidateFtype(LLamaFtype ftype)
105105

106106
case LLamaFtype.MOSTLY_IQ3_S:
107107
case LLamaFtype.MOSTLY_IQ3_M:
108+
109+
case LLamaFtype.MOSTLY_Q4_0_4_4:
110+
case LLamaFtype.MOSTLY_Q4_0_4_8:
111+
case LLamaFtype.MOSTLY_Q4_0_8_8:
108112
return true;
109113

110-
case LLamaFtype.MOSTLY_Q4_1_SOME_F16:
111114
case LLamaFtype.GUESSED:
112115
default:
113116
return false;

LLama/LLamaSharp.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353
</ItemGroup>
5454

5555
<PropertyGroup>
56-
<BinaryReleaseId>368645698ab648e390dc</BinaryReleaseId>
56+
<BinaryReleaseId>345c8c0c87a97c1595f9c8b</BinaryReleaseId>
5757
</PropertyGroup>
5858

5959
<PropertyGroup>

LLama/LLamaWeights.cs

Lines changed: 2 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
using System;
1+
using System;
22
using System.Collections.Generic;
33
using System.Text;
44
using System.Threading;
@@ -72,17 +72,6 @@ public static LLamaWeights LoadFromFile(IModelParams @params)
7272
{
7373
using var pin = @params.ToLlamaModelParams(out var lparams);
7474
var weights = SafeLlamaModelHandle.LoadFromFile(@params.ModelPath, lparams);
75-
76-
foreach (var adapter in @params.LoraAdapters)
77-
{
78-
if (string.IsNullOrEmpty(adapter.Path))
79-
continue;
80-
if (adapter.Scale <= 0)
81-
continue;
82-
83-
weights.ApplyLoraFromFile(adapter.Path, adapter.Scale, @params.LoraBase);
84-
}
85-
8675
return new LLamaWeights(weights);
8776
}
8877

@@ -100,14 +89,6 @@ public static async Task<LLamaWeights> LoadFromFileAsync(IModelParams @params, C
10089
// don't touch the @params object inside the task, it might be changed
10190
// externally! Save a copy of everything that we need later.
10291
var modelPath = @params.ModelPath;
103-
var loraBase = @params.LoraBase;
104-
var loraAdapters = @params.LoraAdapters.ToArray();
105-
106-
// Determine the range to report for model loading. llama.cpp reports 0-1, but we'll remap that into a
107-
// slightly smaller range to allow some space for reporting LoRA loading too.
108-
var modelLoadProgressRange = 1f;
109-
if (loraAdapters.Length > 0)
110-
modelLoadProgressRange = 0.9f;
11192

11293
using (@params.ToLlamaModelParams(out var lparams))
11394
{
@@ -119,7 +100,7 @@ public static async Task<LLamaWeights> LoadFromFileAsync(IModelParams @params, C
119100
lparams.progress_callback = (progress, ctx) =>
120101
{
121102
// Update the progress reporter (remapping the value into the smaller range).
122-
progressReporter?.Report(Math.Clamp(progress, 0, 1) * modelLoadProgressRange);
103+
progressReporter?.Report(Math.Clamp(progress, 0, 1));
123104

124105
// If the user set a callback in the model params, call that and see if we should cancel
125106
if (internalCallback != null && !internalCallback(progress, ctx))
@@ -141,30 +122,6 @@ public static async Task<LLamaWeights> LoadFromFileAsync(IModelParams @params, C
141122
// Load the model
142123
var weights = SafeLlamaModelHandle.LoadFromFile(modelPath, lparams);
143124

144-
// Apply the LoRA adapters
145-
for (var i = 0; i < loraAdapters.Length; i++)
146-
{
147-
// Interrupt applying LoRAs if the token is cancelled
148-
if (token.IsCancellationRequested)
149-
{
150-
weights.Dispose();
151-
token.ThrowIfCancellationRequested();
152-
}
153-
154-
// Don't apply invalid adapters
155-
var adapter = loraAdapters[i];
156-
if (string.IsNullOrEmpty(adapter.Path))
157-
continue;
158-
if (adapter.Scale <= 0)
159-
continue;
160-
161-
weights.ApplyLoraFromFile(adapter.Path, adapter.Scale, loraBase);
162-
163-
// Report progress. Model loading reported progress from 0 -> 0.9, use
164-
// the last 0.1 to represent all of the LoRA adapters being applied.
165-
progressReporter?.Report(0.9f + (0.1f / loraAdapters.Length) * (i + 1));
166-
}
167-
168125
// Update progress reporter to indicate completion
169126
progressReporter?.Report(1);
170127

LLama/Native/LLamaAttentionType.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
namespace LLama.Native;
22

3+
/// <summary>
4+
///
5+
/// </summary>
6+
/// <remarks>llama_attention_type</remarks>
37
public enum LLamaAttentionType
48
{
59
Unspecified = -1,

LLama/Native/LLamaFtype.cs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ namespace LLama.Native
33
/// <summary>
44
/// Supported model file types
55
/// </summary>
6+
/// <remarks>C# representation of llama_ftype</remarks>
67
public enum LLamaFtype
78
{
89
/// <summary>
@@ -35,10 +36,10 @@ public enum LLamaFtype
3536
/// <remarks>Benchmark@7B: 3.90GB, +0.1846 ppl</remarks>
3637
MOSTLY_Q4_1 = 3,
3738

38-
/// <summary>
39-
/// Mostly 4 bit, tok_embeddings.weight and output.weight are f16
40-
/// </summary>
41-
MOSTLY_Q4_1_SOME_F16 = 4,
39+
///// <summary>
40+
///// Mostly 4 bit, tok_embeddings.weight and output.weight are f16
41+
///// </summary>
42+
//MOSTLY_Q4_1_SOME_F16 = 4,
4243

4344
/// <summary>
4445
/// Mostly 5 bit

LLama/Native/LLamaTokenDataArray.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ public void ApplyGrammar(SafeLLamaContextHandle ctx, SafeLLamaGrammarHandle? gra
9797

9898
using (LLamaTokenDataArrayNative.Create(this, out var st))
9999
{
100-
NativeApi.llama_sample_grammar(ctx, ref st, grammar);
100+
NativeApi.llama_grammar_sample(grammar, ctx, ref st);
101101
Sorted = st.sorted;
102102
}
103103
}

LLama/Native/LLamaVocabPreType.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,7 @@ internal enum LLamaVocabPreType
2727
CHATGLM4 = 17,
2828
VIKING = 18,
2929
JAIS = 19,
30+
TEKKEN = 20,
31+
SMOLLM = 21,
32+
CODESHELL = 22,
3033
}

LLama/Native/LoraAdapter.cs

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
using System;
2+
3+
namespace LLama.Native;
4+
5+
/// <summary>
6+
/// A LoRA adapter which can be applied to a context for a specific model
7+
/// </summary>
8+
public class LoraAdapter
9+
{
10+
/// <summary>
11+
/// The model which this LoRA adapter was loaded with.
12+
/// </summary>
13+
public SafeLlamaModelHandle Model { get; }
14+
15+
/// <summary>
16+
/// The full path of the file this adapter was loaded from
17+
/// </summary>
18+
public string Path { get; }
19+
20+
/// <summary>
21+
/// Native pointer of the loaded adapter, will be automatically freed when the model is unloaded
22+
/// </summary>
23+
internal IntPtr Pointer { get; }
24+
25+
/// <summary>
26+
/// Indicates whether this adapter is still loaded (set to false by Unload)
27+
/// </summary>
28+
internal bool Loaded { get; private set; }
29+
30+
internal LoraAdapter(SafeLlamaModelHandle model, string path, IntPtr nativePtr)
31+
{
32+
Model = model;
33+
Path = path;
34+
Pointer = nativePtr;
35+
Loaded = true;
36+
}
37+
38+
/// <summary>
39+
/// Unload this adapter
40+
/// </summary>
41+
public void Unload()
42+
{
43+
Loaded = false;
44+
NativeApi.llama_lora_adapter_free(Pointer);
45+
}
46+
}

LLama/Native/NativeApi.Grammar.cs

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,16 @@ public static partial class NativeApi
3535
/// <param name="ctx"></param>
3636
/// <param name="candidates"></param>
3737
/// <param name="grammar"></param>
38-
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
39-
public static extern void llama_sample_grammar(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, SafeLLamaGrammarHandle grammar);
38+
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
39+
public static extern void llama_grammar_sample(SafeLLamaGrammarHandle grammar, SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates);
4040

41-
/// <summary>
42-
/// Accepts the sampled token into the grammar
43-
/// </summary>
44-
/// <param name="ctx"></param>
45-
/// <param name="grammar"></param>
46-
/// <param name="token"></param>
47-
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
48-
public static extern void llama_grammar_accept_token(SafeLLamaContextHandle ctx, SafeLLamaGrammarHandle grammar, LLamaToken token);
41+
/// <summary>
42+
/// Accepts the sampled token into the grammar
43+
/// </summary>
44+
/// <param name="ctx"></param>
45+
/// <param name="grammar"></param>
46+
/// <param name="token"></param>
47+
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
48+
public static extern void llama_grammar_accept_token(SafeLLamaGrammarHandle grammar, SafeLLamaContextHandle ctx, LLamaToken token);
4949
}
5050
}

LLama/Native/NativeApi.cs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,5 +445,12 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
445445
/// <returns>Returns the split_prefix length.</returns>
446446
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
447447
public static extern int llama_split_prefix(string split_prefix, nuint maxlen, string split_path, int split_no, int split_count);
448+
449+
/// <summary>
450+
/// Manually free a LoRA adapter. Loaded adapters will be freed when the associated model is deleted.
451+
/// </summary>
452+
/// <param name="adapter"></param>
453+
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
454+
public static extern void llama_lora_adapter_free(IntPtr adapter);
448455
}
449456
}

0 commit comments

Comments
 (0)