diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index ccc013653..5dff9b7b0 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -160,12 +160,16 @@ jobs: include: - build: 'noavx' defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF' + arch: 'x64' - build: 'avx2' defines: '' + arch: 'x64' - build: 'avx' defines: '-DGGML_AVX2=OFF' + arch: 'x64' - build: 'avx512' defines: '-DGGML_AVX512=ON -DGGML_AVX512_VBMI=ON -DGGML_AVX512_VNNI=ON' + arch: 'x64' runs-on: windows-latest steps: - uses: actions/checkout@v4 @@ -187,31 +191,89 @@ jobs: uses: actions/upload-artifact@v4 with: path: .\build\bin\Release\llama.dll - name: llama-bin-win-${{ matrix.build }}-x64.dll + name: llama-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll if-no-files-found: error - name: Upload artifacts (ggml) uses: actions/upload-artifact@v4 with: path: .\build\bin\Release\ggml.dll - name: ggml-bin-win-${{ matrix.build }}-x64.dll + name: ggml-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll if-no-files-found: error - name: Upload artifacts (ggml-base) uses: actions/upload-artifact@v4 with: path: .\build\bin\Release\ggml-base.dll - name: ggml-base-bin-win-${{ matrix.build }}-x64.dll + name: ggml-base-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll if-no-files-found: error - name: Upload artifacts (ggml-cpu) uses: actions/upload-artifact@v4 with: path: .\build\bin\Release\ggml-cpu.dll - name: ggml-cpu-bin-win-${{ matrix.build }}-x64.dll + name: ggml-cpu-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll if-no-files-found: error - name: Upload artifacts (mtmd) uses: actions/upload-artifact@v4 with: - path: .\build\bin\Release\mtmd.dll - name: mtmd-bin-win-${{ matrix.build }}-x64.dll + path: .\build\bin\Release\llava_shared.dll + name: llava-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll + if-no-files-found: error + + compile-windows-arm64: + name: Compile (Windows ARM64) + strategy: + fail-fast: true + matrix: + include: + - build: 'arm64' + defines: '-DCMAKE_GENERATOR_PLATFORM=ARM64 -DGGML_NATIVE=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF' + arch: 'arm64' + runs-on: windows-latest + steps: + - uses: actions/checkout@v4 + with: + repository: ggerganov/llama.cpp + fetch-depth: 0 + ref: '${{ github.event.inputs.llama_cpp_commit }}' + + - name: Build + id: cmake_build + run: | + mkdir build + cd build + cmake .. ${{ env.COMMON_DEFINE }} ${{ matrix.defines }} + cmake --build . 
--config Release -j ${env:NUMBER_OF_PROCESSORS} + tree /f + + - name: Upload artifacts (llama) + uses: actions/upload-artifact@v4 + with: + path: .\build\bin\Release\llama.dll + name: llama-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll + if-no-files-found: error + - name: Upload artifacts (ggml) + uses: actions/upload-artifact@v4 + with: + path: .\build\bin\Release\ggml.dll + name: ggml-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll + if-no-files-found: error + - name: Upload artifacts (ggml-base) + uses: actions/upload-artifact@v4 + with: + path: .\build\bin\Release\ggml-base.dll + name: ggml-base-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll + if-no-files-found: error + - name: Upload artifacts (ggml-cpu) + uses: actions/upload-artifact@v4 + with: + path: .\build\bin\Release\ggml-cpu.dll + name: ggml-cpu-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll + if-no-files-found: error + + - name: Upload artifacts (llava) + uses: actions/upload-artifact@v4 + with: + path: .\build\bin\Release\llava_shared.dll + name: llava-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll if-no-files-found: error compile-vulkan: @@ -603,6 +665,7 @@ jobs: "compile-linux", "compile-musl", "compile-windows", + "compile-windows-arm64", "compile-vulkan", "compile-cublas", "compile-macos", @@ -617,7 +680,7 @@ jobs: - name: Rearrange Files run: | # Make all directories at once - mkdir --parents deps/{noavx,avx,avx2,avx512,linux-arm64,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu12.4.0,vulkan,android-arm64-v8a,android-x86,android-x86_64} + mkdir --parents deps/{noavx,avx,avx2,avx512,linux-arm64,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu11.7.1,cu12.2.0,vulkan,android-arm64-v8a,android-x86,android-x86_64,win-arm64} # Linux cp artifacts/ggml-bin-linux-noavx-x64.so/libggml.so deps/noavx/libggml.so @@ -701,6 +764,13 @@ jobs: cp artifacts/llama-bin-win-avx512-x64.dll/llama.dll deps/avx512/llama.dll cp artifacts/mtmd-bin-win-avx512-x64.dll/mtmd.dll deps/avx512/mtmd.dll + # Windows ARM64 + cp artifacts/ggml-bin-win-arm64-arm64.dll/ggml.dll deps/win-arm64/ggml.dll + cp artifacts/ggml-base-bin-win-arm64-arm64.dll/ggml-base.dll deps/win-arm64/ggml-base.dll + cp artifacts/ggml-cpu-bin-win-arm64-arm64.dll/ggml-cpu.dll deps/win-arm64/ggml-cpu.dll + cp artifacts/llama-bin-win-arm64-arm64.dll/llama.dll deps/win-arm64/llama.dll + cp artifacts/llava-bin-win-arm64-arm64.dll/llava_shared.dll deps/win-arm64/llava_shared.dll + # MacOS cp artifacts/ggml-bin-osx-arm64.dylib/libggml.dylib deps/osx-arm64/libggml.dylib cp artifacts/ggml-base-bin-osx-arm64.dylib/libggml-base.dylib deps/osx-arm64/libggml-base.dylib diff --git a/LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs b/LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs index 33b399ec9..084821f0b 100644 --- a/LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs +++ b/LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs @@ -119,7 +119,7 @@ public void GlobalCleanup() { if(ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property { - Executor.Context.NativeHandle.KvCacheClear(); + Executor.Context.NativeHandle.MemoryClear(); } } diff --git a/LLama.Examples/Examples/BatchedExecutorSimple.cs b/LLama.Examples/Examples/BatchedExecutorSimple.cs index 5e532ff6a..9f8e6b6c7 100644 --- a/LLama.Examples/Examples/BatchedExecutorSimple.cs +++ b/LLama.Examples/Examples/BatchedExecutorSimple.cs @@ -97,8 +97,8 @@ await AnsiConsole.Live(table).StartAsync(async ctx => // A generic error, 
this is fatal and the batch can no longer be used. This should never occur and generally indicates // a bug in LLamaSharp, llama.cpp or a hardware error. - if (decodeResult == DecodeResult.Error) - throw new Exception("Unknown error occurred while inferring."); + if (decodeResult != DecodeResult.Ok) + throw new Exception($"Error occurred while inferring: {decodeResult}"); // After inference all of the conversations must be sampled before running inference again. foreach (var conversationData in conversations) diff --git a/LLama.Examples/Examples/LlavaInteractiveModeExecute.cs b/LLama.Examples/Examples/LlavaInteractiveModeExecute.cs index dc2dee06e..8cbf58dcd 100644 --- a/LLama.Examples/Examples/LlavaInteractiveModeExecute.cs +++ b/LLama.Examples/Examples/LlavaInteractiveModeExecute.cs @@ -79,7 +79,7 @@ public static async Task Run() // When the prompt contains images we clear KV_CACHE to restart conversation // See: // https://github.com/ggerganov/llama.cpp/discussions/3620 - ex.Context.NativeHandle.KvCacheRemove( LLamaSeqId.Zero, -1, -1 ); + ex.Context.NativeHandle.MemorySequenceRemove( LLamaSeqId.Zero, -1, -1 ); int index = 0; foreach (var path in imagePathsWithCurlyBraces) diff --git a/LLama.Unittest/LLavaWeightsTests.cs b/LLama.Unittest/LLavaWeightsTests.cs deleted file mode 100644 index 25a5f996a..000000000 --- a/LLama.Unittest/LLavaWeightsTests.cs +++ /dev/null @@ -1,54 +0,0 @@ -using LLama.Common; -using LLama.Native; - -namespace LLama.Unittest -{ - // Test the same things as llama model + image embedings - // - public sealed class LLavaWeightTests - : IDisposable - { - private readonly LLamaWeights _llamaWeights; - private readonly LLavaWeights _lLavaWeights; - private readonly LLamaContext _context; - - public LLavaWeightTests() - { - var @params = new ModelParams(Constants.LLavaModelPath) - { - // Llava models requires big context - ContextSize = 4096, - GpuLayerCount = Constants.CIGpuLayerCount, - }; - _llamaWeights = LLamaWeights.LoadFromFile(@params); - _lLavaWeights = LLavaWeights.LoadFromFile(Constants.LLavaMmpPath); - - _context = _llamaWeights.CreateContext(@params); - - } - - public void Dispose() - { - _llamaWeights.Dispose(); - _lLavaWeights.Dispose(); - } - - [Fact,Trait("Category", "NoCI")] - public void EmbedImageAsFileName() - { - int n_past = 0; - SafeLlavaImageEmbedHandle emb = _lLavaWeights.CreateImageEmbeddings(_context, Constants.LLavaImage); - Assert.True( _lLavaWeights.EvalImageEmbed( _context, emb, ref n_past ) ); - } - - [Fact,Trait("Category", "NoCI")] - public void EmbedImageAsBinary() - { - int n_past = 0; - byte[] image = System.IO.File.ReadAllBytes(Constants.LLavaImage); - SafeLlavaImageEmbedHandle emb = _lLavaWeights.CreateImageEmbeddings(_context, image); - Assert.True( _lLavaWeights.EvalImageEmbed( _context, emb, ref n_past ) ); - } - - } -} diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs index 9824c0922..c453aeddf 100644 --- a/LLama.Web/Common/ModelOptions.cs +++ b/LLama.Web/Common/ModelOptions.cs @@ -110,6 +110,15 @@ public class ModelOptions /// public bool VocabOnly { get; set; } + /// + public bool? OpOffload { get; set; } + + /// + public bool? SwaFull { get; set; } + + /// + public bool? KVUnified { get; set; } + /// public float? 
DefragThreshold { get; set; } diff --git a/LLama/Abstractions/IContextParams.cs b/LLama/Abstractions/IContextParams.cs index cd18d5dbf..f80759c8a 100644 --- a/LLama/Abstractions/IContextParams.cs +++ b/LLama/Abstractions/IContextParams.cs @@ -109,8 +109,7 @@ public interface IContextParams bool FlashAttention { get; } /// - /// defragment the KV cache if holes/size > defrag_threshold, Set to < 0 to disable (default) - /// defragment the KV cache if holes/size > defrag_threshold, Set to or < 0 to disable (default) + /// defragment the KV cache if holes/size > defrag_threshold, Set to <= 0 to disable (default) /// float? DefragThreshold { get; } @@ -123,4 +122,25 @@ public interface IContextParams /// Attention type to use for embeddings /// LLamaAttentionType AttentionType { get; } + + /// + /// Offload host tensor operations to device + /// + bool? OpOffload { get; } + + /// + /// use a unified buffer across the input sequences when computing the attention. + /// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix + ///
+ /// ref: https://github.com/ggml-org/llama.cpp/pull/14363 + ///
+ bool? KVUnified { get; } + + /// + /// Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) + /// + /// Setting to false when n_seq_max > 1 can cause bad performance in some cases + /// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 + /// + bool? SwaFull { get; } } \ No newline at end of file diff --git a/LLama/Batched/Conversation.cs b/LLama/Batched/Conversation.cs index 7dbf1f8c3..fcc94ae8f 100644 --- a/LLama/Batched/Conversation.cs +++ b/LLama/Batched/Conversation.cs @@ -84,7 +84,7 @@ public void Dispose() _disposed = true; // Remove this conversation from the KV cache - Executor.Context.NativeHandle.KvCacheRemove(ConversationId, -1, -1); + Executor.Context.NativeHandle.MemorySequenceRemove(ConversationId, -1, -1); // Prevent finalizer from running GC.SuppressFinalize(this); @@ -129,7 +129,7 @@ public Conversation Fork() _forked = true; // Assign tokens to the new sequence - Executor.Context.NativeHandle.KvCacheSequenceCopy(ConversationId, c.ConversationId, 0, _end); + Executor.Context.NativeHandle.MemorySequenceCopy(ConversationId, c.ConversationId, 0, _end); return c; } @@ -406,7 +406,7 @@ internal KvAccessor(Conversation conversation) /// End position (exclusive) public void Remove(LLamaPos start, LLamaPos end) { - _conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end); + _conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end); } /// @@ -420,7 +420,7 @@ public void Remove(LLamaPos start, int count) return; var end = start.Value + count; - _conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end); + _conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end); } #endregion @@ -435,7 +435,7 @@ public void Remove(LLamaPos start, int count) /// Amount to add on to each token position public void Add(LLamaPos start, LLamaPos end, int delta) { - _conversation.Executor.Context.NativeHandle.KvCacheSequenceAdd(_conversation.ConversationId, start, end, delta); + _conversation.Executor.Context.NativeHandle.MemorySequenceAdd(_conversation.ConversationId, start, end, delta); } #endregion @@ -452,7 +452,7 @@ public void Divide(LLamaPos start, LLamaPos end, int divisor) if (divisor <= 0) throw new ArgumentOutOfRangeException(nameof(divisor)); - _conversation.Executor.Context.NativeHandle.KvCacheSequenceDivide(_conversation.ConversationId, start, end, divisor); + _conversation.Executor.Context.NativeHandle.MemorySequenceDivide(_conversation.ConversationId, start, end, divisor); } #endregion } diff --git a/LLama/ChatSession.cs b/LLama/ChatSession.cs index bb1f91437..90119d4fe 100644 --- a/LLama/ChatSession.cs +++ b/LLama/ChatSession.cs @@ -199,7 +199,7 @@ public void LoadSession(SessionState state, bool loadTransforms = true) } if (state.ContextState is null) { - Executor.Context.NativeHandle.KvCacheClear(); + Executor.Context.NativeHandle.MemoryClear(); } else { diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs index 23f5681be..89737faa7 100644 --- a/LLama/Common/ModelParams.cs +++ b/LLama/Common/ModelParams.cs @@ -112,6 +112,15 @@ public record ModelParams /// public bool VocabOnly { get; set; } + /// + public bool? OpOffload { get; set; } + + /// + public bool? SwaFull { get; set; } + + /// + public bool? 
KVUnified { get; set; } + /// /// `Encoding` cannot be directly JSON serialized, instead store the name as a string which can /// diff --git a/LLama/Extensions/IContextParamsExtensions.cs b/LLama/Extensions/IContextParamsExtensions.cs index 54dd9873b..85e40f7ad 100644 --- a/LLama/Extensions/IContextParamsExtensions.cs +++ b/LLama/Extensions/IContextParamsExtensions.cs @@ -55,6 +55,13 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo result.n_threads = Threads(@params.Threads); result.n_threads_batch = Threads(@params.BatchThreads); + + if (@params.SwaFull.HasValue) + result.swa_full = @params.SwaFull.Value; + if (@params.OpOffload.HasValue) + result.op_offload = @params.OpOffload.Value; + if (@params.KVUnified.HasValue) + result.kv_unified = @params.KVUnified.Value; } private static int Threads(int? value) diff --git a/LLama/LLamaExecutorBase.cs b/LLama/LLamaExecutorBase.cs index 995cb3e4e..36989006e 100644 --- a/LLama/LLamaExecutorBase.cs +++ b/LLama/LLamaExecutorBase.cs @@ -128,7 +128,8 @@ public StatefulExecutorBase WithSessionFile(string filename) } if (File.Exists(filename)) { - _logger?.LogInformation($"[LLamaExecutor] Attempting to load saved session from {filename}"); + _logger?.LogInformation("[LLamaExecutor] Attempting to load saved session from {0}", filename); + var session_tokens = new LLamaToken[Context.ContextSize]; if (!NativeApi.llama_state_load_file(Context.NativeHandle, _pathSession, session_tokens, (ulong)Context.ContextSize, out var n_token_count_out)) { @@ -136,7 +137,7 @@ public StatefulExecutorBase WithSessionFile(string filename) throw new RuntimeError($"Failed to load session file {_pathSession}"); } _session_tokens = session_tokens.Take((int)n_token_count_out).ToList(); - _logger?.LogInformation($"[LLamaExecutor] Loaded a session with prompt size of {session_tokens.Length} tokens"); + _logger?.LogInformation("[LLamaExecutor] Loaded a session with prompt size of {0} tokens", session_tokens.Length); } else { @@ -190,11 +191,11 @@ protected virtual void HandleRunOutOfContext(int tokensToKeep) // if we run out of context: // - take the tokensToKeep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - tokensToKeep) tokens and recompute the logits in batches - int n_left = _pastTokensCount - tokensToKeep; - int n_discard = n_left / 2; + var n_left = _pastTokensCount - tokensToKeep; + var n_discard = n_left / 2; - NativeApi.llama_kv_self_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensToKeep, tokensToKeep + n_discard); - NativeApi.llama_kv_self_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensToKeep + n_discard, _pastTokensCount, -n_discard); + Context.NativeHandle.MemorySequenceRemove(LLamaSeqId.Zero, tokensToKeep, tokensToKeep + n_discard); + Context.NativeHandle.MemorySequenceAdd(LLamaSeqId.Zero, tokensToKeep + n_discard, _pastTokensCount, -n_discard); _pastTokensCount -= n_discard; // stop saving session if we run out of context diff --git a/LLama/LLamaReranker.cs b/LLama/LLamaReranker.cs index fa42d7f35..16a206c40 100644 --- a/LLama/LLamaReranker.cs +++ b/LLama/LLamaReranker.cs @@ -114,7 +114,7 @@ public async Task> GetRelevanceScores(string input, IReadOn batch.Add(tokens[i], i, LLamaSeqId.Zero, true); // clear previous kv_cache values - Context.NativeHandle.KvCacheClear(); + Context.NativeHandle.MemoryClear(); // Check if we should cancel the work, just before doing anything expensive (encode/decode) cancellationToken.ThrowIfCancellationRequested(); @@ -144,7 +144,7 @@ public async Task> 
GetRelevanceScores(string input, IReadOn var score = Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero)[0]; - Context.NativeHandle.KvCacheClear(); + Context.NativeHandle.MemoryClear(); return (normalize ? Sigmoid(score) : score, tokens.Length); } @@ -155,7 +155,7 @@ private async Task> CalcRelevanceScores(LLamaBatch batch, b var seqNum = logicCap.Value + 1; List scores = new List(seqNum); // clear previous kv_cache values - Context.NativeHandle.KvCacheClear(); + Context.NativeHandle.MemoryClear(); // Check if we should cancel the work, just before doing anything expensive (encode/decode) cancellationToken.ThrowIfCancellationRequested(); @@ -189,7 +189,7 @@ private async Task> CalcRelevanceScores(LLamaBatch batch, b scores.Add(normalize ? Sigmoid(score) : score); } - Context.NativeHandle.KvCacheClear(); + Context.NativeHandle.MemoryClear(); return scores; } diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets index 0f67303dc..e4fb7c89a 100644 --- a/LLama/LLamaSharp.Runtime.targets +++ b/LLama/LLamaSharp.Runtime.targets @@ -76,37 +76,19 @@ - - PreserveNewest - runtimes/win-x64/native/cuda11/llama.dll - - - PreserveNewest - runtimes/win-x64/native/cuda11/ggml-base.dll - - - PreserveNewest - runtimes/win-x64/native/cuda11/ggml.dll - - - PreserveNewest - runtimes/win-x64/native/cuda11/ggml-cuda.dll - - - - + PreserveNewest runtimes/win-x64/native/cuda12/llama.dll - + PreserveNewest runtimes/win-x64/native/cuda12/ggml-base.dll - + PreserveNewest runtimes/win-x64/native/cuda12/ggml.dll - + PreserveNewest runtimes/win-x64/native/cuda12/ggml-cuda.dll @@ -130,6 +112,29 @@ + + + PreserveNewest + runtimes/win-arm64/native/llama.dll + + + PreserveNewest + runtimes/win-arm64/native/ggml.dll + + + PreserveNewest + runtimes/win-arm64/native/ggml-base.dll + + + PreserveNewest + runtimes/win-arm64/native/ggml-cpu.dll + + + PreserveNewest + runtimes/win-arm64/native/llava_shared.dll + + + PreserveNewest runtimes/linux-x64/native/noavx/libllama.so @@ -218,43 +223,25 @@ PreserveNewest runtimes/linux-arm64/native/libggml-cpu.so - + PreserveNewest - runtimes/linux-arm64/native/libllava_shared.so + runtimes/linux-arm64/native/libmtmd.so - - PreserveNewest - runtimes/linux-x64/native/cuda11/libllama.so - - - PreserveNewest - runtimes/linux-x64/native/cuda11/libggml.so - - - PreserveNewest - runtimes/linux-x64/native/cuda11/libggml-base.so - - - PreserveNewest - runtimes/linux-x64/native/cuda11/libggml-cuda.so - - - - + PreserveNewest runtimes/linux-x64/native/cuda12/libllama.so - + PreserveNewest runtimes/linux-x64/native/cuda12/libggml.so - + PreserveNewest runtimes/linux-x64/native/cuda12/libggml-base.so - + PreserveNewest runtimes/linux-x64/native/cuda12/libggml-cuda.so @@ -371,9 +358,9 @@ PreserveNewest runtimes/osx-arm64/native/libllama.dylib - + PreserveNewest - runtimes/osx-arm64/native/libllava_shared.dylib + runtimes/osx-arm64/native/libmtmd.dylib PreserveNewest @@ -400,9 +387,9 @@ PreserveNewest runtimes/osx-x64/native/libllama.dylib - + PreserveNewest - runtimes/osx-x64/native/libllava_shared.dylib + runtimes/osx-x64/native/libmtmd.dylib @@ -425,67 +412,63 @@ PreserveNewest runtimes/osx-x64/native/rosetta2/libllama.dylib - + PreserveNewest - runtimes/osx-x64/native/rosetta2/libllava_shared.dylib + runtimes/osx-x64/native/rosetta2/libmtmd.dylib - + PreserveNewest - runtimes/win-x64/native/noavx/llava_shared.dll + runtimes/win-x64/native/noavx/libmtmd.dll - + PreserveNewest - runtimes/win-x64/native/avx/llava_shared.dll + runtimes/win-x64/native/avx/libmtmd.dll - + 
PreserveNewest - runtimes/win-x64/native/avx2/llava_shared.dll + runtimes/win-x64/native/avx2/libmtmd.dll - + PreserveNewest - runtimes/win-x64/native/avx512/llava_shared.dll + runtimes/win-x64/native/avx512/libmtmd.dll - + PreserveNewest - runtimes/win-x64/native/cuda11/llava_shared.dll + runtimes/win-x64/native/cuda12/libmtmd.dll - + PreserveNewest - runtimes/win-x64/native/cuda12/llava_shared.dll + runtimes/win-x64/native/vulkan/libmtmd.dll - + PreserveNewest - runtimes/win-x64/native/vulkan/llava_shared.dll + runtimes/win-arm64/native/libmtmd.dll - - PreserveNewest - runtimes/linux-x64/native/noavx/libllava_shared.so - - + PreserveNewest - runtimes/linux-x64/native/avx/libllava_shared.so + runtimes/linux-x64/native/noavx/libmtmd.so - + PreserveNewest - runtimes/linux-x64/native/avx2/libllava_shared.so + runtimes/linux-x64/native/avx/libmtmd.so - + PreserveNewest - runtimes/linux-x64/native/avx512/libllava_shared.so + runtimes/linux-x64/native/avx2/libmtmd.so - + PreserveNewest - runtimes/linux-x64/native/cuda11/libllava_shared.so + runtimes/linux-x64/native/avx512/libmtmd.so - + PreserveNewest - runtimes/linux-x64/native/cuda12/libllava_shared.so + runtimes/linux-x64/native/cuda12/libmtmd.so - + PreserveNewest - runtimes/linux-x64/native/vulkan/libllava_shared.so + runtimes/linux-x64/native/vulkan/libmtmd.so @@ -513,8 +496,8 @@ x86 - runtimes/android-x86/native/libllava_shared.so + Include="$(MSBuildThisFileDirectory)runtimes/deps/android-x86/libmtmd.so"> + runtimes/android-x86/native/libmtmd.so x86 @@ -542,8 +525,8 @@ x86_64 - lib/x86_64/libllava_shared.so + Include="$(MSBuildThisFileDirectory)runtimes/deps/android-x86_64/libmtmd.so"> + lib/x86_64/libmtmd.so x86_64 @@ -571,8 +554,8 @@ arm64-v8a - lib/arm64-v8a/libllava_shared.so + Include="$(MSBuildThisFileDirectory)runtimes/deps/android-arm64-v8a/libmtmd.so"> + lib/arm64-v8a/libmtmd.so arm64-v8a diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index 10476a121..15278427f 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -57,7 +57,7 @@ - ceda28ef8e310_v2 + 11dd5a44eb180e diff --git a/LLama/LLamaStatelessExecutor.cs b/LLama/LLamaStatelessExecutor.cs index 817738895..8f9b40cc3 100644 --- a/LLama/LLamaStatelessExecutor.cs +++ b/LLama/LLamaStatelessExecutor.cs @@ -158,8 +158,8 @@ public async IAsyncEnumerable InferAsync(string prompt, IInferenceParams var n_left = n_past - tokensKeep; var n_discard = n_left / 2; - NativeApi.llama_kv_self_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep , tokensKeep + n_discard); - NativeApi.llama_kv_self_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard); + Context.NativeHandle.MemorySequenceRemove(LLamaSeqId.Zero, tokensKeep, tokensKeep + n_discard); + Context.NativeHandle.MemorySequenceAdd(LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard); n_past -= n_discard; } diff --git a/LLama/Native/DecodeResult.cs b/LLama/Native/DecodeResult.cs index 8bf72c046..b0548b43e 100644 --- a/LLama/Native/DecodeResult.cs +++ b/LLama/Native/DecodeResult.cs @@ -1,4 +1,4 @@ -namespace LLama.Native; +namespace LLama.Native; /// /// Return codes from llama_decode @@ -6,9 +6,9 @@ public enum DecodeResult { /// - /// An unspecified error + /// Input batch was invalid /// - Error = -1, + InvalidInputBatch = -1, /// /// Ok. 
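Not part of the patch: an illustrative C# sketch of how calling code can react to the renamed result value, mirroring the BatchedExecutorSimple change above; the `executor.Infer()` call and surrounding scope are assumed here, not defined by this diff.

    // Treat anything other than Ok as an error; the old catch-all Error (-1) is now InvalidInputBatch.
    var decodeResult = await executor.Infer();
    if (decodeResult == DecodeResult.InvalidInputBatch)
        throw new ArgumentException("The input batch passed to llama_decode was invalid.");
    if (decodeResult != DecodeResult.Ok)
        throw new Exception($"Error occurred while inferring: {decodeResult}");
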
diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs index 75b6be4bd..76f5d6c77 100644 --- a/LLama/Native/LLamaContextParams.cs +++ b/LLama/Native/LLamaContextParams.cs @@ -101,7 +101,7 @@ public struct LLamaContextParams public uint yarn_orig_ctx; /// - /// defragment the KV cache if holes/size > defrag_threshold, Set to < 0 to disable (default) + /// defragment the KV cache if holes/size > defrag_threshold, Set to <= 0 to disable (default) /// public float defrag_threshold; @@ -127,10 +127,17 @@ public struct LLamaContextParams /// public GGMLType type_v; + //todo: implement abort callback support + /// + /// ggml_abort_callback + /// + public IntPtr abort_callback; + + //todo: implement abort callback support /// - /// Deprecated! + /// User data passed into abort_callback /// - private sbyte _logits_all; + public IntPtr abort_callback_user_data; /// /// if true, extract embeddings (together with logits) @@ -172,17 +179,40 @@ public bool no_perf } private sbyte _no_perf; - //todo: implement abort callback support /// - /// ggml_abort_callback + /// offload host tensor operations to device /// - public IntPtr abort_callback; + public bool op_offload + { + readonly get => Convert.ToBoolean(_op_offload); + set => _op_offload = Convert.ToSByte(value); + } + private sbyte _op_offload; - //todo: implement abort callback support /// - /// User data passed into abort_callback + /// use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) + /// NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases + /// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 /// - public IntPtr abort_callback_user_data; + public bool swa_full + { + readonly get => Convert.ToBoolean(_swa_full); + set => _swa_full = Convert.ToSByte(value); + } + private sbyte _swa_full; + + /// + /// use a unified buffer across the input sequences when computing the attention. + /// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix + ///
+ /// ref: https://github.com/ggml-org/llama.cpp/pull/14363 + ///
+ public bool kv_unified + { + readonly get => Convert.ToBoolean(_kv_unified); + set => _kv_unified = Convert.ToSByte(value); + } + private sbyte _kv_unified; /// /// Get the default LLamaContextParams diff --git a/LLama/Native/LLamaKvCache.cs b/LLama/Native/LLamaKvCache.cs deleted file mode 100644 index 4a402f9ed..000000000 --- a/LLama/Native/LLamaKvCache.cs +++ /dev/null @@ -1,10 +0,0 @@ -namespace LLama.Native; - -/// -/// C# representation of llama_kv_cache -/// -/// llama_kv_cache -internal struct LLamaKvCacheNative -{ - -} \ No newline at end of file diff --git a/LLama/Native/LLamaKvCacheView.cs b/LLama/Native/LLamaKvCacheView.cs deleted file mode 100644 index 2fa513324..000000000 --- a/LLama/Native/LLamaKvCacheView.cs +++ /dev/null @@ -1,241 +0,0 @@ -using System; - -namespace LLama.Native; - -/// -/// A safe handle for a LLamaKvCacheView -/// -public sealed class LLamaKvCacheViewSafeHandle - : SafeLLamaHandleBase -{ - private readonly SafeLLamaContextHandle _ctx; - private NativeLLamaKvCacheView _view; - - /// - /// Number of KV cache cells. This will be the same as the context size. - /// - public int CellCount => GetNativeView().n_cells; - - /// - /// Get the total number of tokens in the KV cache. - /// - /// For example, if there are two populated - /// cells, the first with 1 sequence id in it and the second with 2 sequence - /// ids then you'll have 3 tokens. - /// - public int TokenCount => GetNativeView().token_count; - - /// - /// Maximum number of sequences visible for a cell. There may be more sequences than this - /// in reality, this is simply the maximum number this view can see. - /// - public int MaxSequenceCount => GetNativeView().n_seq_max; - - /// - /// Number of populated cache cells - /// - public int UsedCellCount => GetNativeView().used_cells; - - /// - /// Maximum contiguous empty slots in the cache. - /// - public int MaxContiguous => GetNativeView().max_contiguous; - - /// - /// Index to the start of the MaxContiguous slot range. Can be negative when cache is full. - /// - public int MaxContiguousIdx => GetNativeView().max_contiguous; - - /// - /// Initialize a LLamaKvCacheViewSafeHandle which will call `llama_kv_cache_view_free` when disposed - /// - /// - /// - private LLamaKvCacheViewSafeHandle(SafeLLamaContextHandle ctx, NativeLLamaKvCacheView view) - : base((IntPtr)1, true) - { - _ctx = ctx; - _view = view; - } - - /// - /// Allocate a new KV cache view which can be used to inspect the KV cache - /// - /// - /// The maximum number of sequences visible in this view per cell - /// - public static LLamaKvCacheViewSafeHandle Allocate(SafeLLamaContextHandle ctx, int maxSequences) - { - // Allocate the view - var view = llama_kv_cache_view_init(ctx, maxSequences); - var handle = new LLamaKvCacheViewSafeHandle(ctx, view); - - // Update the view so it has valid data after allocation. - handle.Update(); - - return handle; - } - - /// - protected override bool ReleaseHandle() - { - llama_kv_cache_view_free(ref _view); - SetHandle(IntPtr.Zero); - - return true; - } - - /// - /// Read the current KV cache state into this view. 
- /// - public void Update() - { - llama_kv_cache_view_update(_ctx, ref _view); - } - - /// - /// Get the raw KV cache view - /// - /// - private ref NativeLLamaKvCacheView GetNativeView() - { - if (IsClosed) - throw new ObjectDisposedException("Cannot access LLamaKvCacheViewSafeHandle after is has been disposed"); - - return ref _view; - } - - /// - /// Get the cell at the given index - /// - /// The index of the cell [0, CellCount) - /// Data about the cell at the given index - /// Thrown if index is out of range (0 <= index < CellCount) - public LLamaPos GetCell(int index) - { - var view = GetNativeView(); - - if (index < 0) - throw new ArgumentOutOfRangeException(nameof(index), "Cell index must be >= 0"); - if (index >= view.n_cells) - throw new ArgumentOutOfRangeException(nameof(index), "Cell index must be < CellCount"); - - unsafe - { - return view.cells[index].pos; - } - } - - /// - /// Get all of the sequences assigned to the cell at the given index. This will contain entries - /// sequences even if the cell actually has more than that many sequences, allocate a new view with a larger maxSequences parameter - /// if necessary. Invalid sequences will be negative values. - /// - /// The index of the cell [0, CellCount) - /// A span containing the sequences assigned to this cell - /// Thrown if index is out of range (0 <= index < CellCount) - public Span GetCellSequences(int index) - { - var view = GetNativeView(); - - if (index < 0) - throw new ArgumentOutOfRangeException(nameof(index), "Cell index must be >= 0"); - if (index >= view.n_cells) - throw new ArgumentOutOfRangeException(nameof(index), "Cell index must be < CellCount"); - - unsafe - { - return new Span(&view.cells_sequences[index * view.n_seq_max], view.n_seq_max); - } - } - - #region native API - /// - /// Create an empty KV cache view. (use only for debugging purposes) - /// - /// - /// - /// - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern NativeLLamaKvCacheView llama_kv_cache_view_init(SafeLLamaContextHandle ctx, int n_seq_max); - - /// - /// Free a KV cache view. (use only for debugging purposes) - /// - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern void llama_kv_cache_view_free(ref NativeLLamaKvCacheView view); - - /// - /// Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) - /// - /// - /// - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern void llama_kv_cache_view_update(SafeLLamaContextHandle ctx, ref NativeLLamaKvCacheView view); - - /// - /// Information associated with an individual cell in the KV cache view (llama_kv_cache_view_cell) - /// - [StructLayout(LayoutKind.Sequential)] - private struct NativeLLamaKvCacheViewCell - { - /// - /// The position for this cell. Takes KV cache shifts into account. - /// May be negative if the cell is not populated. - /// - public LLamaPos pos; - } - - /// - /// An updateable view of the KV cache (llama_kv_cache_view) - /// - [StructLayout(LayoutKind.Sequential)] - private unsafe struct NativeLLamaKvCacheView - { - /// - /// Number of KV cache cells. This will be the same as the context size. - /// - public int n_cells; - - /// - /// Maximum number of sequences that can exist in a cell. It's not an error - /// if there are more sequences in a cell than this value, however they will - /// not be visible in the view cells_sequences. 
- /// - public int n_seq_max; - - /// - /// Number of tokens in the cache. For example, if there are two populated - /// cells, the first with 1 sequence id in it and the second with 2 sequence - /// ids then you'll have 3 tokens. - /// - public int token_count; - - /// - /// Number of populated cache cells. - /// - public int used_cells; - - /// - /// Maximum contiguous empty slots in the cache. - /// - public int max_contiguous; - - /// - /// Index to the start of the max_contiguous slot range. Can be negative - /// when cache is full. - /// - public int max_contiguous_idx; - - /// - /// Information for an individual cell. - /// - public NativeLLamaKvCacheViewCell* cells; - - /// - /// The sequences for each cell. There will be n_seq_max items per cell. - /// - public LLamaSeqId* cells_sequences; - } - #endregion -} \ No newline at end of file diff --git a/LLama/Native/LLamaModelQuantizeParams.cs b/LLama/Native/LLamaModelQuantizeParams.cs index d31b1bbc8..857f0cfb9 100644 --- a/LLama/Native/LLamaModelQuantizeParams.cs +++ b/LLama/Native/LLamaModelQuantizeParams.cs @@ -94,6 +94,11 @@ public bool keep_split /// public IntPtr tensor_types; + /// + /// Pointer to vector containing layer indices to prune + /// + public IntPtr prune_layers; + /// /// Create a LLamaModelQuantizeParams with default values /// diff --git a/LLama/Native/LLamaNativeBatch.cs b/LLama/Native/LLamaNativeBatch.cs index 41817604a..e65fb5000 100644 --- a/LLama/Native/LLamaNativeBatch.cs +++ b/LLama/Native/LLamaNativeBatch.cs @@ -1,7 +1,7 @@ namespace LLama.Native; /// -/// Input data for llama_decode +/// Input data for llama_encode/llama_decode /// A llama_batch object can contain input about one or many sequences /// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens /// @@ -25,7 +25,7 @@ public unsafe struct LLamaNativeBatch /// /// the positions of the respective token in the sequence - /// (if set to NULL, the token position will be tracked automatically by llama_decode) + /// (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode) /// public LLamaPos* pos; @@ -41,8 +41,12 @@ public unsafe struct LLamaNativeBatch public LLamaSeqId** seq_id; /// - /// if zero, the logits for the respective token will not be output - /// (if set to NULL, only the logits for last token will be returned) + /// if zero, the logits for the respective token will not be output. + /// If set to NULL: + /// + /// If embeddings: all tokens are output + /// If not: only the last token is output + /// /// public byte* logits; } \ No newline at end of file diff --git a/LLama/Native/LLamaTimings.cs b/LLama/Native/LLamaTimings.cs index 25384cca4..24ab925e7 100644 --- a/LLama/Native/LLamaTimings.cs +++ b/LLama/Native/LLamaTimings.cs @@ -38,6 +38,11 @@ public struct LLamaPerfContextTimings /// number of eval calls ///
private int n_eval; + + /// + /// number of times a ggml compute graph had been reused + /// + private int n_reused; /// /// Timestamp when reset was last called diff --git a/LLama/Native/LLamaVocabNative.cs b/LLama/Native/LLamaVocabNative.cs index d4f990a81..05347aa4e 100644 --- a/LLama/Native/LLamaVocabNative.cs +++ b/LLama/Native/LLamaVocabNative.cs @@ -94,6 +94,14 @@ internal struct LLamaVocabNative [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern unsafe LLamaToken llama_vocab_pad(LLamaVocabNative* vocab); + /// + /// mask + /// + /// + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern unsafe LLamaToken llama_vocab_mask(LLamaVocabNative* vocab); + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern unsafe LLamaToken llama_vocab_fim_pre(LLamaVocabNative* vocab); @@ -119,4 +127,8 @@ internal struct LLamaVocabNative [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] [return: MarshalAs(UnmanagedType.U1)] public static extern unsafe bool llama_vocab_get_add_eos(LLamaVocabNative* vocab); + + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + [return: MarshalAs(UnmanagedType.U1)] + public static extern unsafe bool llama_vocab_get_add_sep(LLamaVocabNative* vocab); } \ No newline at end of file diff --git a/LLama/Native/LLamaVocabPreType.cs b/LLama/Native/LLamaVocabPreType.cs deleted file mode 100644 index 48ab5585b..000000000 --- a/LLama/Native/LLamaVocabPreType.cs +++ /dev/null @@ -1,47 +0,0 @@ -namespace LLama.Native; - -/// -/// -/// -/// llama_vocab_pre_type -// ReSharper disable InconsistentNaming -internal enum LLamaVocabPreType -{ - Default = 0, - - LLAMA3 = 1, - DEEPSEEK_LLM = 2, - DEEPSEEK_CODER = 3, - FALCON = 4, - MPT = 5, - STARCODER = 6, - GPT2 = 7, - REFACT = 8, - COMMAND_R = 9, - STABLELM2 = 10, - QWEN2 = 11, - OLMO = 12, - DBRX = 13, - SMAUG = 14, - PORO = 15, - CHATGLM3 = 16, - CHATGLM4 = 17, - VIKING = 18, - JAIS = 19, - TEKKEN = 20, - SMOLLM = 21, - CODESHELL = 22, - BLOOM = 23, - GPT3_FINNISH = 24, - EXAONE = 25, - CHAMELEON = 26, - MINERVA = 27, - DEEPSEEK3_LLM = 28, - GPT4O = 29, - SUPERBPE = 30, - TRILLION = 31, - BAILINGMOE = 32, - LLAMA4 = 33, - PIXTRAL = 34, -} -// ReSharper restore InconsistentNaming \ No newline at end of file diff --git a/LLama/Native/LLamaVocabType.cs b/LLama/Native/LLamaVocabType.cs index bd7d704d9..1b5c6b970 100644 --- a/LLama/Native/LLamaVocabType.cs +++ b/LLama/Native/LLamaVocabType.cs @@ -35,4 +35,9 @@ public enum LLamaVocabType /// RWKV tokenizer based on greedy tokenization /// RWKV = 5, + + /// + /// PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming + /// + PLAMO2 = 6 } \ No newline at end of file diff --git a/LLama/Native/Load/NativeLibraryUtils.cs b/LLama/Native/Load/NativeLibraryUtils.cs index 9f6457cd1..9ec996a20 100644 --- a/LLama/Native/Load/NativeLibraryUtils.cs +++ b/LLama/Native/Load/NativeLibraryUtils.cs @@ -219,7 +219,9 @@ public static void GetPlatformPathParts(OSPlatform platform, out string os, out { if (platform == OSPlatform.Windows) { - os = "win-x64"; + os = System.Runtime.Intrinsics.Arm.ArmBase.Arm64.IsSupported + ? 
"win-arm64" + : "win-x64"; fileExtension = ".dll"; libPrefix = ""; return; diff --git a/LLama/Native/Load/NativeLibraryWithAvx.cs b/LLama/Native/Load/NativeLibraryWithAvx.cs index e6cbd86f3..3296fac0f 100644 --- a/LLama/Native/Load/NativeLibraryWithAvx.cs +++ b/LLama/Native/Load/NativeLibraryWithAvx.cs @@ -50,7 +50,7 @@ public IEnumerable Prepare(SystemInfo systemInfo, NativeLogConfig.LLamaL private string? GetAvxPath(SystemInfo systemInfo, AvxLevel avxLevel, NativeLogConfig.LLamaLogCallback? logCallback) { NativeLibraryUtils.GetPlatformPathParts(systemInfo.OSPlatform, out var os, out var fileExtension, out var libPrefix); - if (os != "linux-arm64"){ + if (os != "linux-arm64" && os != "win-arm64"){ var avxStr = NativeLibraryConfig.AvxLevelToString(avxLevel); if (!string.IsNullOrEmpty(avxStr)) avxStr += "/"; diff --git a/LLama/Native/NativeApi.Memory.cs b/LLama/Native/NativeApi.Memory.cs new file mode 100644 index 000000000..24a406ab2 --- /dev/null +++ b/LLama/Native/NativeApi.Memory.cs @@ -0,0 +1,104 @@ +using System; + +namespace LLama.Native; + +public static partial class NativeApi +{ + /// + /// Clear the memory contents. If data == true, the data buffers will also be cleared together with the metadata + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern void llama_memory_clear(IntPtr /* llama_memory_t */ mem, [MarshalAs(UnmanagedType.U1)] bool data); + + /// + /// Removes all tokens that belong to the specified sequence and have positions in [p0, p1) + /// + /// + /// + /// + /// + /// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + [return: MarshalAs(UnmanagedType.U1)] + public static extern bool llama_memory_seq_rm(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq, LLamaPos p0, LLamaPos p1); + + /// + /// Copy all tokens that belong to the specified sequence to another sequence + /// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence + /// + /// + /// + /// + /// p0 < 0 : [0, p1] + /// p1 < 0 : [p0, inf) + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + internal static extern void llama_memory_seq_cp(IntPtr /* llama_memory_t */ mem, LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1); + + /// + /// Removes all tokens that do not belong to the specified sequence + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + internal static extern void llama_memory_seq_keep(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq); + + /// + /// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) + /// + /// + /// + /// p0 < 0 : [0, p1] + /// p1 < 0 : [p0, inf) + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + internal static extern void llama_memory_seq_add(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta); + + /// + /// Integer division of the positions by factor of `d > 1` + ///
+ /// p0 < 0 : [0, p1] + ///
+ /// p1 < 0 : [p0, inf) + ///
+ /// + /// + /// p0 < 0 : [0, p1] + /// p1 < 0 : [p0, inf) + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + internal static extern void llama_memory_seq_div(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d); + + /// + /// Returns the smallest position present in the memory for the specified sequence. + /// This is typically non-zero only for SWA caches. + /// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory. + /// Return -1 if the sequence is empty. + /// + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + internal static extern LLamaPos llama_memory_seq_pos_min(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq); + + /// + /// Returns the largest position present in the memory for the specified sequence. + /// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory. + /// Return -1 if the sequence is empty. + /// + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + internal static extern LLamaPos llama_memory_seq_pos_max(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq); + + /// + /// Check if the memory supports shifting + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + [return: MarshalAs(UnmanagedType.U1)] + internal static extern bool llama_memory_can_shift(IntPtr /* llama_memory_t */ mem); +} \ No newline at end of file diff --git a/LLama/Native/NativeApi.Training.cs b/LLama/Native/NativeApi.Training.cs new file mode 100644 index 000000000..ea1370b57 --- /dev/null +++ b/LLama/Native/NativeApi.Training.cs @@ -0,0 +1,35 @@ +namespace LLama.Native; + +public static partial class NativeApi +{ + ///// + ///// function that returns whether or not a given tensor contains trainable parameters + ///// + ///// + ///// + ///// + //[return: MarshalAs(UnmanagedType.U1)] + //private unsafe delegate bool llama_opt_param_filter(void* ggml_tensor, void* userdata); + + //private unsafe struct llama_opt_params + //{ + // uint n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0 + + // llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters + // void* param_filter_ud; // userdata for determining which tensors contain trainable parameters + + // ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters + + // void* get_opt_pars_ud; // userdata for calculating optimizer parameters + //}; + + //internal static extern void llama_opt_init(SafeLLamaContextHandle ctx, SafeLLamaContextHandle model, llama_opt_params @params); + + //internal static extern void llama_opt_epoch(SafeLLamaContextHandle ct, + // ggml_opt_dataset_t dataset, + // ggml_opt_result_t result_train, + // ggml_opt_result_t result_eval, + // int64_t idata_split, + // ggml_opt_epoch_callback callback_train, + // ggml_opt_epoch_callback callback_eval); +} \ No newline at end of file diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index 87cf02c78..db9e928bd 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -32,6 +32,13 @@ public static void llama_empty_call() [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern long llama_max_devices(); + /// + /// Maximum number of parallel sequences + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] 
+ public static extern long llama_max_parallel_sequences(); + /// /// Check if memory mapping is supported /// @@ -125,7 +132,7 @@ public static void llama_empty_call() public static extern void llama_set_causal_attn(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool causalAttn); /// - /// Set whether the model is in embeddings mode or not. + /// Set whether the context outputs embeddings or not /// /// /// If true, embeddings will be returned but logits will not @@ -237,7 +244,7 @@ public static int llama_token_to_piece(SafeLlamaModelHandle.Vocabulary vocab, LL /// add_special Allow to add BOS and EOS tokens if model is configured to do so. /// Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space. /// Returns the number of tokens on success, no more than n_max_tokens. - /// Returns a negative number on failure - the number of tokens that would have been returned + /// Returns a negative number on failure - the number of tokens that would have been returned. Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit) /// [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] internal static extern unsafe int llama_tokenize(LLamaVocabNative* model, byte* text, int text_len, LLamaToken* tokens, int n_max_tokens, [MarshalAs(UnmanagedType.U1)] bool add_special, [MarshalAs(UnmanagedType.U1)] bool parse_special); @@ -266,111 +273,6 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback) NativeLogConfig.llama_log_set(logCallback); } - /// - /// Returns the number of tokens in the KV cache (slow, use only for debug) - /// If a KV cell has multiple sequences assigned to it, it will be counted multiple times - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern int llama_kv_self_n_tokens(SafeLLamaContextHandle ctx); - - /// - /// Returns the number of used KV cells (i.e. have at least one sequence assigned to them) - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern int llama_kv_self_used_cells(SafeLLamaContextHandle ctx); - - /// - /// Clear the KV cache. Both cell info is erased and KV data is zeroed - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern void llama_kv_self_clear(SafeLLamaContextHandle ctx); - - [Obsolete("Use `llama_kv_self_clear` instead")] - /// - /// Clear the KV cache. Both cell info is erased and KV data is zeroed - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern void llama_kv_cache_clear(SafeLLamaContextHandle ctx); - - /// - /// Removes all tokens that belong to the specified sequence and have positions in [p0, p1) - /// - /// - /// - /// - /// - /// Returns false if a partial sequence cannot be removed. 
Removing a whole sequence never fails - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - [return: MarshalAs(UnmanagedType.U1)] - public static extern bool llama_kv_self_seq_rm(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1); - - /// - /// Copy all tokens that belong to the specified sequence to another sequence - /// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence - /// - /// - /// - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern void llama_kv_self_seq_cp(SafeLLamaContextHandle ctx, LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1); - - /// - /// Removes all tokens that do not belong to the specified sequence - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern void llama_kv_self_seq_keep(SafeLLamaContextHandle ctx, LLamaSeqId seq); - - /// - /// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) - /// If the KV cache is RoPEd, the KV data is updated accordingly: - /// - lazily on next llama_decode() - /// - explicitly with llama_kv_self_update() - /// - /// - /// - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern void llama_kv_self_seq_add(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta); - - /// - /// Integer division of the positions by factor of `d > 1` - /// If the KV cache is RoPEd, the KV data is updated accordingly: - /// - lazily on next llama_decode() - /// - explicitly with llama_kv_self_update() - ///
- /// p0 < 0 : [0, p1] - ///
- /// p1 < 0 : [p0, inf) - ///
- /// - /// - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern void llama_kv_self_seq_div(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d); - - /// - /// Returns the largest position present in the KV cache for the specified sequence - /// - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern LLamaPos llama_kv_self_seq_pos_max(SafeLLamaContextHandle ctx, LLamaSeqId seq); - /// /// Allocates a batch of tokens on the heap /// Each token can be assigned up to n_seq_max sequence ids diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs index 467dd98e7..e26619b26 100644 --- a/LLama/Native/SafeLLamaContextHandle.cs +++ b/LLama/Native/SafeLLamaContextHandle.cs @@ -147,7 +147,9 @@ static SafeLLamaContextHandle() /// /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + // ReSharper disable InconsistentNaming private static extern unsafe void llama_set_abort_callback(SafeLLamaContextHandle ctx, GgmlAbortCallback abort_callback, void* abort_callback_data); + // ReSharper restore InconsistentNaming /// /// If this returns true computation is cancelled @@ -157,20 +159,27 @@ static SafeLLamaContextHandle() private unsafe delegate bool GgmlAbortCallback(void* data); /// + /// Process a batch of tokens. + /// Requires the context to have a memory. + /// For encode-decoder contexts, processes the batch using the decoder. + /// Positive return values does not mean a fatal error, but rather a warning. + /// Upon fatal-error or abort, the ubatches that managed to be been processed will remain in the memory state of the context + /// To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max() + /// Upon other return values, the memory state is restored to the state before this call + /// 0 - success + /// 1 - could not find a memory slot for the batch (try reducing the size of the batch or increase the context) + /// 2 - aborted (processed ubatches will remain in the context's memory) + /// -1 - invalid input batch + /// < -1 - fatal error (processed ubatches will remain in the context's memory) /// - /// - /// - /// Positive return values does not mean a fatal error, but rather a warning:
- /// - 0: success
- /// - 1: could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
- /// - < 0: error
- ///
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern int llama_decode(SafeLLamaContextHandle ctx, LLamaNativeBatch batch); /// - /// Processes a batch of tokens with the encoder part of the encoder-decoder model. Stores the encoder output - /// internally for later use by the decoder cross-attention layers. + /// Process a batch of tokens. + /// In contrast to llama_decode() - this call does not use KV cache. + /// For encode-decoder contexts, processes the batch using the encoder. + /// Can store the encoder output internally for later use by the decoder's cross-attention layers. /// /// /// @@ -186,7 +195,9 @@ static SafeLLamaContextHandle() /// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + // ReSharper disable InconsistentNaming private static extern void llama_set_n_threads(SafeLLamaContextHandle ctx, int n_threads, int n_threads_batch); + // ReSharper restore InconsistentNaming /// /// Get the number of threads used for generation of a single token. @@ -250,7 +261,7 @@ static SafeLLamaContextHandle() private static extern uint llama_n_ubatch(SafeLLamaContextHandle ctx); /// - /// Returns the **actual** size in bytes of the state (logits, embedding and kv_cache). + /// Returns the **actual** size in bytes of the state (logits, embedding and memory). /// Only use when saving the state, not when restoring it, otherwise the size may be too small. /// /// @@ -280,13 +291,13 @@ static SafeLLamaContextHandle() private static extern unsafe nuint llama_state_set_data(SafeLLamaContextHandle ctx, byte* src, nuint size); /// - /// Get the exact size needed to copy the KV cache of a single sequence + /// Get the exact size needed to copy the state of a single sequence /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern nuint llama_state_seq_get_size(SafeLLamaContextHandle ctx, LLamaSeqId seqId); /// - /// Copy the KV cache of a single sequence into the specified buffer + /// Copy the state of a single sequence into the specified buffer /// /// /// @@ -310,31 +321,6 @@ static SafeLLamaContextHandle() [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern unsafe nuint llama_state_seq_set_data(SafeLLamaContextHandle ctx, byte* src, nuint size, LLamaSeqId destSeqId); - /// - /// Defragment the KV cache. This will be applied: - /// - lazily on next llama_decode() - /// - explicitly with llama_kv_self_update() - /// - /// - /// - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern void llama_kv_self_defrag(SafeLLamaContextHandle ctx); - - /// - /// Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
- /// - /// - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern void llama_kv_self_update(SafeLLamaContextHandle ctx); - - /// - /// Check if the context supports KV cache shifting - /// - /// - /// - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern bool llama_kv_self_can_shift(SafeLLamaContextHandle ctx); - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern LLamaPerfContextTimings llama_perf_context(SafeLLamaContextHandle ctx); @@ -372,7 +358,7 @@ static SafeLLamaContextHandle() /// /// Get the embeddings for a sequence id. /// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE - /// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence + /// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence /// otherwise: float[n_embd] (1-dimensional) /// /// A pointer to the first float in an embedding, length = ctx.EmbeddingSize @@ -388,7 +374,7 @@ static SafeLLamaContextHandle() private static extern unsafe float* llama_get_embeddings_ith(SafeLLamaContextHandle ctx, int i); [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern LLamaKvCacheNative llama_get_kv_self(SafeLLamaContextHandle ctx); + private static extern IntPtr llama_get_memory(SafeLLamaContextHandle ctx); /// /// Set whether the model is in warmup mode or not @@ -580,7 +566,7 @@ public void Synchronize() /// internally for later use by the decoder cross-attention layers. /// /// - /// 0 = success
< 0 = error (the KV cache state is restored to the state before this call)
+ /// 0 = success
< 0 = error (the memory state is restored to the state before this call)
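For illustration only (not taken from this patch), the encoder pass described above runs once over the prompt batch of an encoder-decoder model before any decoding; this minimal sketch only assumes the Encode method and DecodeResult.Ok shown elsewhere in this file:

    // Minimal sketch: run the encoder pass and fail loudly on any non-success result.
    static void EncodePrompt(SafeLLamaContextHandle ctx, LLamaBatch promptBatch)
    {
        // Stores the encoder output internally for the decoder's cross-attention.
        var result = ctx.Encode(promptBatch);
        if (result != DecodeResult.Ok)
            throw new InvalidOperationException($"llama_encode failed with {result}");
    }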
public DecodeResult Encode(LLamaBatch batch) { if (batch.TokenCount == 0) @@ -592,13 +578,19 @@ public DecodeResult Encode(LLamaBatch batch) } /// + /// Process a batch of tokens. + /// Requires the context to have a memory. + /// For encoder-decoder contexts, processes the batch using the decoder. + /// Positive return values do not mean a fatal error, but rather a warning. + /// Upon fatal error or abort, the ubatches that managed to be processed will remain in the memory state of the context. + /// To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max(). + /// Upon other return values, the memory state is restored to the state before this call. + /// 0 - success + /// 1 - could not find a memory slot for the batch (try reducing the size of the batch or increase the context) + /// 2 - aborted (processed ubatches will remain in the context's memory) + /// -1 - invalid input batch + /// < -1 - fatal error (processed ubatches will remain in the context's memory) /// - /// - /// Positive return values does not mean a fatal error, but rather a warning:
- /// - 0: success
- /// - 1: could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
- /// - < 0: error (the KV cache state is restored to the state before this call)
- ///
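By way of illustration (not part of the patch), the return contract documented above could be consumed roughly like this; DecodeResult.NoKvSlot is assumed to be the enum member corresponding to return value 1 and may differ in the real enum:

    // Minimal sketch of handling the three classes of llama_decode outcomes.
    static void DecodeChecked(SafeLLamaContextHandle ctx, LLamaBatch batch)
    {
        var result = ctx.Decode(batch);
        switch (result)
        {
            case DecodeResult.Ok:
                // Logits for the batch entries that requested them are now available.
                break;
            case DecodeResult.NoKvSlot:
                // A warning rather than an error: no memory slot was found for the batch.
                // Retry with a smaller batch, or create the context with a larger n_ctx.
                break;
            default:
                // Negative values are errors (see the return values documented above).
                throw new InvalidOperationException($"llama_decode failed with {result}");
        }
    }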
public DecodeResult Decode(LLamaBatch batch) { if (batch.TokenCount == 0) @@ -617,6 +609,7 @@ public DecodeResult Decode(LLamaBatch batch) /// /// /// A tuple, containing the decode result and the number of tokens that have not been decoded yet. + // ReSharper disable once InconsistentNaming internal (DecodeResult, int) Decode(List<LLamaToken> tokens, LLamaSeqId id, LLamaBatch batch, ref int n_past) { if (tokens.Count == 0) @@ -645,15 +638,21 @@ public DecodeResult Decode(LLamaBatch batch) return (DecodeResult.Ok, 0); } - + /// + /// Process a batch of tokens. + /// Requires the context to have a memory. + /// For encoder-decoder contexts, processes the batch using the decoder. + /// Positive return values do not mean a fatal error, but rather a warning. + /// Upon fatal error or abort, the ubatches that managed to be processed will remain in the memory state of the context. + /// To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max(). + /// Upon other return values, the memory state is restored to the state before this call. + /// 0 - success + /// 1 - could not find a memory slot for the batch (try reducing the size of the batch or increase the context) + /// 2 - aborted (processed ubatches will remain in the context's memory) + /// -1 - invalid input batch + /// < -1 - fatal error (processed ubatches will remain in the context's memory) /// - /// - /// Positive return values does not mean a fatal error, but rather a warning:
- /// - 0: success
- /// - 1: could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
- /// - < 0: error
- ///
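Again purely illustrative (not from the patch): when a decode aborts or fails fatally, the MemorySequenceMinPosition/MemorySequenceMaxPosition helpers introduced below can show how far a sequence actually got before rolling back. The LLamaPos.Value field and the implicit int-to-LLamaPos conversions used here are assumptions about the surrounding types:

    // Minimal rollback sketch after an aborted or fatal decode.
    static void DecodeWithRollback(SafeLLamaContextHandle ctx, LLamaBatch batch, LLamaSeqId seq, LLamaPos lastCommitted)
    {
        var result = ctx.Decode(batch);
        if ((int)result == 2 || (int)result < -1)
        {
            // Abort or fatal error: already-processed ubatches remain in the context's
            // memory, so inspect what is actually there before deciding what to resubmit.
            var minPos = ctx.MemorySequenceMinPosition(seq);
            var maxPos = ctx.MemorySequenceMaxPosition(seq);
            System.Console.WriteLine($"sequence now spans [{minPos.Value}, {maxPos.Value}] in memory");

            // Drop everything after the last position the caller knows was fully committed
            // (a negative p1 removes to the end of the sequence).
            ctx.MemorySequenceRemove(seq, lastCommitted.Value + 1, -1);
        }
    }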
public DecodeResult Decode(LLamaBatchEmbeddings batch) { if (batch.EmbeddingsCount == 0) @@ -675,7 +674,7 @@ public nuint GetStateSize() } /// - /// Get the size of the KV cache for a single sequence ID, when saved as bytes + /// Get the size of the memory state for a single sequence ID, when saved as bytes /// /// /// @@ -759,66 +758,20 @@ public void ResetTimings() } #endregion - #region KV Cache Management - /// - /// Check if the context supports KV cache shifting - /// - public bool KvCacheCanShift => llama_kv_self_can_shift(this); + #region Memory Management /// - /// Apply KV cache updates (such as K-shifts, defragmentation, etc.) + /// Check if the context supports memory shifting /// - public void KvCacheUpdate() - { - llama_kv_self_update(this); - } + public bool MemoryCanShift => NativeApi.llama_memory_can_shift(llama_get_memory(this)); /// - /// Defragment the KV cache. This will be applied: - /// - lazily on next llama_decode() - /// - explicitly with llama_kv_self_update() + /// Clear the memory /// - /// - public void KvCacheDefrag() - { - llama_kv_self_defrag(this); - } - - /// - /// Get a new KV cache view that can be used to debug the KV cache - /// - /// - /// - public LLamaKvCacheViewSafeHandle KvCacheGetDebugView(int maxSequences = 4) - { - return LLamaKvCacheViewSafeHandle.Allocate(this, maxSequences); - } - - /// - /// Count the number of used cells in the KV cache (i.e. have at least one sequence assigned to them) - /// - /// - public int KvCacheCountCells() + /// If true, the data buffers will also be cleared together with the metadata + public void MemoryClear(bool data = true) { - return NativeApi.llama_kv_self_used_cells(this); - } - - /// - /// Returns the number of tokens in the KV cache (slow, use only for debug) - /// If a KV cell has multiple sequences assigned to it, it will be counted multiple times - /// - /// - public int KvCacheCountTokens() - { - return NativeApi.llama_kv_self_n_tokens(this); - } - - /// - /// Clear the KV cache - both cell info is erased and KV data is zeroed - /// - public void KvCacheClear() - { - NativeApi.llama_kv_self_clear(this); + NativeApi.llama_memory_clear(llama_get_memory(this), data); } /// @@ -827,54 +780,52 @@ public void KvCacheClear() /// /// /// - public void KvCacheRemove(LLamaSeqId seq, LLamaPos p0, LLamaPos p1) + public void MemorySequenceRemove(LLamaSeqId seq, LLamaPos p0, LLamaPos p1) { - NativeApi.llama_kv_self_seq_rm(this, seq, p0, p1); + NativeApi.llama_memory_seq_rm(llama_get_memory(this), seq, p0, p1); } /// /// Copy all tokens that belong to the specified sequence to another sequence. 
Note that - /// this does not allocate extra KV cache memory - it simply assigns the tokens to the + /// this does not allocate extra memory - it simply assigns the tokens to the /// new sequence /// /// /// /// /// - public void KvCacheSequenceCopy(LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1) + public void MemorySequenceCopy(LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1) { - NativeApi.llama_kv_self_seq_cp(this, src, dest, p0, p1); + NativeApi.llama_memory_seq_cp(llama_get_memory(this), src, dest, p0, p1); } /// /// Removes all tokens that do not belong to the specified sequence /// /// - public void KvCacheSequenceKeep(LLamaSeqId seq) + public void MemorySequenceKeep(LLamaSeqId seq) { - NativeApi.llama_kv_self_seq_keep(this, seq); + NativeApi.llama_memory_seq_keep(llama_get_memory(this), seq); } /// /// Adds relative position "delta" to all tokens that belong to the specified sequence - /// and have positions in [p0, p1. If the KV cache is RoPEd, the KV data is updated - /// accordingly + /// and have positions in [p0, p1) /// /// /// /// /// - public void KvCacheSequenceAdd(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta) + public void MemorySequenceAdd(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta) { - if (!KvCacheCanShift) - throw new InvalidOperationException("Cannot shift KV cache (KvCacheCanShift=False)"); + if (!MemoryCanShift) + throw new InvalidOperationException("Cannot shift memory (MemoryCanShift == false)"); - NativeApi.llama_kv_self_seq_add(this, seq, p0, p1, delta); + NativeApi.llama_memory_seq_add(llama_get_memory(this), seq, p0, p1, delta); } /// - /// Integer division of the positions by factor of `d > 1`. - /// If the KV cache is RoPEd, the KV data is updated accordingly.
+ /// Integer division of the positions by factor of `d > 1`.
/// p0 < 0 : [0, p1]
/// p1 < 0 : [p0, inf) ///
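As a sketch of how the shift-related members in this region (MemoryCanShift, MemorySequenceRemove, MemorySequenceAdd) fit together for a classic context shift; illustrative only, with the implicit int-to-LLamaPos conversions assumed:

    // Drop the oldest `discard` positions of a sequence (keeping the first `keepFirst`)
    // and slide the surviving tail back so new tokens fit in the context.
    static void ShiftContext(SafeLLamaContextHandle ctx, LLamaSeqId seq, int keepFirst, int discard)
    {
        if (!ctx.MemoryCanShift)
            throw new NotSupportedException("This context's memory does not support shifting");

        var end = ctx.MemorySequenceMaxPosition(seq).Value + 1;

        // Remove [keepFirst, keepFirst + discard) ...
        ctx.MemorySequenceRemove(seq, keepFirst, keepFirst + discard);
        // ... then shift the tail [keepFirst + discard, end) left by `discard` positions.
        ctx.MemorySequenceAdd(seq, keepFirst + discard, end, -discard);
    }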
@@ -882,22 +833,32 @@ public void KvCacheSequenceAdd(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int del /// /// /// - public void KvCacheSequenceDivide(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int divisor) + public void MemorySequenceDivide(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int divisor) { - if (!KvCacheCanShift) - throw new InvalidOperationException("Cannot shift KV cache (KvCacheCanShift=False)"); + if (!MemoryCanShift) + throw new InvalidOperationException("Cannot shift memory (MemoryCanShift == false)"); - NativeApi.llama_kv_self_seq_div(this, seq, p0, p1, divisor); + NativeApi.llama_memory_seq_div(llama_get_memory(this), seq, p0, p1, divisor); + } + + /// + /// Returns the smallest position present in memory for the specified sequence + /// + /// + /// + public LLamaPos MemorySequenceMinPosition(LLamaSeqId seq) + { + return NativeApi.llama_memory_seq_pos_min(llama_get_memory(this), seq); } /// - /// Returns the largest position present in the KV cache for the specified sequence + /// Returns the largest position present in memory for the specified sequence /// /// /// - public LLamaPos KvCacheMaxPosition(LLamaSeqId seq) + public LLamaPos MemorySequenceMaxPosition(LLamaSeqId seq) { - return NativeApi.llama_kv_self_seq_pos_max(this, seq); + return NativeApi.llama_memory_seq_pos_max(llama_get_memory(this), seq); } #endregion } diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index 16336f706..d335a1209 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -2,6 +2,7 @@ using System.Collections.Generic; using System.Diagnostics; using System.IO; +using System.Runtime.CompilerServices; using System.Text; using CommunityToolkit.HighPerformance.Buffers; using LLama.Exceptions; @@ -58,7 +59,12 @@ public sealed class SafeLlamaModelHandle /// /// Get the number of KV heads in this model /// - public int KVHeadCount => llama_model_n_head(this); + public int KVHeadCount => llama_model_n_head_kv(this); + + /// + /// Get the sliding window attention (SWA) window size of this model + /// + public int SWACount => llama_model_n_swa(this); /// /// Returns true if the model contains an encoder that requires llama_encode() call /// @@ -140,6 +146,20 @@ public static SafeLlamaModelHandle LoadFromFile(string modelPath, LLamaModelPara return handle; } + /// + /// Save this model to a file + /// + /// + public void SaveToFile(string modelPath) + { + // If the file already exists, delete it. llama.cpp would overwrite, but doing this in C# has better errors in + // case of inaccessible/read-only files.
if (File.Exists(modelPath)) + File.Delete(modelPath); + + llama_model_save_to_file(this, modelPath); + } + #region native API static SafeLlamaModelHandle() { @@ -324,6 +344,14 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern int llama_model_n_head_kv(SafeLlamaModelHandle model); + /// + /// + /// + /// + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern int llama_model_n_swa(SafeLlamaModelHandle model); + /// /// Get a string describing the model type /// @@ -398,6 +426,27 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern unsafe LLamaVocabNative* llama_model_get_vocab(SafeLlamaModelHandle model); + + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern void llama_model_save_to_file(SafeLlamaModelHandle model, string path); + + /// + /// Returns the number of classifier outputs (only valid for classifier models) + /// Undefined behavior for non-classifier models + /// + /// + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern uint llama_model_n_cls_out(SafeLlamaModelHandle model); + + /// + /// Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided + /// + /// + /// + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern string? llama_model_cls_label(SafeLlamaModelHandle model, uint i); #endregion #region LoRA @@ -771,6 +818,20 @@ public LLamaToken? Pad } } + /// + /// Get the masking token for this model + /// + public LLamaToken? Mask + { + get + { + unsafe + { + return Normalize(LLamaVocabNative.llama_vocab_mask(VocabNative)); + } + } + } + /// /// Get the sentence separator token for this model /// diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec index aeef403eb..db2693270 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec @@ -42,6 +42,13 @@ + + + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec deleted file mode 100644 index 6abd16ccc..000000000 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec +++ /dev/null @@ -1,34 +0,0 @@ - - - - LLamaSharp.Backend.Cuda11.Linux - $version$ - LLamaSharp.Backend.Cuda11.Linux - llama.cpp Authors - false - MIT - icon512.png - https://github.com/SciSharp/LLamaSharp - LLamaSharp.Backend.Cuda11.Linux contains the Linux binaries for LLamaSharp with Cuda11 support. - - Copyright 2023 The llama.cpp Authors. All rights reserved. - LLamaSharp LLama LLM GPT AI ChatBot SciSharp - - - - - - - - - - - - - - - - - - - diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec deleted file mode 100644 index a412e2e6f..000000000 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec +++ /dev/null @@ -1,34 +0,0 @@ - - - - LLamaSharp.Backend.Cuda11.Windows - $version$ - LLamaSharp.Backend.Cuda11.Windows - llama.cpp Authors - false - MIT - icon512.png - https://github.com/SciSharp/LLamaSharp - LLamaSharp.Backend.Cuda11.Windows contains the Windows binaries for LLamaSharp with Cuda11 support. - - Copyright 2023 The llama.cpp Authors. All rights reserved.
- LLamaSharp LLama LLM GPT AI ChatBot SciSharp - - - - - - - - - - - - - - - - - - - diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec deleted file mode 100644 index 5ac473914..000000000 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec +++ /dev/null @@ -1,28 +0,0 @@ - - - - LLamaSharp.Backend.Cuda11 - $version$ - LLamaSharp.Backend.Cuda11, the backend for LLamaSharp - llama.cpp Authors - false - MIT - icon512.png - https://github.com/SciSharp/LLamaSharp - LLamaSharp.Backend.Cuda11 is a backend for LLamaSharp to use with Cuda11. - - Copyright 2023 The llama.cpp Authors. All rights reserved. - LLamaSharp LLama LLM GPT AI ChatBot SciSharp - - - - - - - - - - - - - diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec index 687283221..b372f1e1d 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec @@ -22,12 +22,12 @@ - - - + + + - - + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec index 1fd01edb9..38c003236 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec @@ -22,12 +22,12 @@ - - - + + + - - + + diff --git a/llama.cpp b/llama.cpp index ceda28ef8..11dd5a44e 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit ceda28ef8e310a8dee60bf275077a3eedae8e36c +Subproject commit 11dd5a44eb180e1d69fac24d3852b5222d66fb7f
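Finally, an illustrative sketch (not part of the patch) of how the renamed sequence helpers compose; negative positions follow the "to the end" convention documented above, and the implicit int-to-LLamaPos conversions are assumed:

    // Fork a sequence, prune back to the main one, and optionally wipe the memory.
    static void ForkAndPrune(SafeLLamaContextHandle ctx, LLamaSeqId main, LLamaSeqId fork)
    {
        // Share the main sequence's tokens with a second sequence without copying any
        // data (0 and -1 span the whole sequence).
        ctx.MemorySequenceCopy(main, fork, 0, -1);

        // ... decode speculatively on `fork` here, then keep only the main sequence ...
        ctx.MemorySequenceKeep(main);

        // Or wipe the whole memory: metadata always, data buffers too by default.
        ctx.MemoryClear();
    }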