diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index ccc013653..5dff9b7b0 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -160,12 +160,16 @@ jobs:
include:
- build: 'noavx'
defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
+ arch: 'x64'
- build: 'avx2'
defines: ''
+ arch: 'x64'
- build: 'avx'
defines: '-DGGML_AVX2=OFF'
+ arch: 'x64'
- build: 'avx512'
defines: '-DGGML_AVX512=ON -DGGML_AVX512_VBMI=ON -DGGML_AVX512_VNNI=ON'
+ arch: 'x64'
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
@@ -187,31 +191,89 @@ jobs:
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\llama.dll
- name: llama-bin-win-${{ matrix.build }}-x64.dll
+ name: llama-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (ggml)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\ggml.dll
- name: ggml-bin-win-${{ matrix.build }}-x64.dll
+ name: ggml-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (ggml-base)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\ggml-base.dll
- name: ggml-base-bin-win-${{ matrix.build }}-x64.dll
+ name: ggml-base-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (ggml-cpu)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\ggml-cpu.dll
- name: ggml-cpu-bin-win-${{ matrix.build }}-x64.dll
+ name: ggml-cpu-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (mtmd)
uses: actions/upload-artifact@v4
with:
- path: .\build\bin\Release\mtmd.dll
- name: mtmd-bin-win-${{ matrix.build }}-x64.dll
+ path: .\build\bin\Release\llava_shared.dll
+ name: llava-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
+ if-no-files-found: error
+
+ compile-windows-arm64:
+ name: Compile (Windows ARM64)
+ strategy:
+ fail-fast: true
+ matrix:
+ include:
+ - build: 'arm64'
+ defines: '-DCMAKE_GENERATOR_PLATFORM=ARM64 -DGGML_NATIVE=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
+ arch: 'arm64'
+ runs-on: windows-latest
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ repository: ggerganov/llama.cpp
+ fetch-depth: 0
+ ref: '${{ github.event.inputs.llama_cpp_commit }}'
+
+ - name: Build
+ id: cmake_build
+ run: |
+ mkdir build
+ cd build
+ cmake .. ${{ env.COMMON_DEFINE }} ${{ matrix.defines }}
+ cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+ tree /f
+
+ - name: Upload artifacts (llama)
+ uses: actions/upload-artifact@v4
+ with:
+ path: .\build\bin\Release\llama.dll
+ name: llama-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
+ if-no-files-found: error
+ - name: Upload artifacts (ggml)
+ uses: actions/upload-artifact@v4
+ with:
+ path: .\build\bin\Release\ggml.dll
+ name: ggml-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
+ if-no-files-found: error
+ - name: Upload artifacts (ggml-base)
+ uses: actions/upload-artifact@v4
+ with:
+ path: .\build\bin\Release\ggml-base.dll
+ name: ggml-base-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
+ if-no-files-found: error
+ - name: Upload artifacts (ggml-cpu)
+ uses: actions/upload-artifact@v4
+ with:
+ path: .\build\bin\Release\ggml-cpu.dll
+ name: ggml-cpu-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
+ if-no-files-found: error
+
+ - name: Upload artifacts (llava)
+ uses: actions/upload-artifact@v4
+ with:
+ path: .\build\bin\Release\llava_shared.dll
+ name: llava-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
compile-vulkan:
@@ -603,6 +665,7 @@ jobs:
"compile-linux",
"compile-musl",
"compile-windows",
+ "compile-windows-arm64",
"compile-vulkan",
"compile-cublas",
"compile-macos",
@@ -617,7 +680,7 @@ jobs:
- name: Rearrange Files
run: |
# Make all directories at once
- mkdir --parents deps/{noavx,avx,avx2,avx512,linux-arm64,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu12.4.0,vulkan,android-arm64-v8a,android-x86,android-x86_64}
+ mkdir --parents deps/{noavx,avx,avx2,avx512,linux-arm64,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu11.7.1,cu12.2.0,vulkan,android-arm64-v8a,android-x86,android-x86_64,win-arm64}
# Linux
cp artifacts/ggml-bin-linux-noavx-x64.so/libggml.so deps/noavx/libggml.so
@@ -701,6 +764,13 @@ jobs:
cp artifacts/llama-bin-win-avx512-x64.dll/llama.dll deps/avx512/llama.dll
cp artifacts/mtmd-bin-win-avx512-x64.dll/mtmd.dll deps/avx512/mtmd.dll
+ # Windows ARM64
+ cp artifacts/ggml-bin-win-arm64-arm64.dll/ggml.dll deps/win-arm64/ggml.dll
+ cp artifacts/ggml-base-bin-win-arm64-arm64.dll/ggml-base.dll deps/win-arm64/ggml-base.dll
+ cp artifacts/ggml-cpu-bin-win-arm64-arm64.dll/ggml-cpu.dll deps/win-arm64/ggml-cpu.dll
+ cp artifacts/llama-bin-win-arm64-arm64.dll/llama.dll deps/win-arm64/llama.dll
+ cp artifacts/llava-bin-win-arm64-arm64.dll/llava_shared.dll deps/win-arm64/llava_shared.dll
+
# MacOS
cp artifacts/ggml-bin-osx-arm64.dylib/libggml.dylib deps/osx-arm64/libggml.dylib
cp artifacts/ggml-base-bin-osx-arm64.dylib/libggml-base.dylib deps/osx-arm64/libggml-base.dylib
diff --git a/LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs b/LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs
index 33b399ec9..084821f0b 100644
--- a/LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs
+++ b/LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs
@@ -119,7 +119,7 @@ public void GlobalCleanup()
{
if(ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property
{
- Executor.Context.NativeHandle.KvCacheClear();
+ Executor.Context.NativeHandle.MemoryClear();
}
}
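
The `KvCacheClear` → `MemoryClear` rename above applies wherever the old KV-cache helpers were used. A minimal sketch of the new call (illustrative only, not part of the patch; "model.gguf" is a hypothetical path):

    // Sketch: clearing a context's memory with the renamed helper.
    using LLama;
    using LLama.Common;

    var parameters = new ModelParams("model.gguf");
    using var weights = LLamaWeights.LoadFromFile(parameters);
    using var context = weights.CreateContext(parameters);

    // Previously: context.NativeHandle.KvCacheClear();
    context.NativeHandle.MemoryClear();
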
diff --git a/LLama.Examples/Examples/BatchedExecutorSimple.cs b/LLama.Examples/Examples/BatchedExecutorSimple.cs
index 5e532ff6a..9f8e6b6c7 100644
--- a/LLama.Examples/Examples/BatchedExecutorSimple.cs
+++ b/LLama.Examples/Examples/BatchedExecutorSimple.cs
@@ -97,8 +97,8 @@ await AnsiConsole.Live(table).StartAsync(async ctx =>
// A generic error, this is fatal and the batch can no longer be used. This should never occur and generally indicates
// a bug in LLamaSharp, llama.cpp or a hardware error.
- if (decodeResult == DecodeResult.Error)
- throw new Exception("Unknown error occurred while inferring.");
+ if (decodeResult != DecodeResult.Ok)
+ throw new Exception($"Error occurred while inferring: {decodeResult}");
// After inference all of the conversations must be sampled before running inference again.
foreach (var conversationData in conversations)
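
The check above now treats any non-Ok result as an error instead of only the old `DecodeResult.Error` value. The same pattern against the low-level handle, as a sketch (assumes an existing `context` and a populated `batch`):

    // Sketch only: `context` is a LLamaContext, `batch` a populated LLamaBatch.
    var result = context.NativeHandle.Decode(batch);
    if (result != DecodeResult.Ok)
        throw new Exception($"Error occurred while inferring: {result}");
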
diff --git a/LLama.Examples/Examples/LlavaInteractiveModeExecute.cs b/LLama.Examples/Examples/LlavaInteractiveModeExecute.cs
index dc2dee06e..8cbf58dcd 100644
--- a/LLama.Examples/Examples/LlavaInteractiveModeExecute.cs
+++ b/LLama.Examples/Examples/LlavaInteractiveModeExecute.cs
@@ -79,7 +79,7 @@ public static async Task Run()
// When the prompt contains images we clear KV_CACHE to restart conversation
// See:
// https://github.com/ggerganov/llama.cpp/discussions/3620
- ex.Context.NativeHandle.KvCacheRemove( LLamaSeqId.Zero, -1, -1 );
+ ex.Context.NativeHandle.MemorySequenceRemove( LLamaSeqId.Zero, -1, -1 );
int index = 0;
foreach (var path in imagePathsWithCurlyBraces)
diff --git a/LLama.Unittest/LLavaWeightsTests.cs b/LLama.Unittest/LLavaWeightsTests.cs
deleted file mode 100644
index 25a5f996a..000000000
--- a/LLama.Unittest/LLavaWeightsTests.cs
+++ /dev/null
@@ -1,54 +0,0 @@
-using LLama.Common;
-using LLama.Native;
-
-namespace LLama.Unittest
-{
- // Test the same things as llama model + image embedings
- //
- public sealed class LLavaWeightTests
- : IDisposable
- {
- private readonly LLamaWeights _llamaWeights;
- private readonly LLavaWeights _lLavaWeights;
- private readonly LLamaContext _context;
-
- public LLavaWeightTests()
- {
- var @params = new ModelParams(Constants.LLavaModelPath)
- {
- // Llava models requires big context
- ContextSize = 4096,
- GpuLayerCount = Constants.CIGpuLayerCount,
- };
- _llamaWeights = LLamaWeights.LoadFromFile(@params);
- _lLavaWeights = LLavaWeights.LoadFromFile(Constants.LLavaMmpPath);
-
- _context = _llamaWeights.CreateContext(@params);
-
- }
-
- public void Dispose()
- {
- _llamaWeights.Dispose();
- _lLavaWeights.Dispose();
- }
-
- [Fact,Trait("Category", "NoCI")]
- public void EmbedImageAsFileName()
- {
- int n_past = 0;
- SafeLlavaImageEmbedHandle emb = _lLavaWeights.CreateImageEmbeddings(_context, Constants.LLavaImage);
- Assert.True( _lLavaWeights.EvalImageEmbed( _context, emb, ref n_past ) );
- }
-
- [Fact,Trait("Category", "NoCI")]
- public void EmbedImageAsBinary()
- {
- int n_past = 0;
- byte[] image = System.IO.File.ReadAllBytes(Constants.LLavaImage);
- SafeLlavaImageEmbedHandle emb = _lLavaWeights.CreateImageEmbeddings(_context, image);
- Assert.True( _lLavaWeights.EvalImageEmbed( _context, emb, ref n_past ) );
- }
-
- }
-}
diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs
index 9824c0922..c453aeddf 100644
--- a/LLama.Web/Common/ModelOptions.cs
+++ b/LLama.Web/Common/ModelOptions.cs
@@ -110,6 +110,15 @@ public class ModelOptions
///
public bool VocabOnly { get; set; }
+ ///
+ public bool? OpOffload { get; set; }
+
+ ///
+ public bool? SwaFull { get; set; }
+
+ ///
+ public bool? KVUnified { get; set; }
+
///
public float? DefragThreshold { get; set; }
diff --git a/LLama/Abstractions/IContextParams.cs b/LLama/Abstractions/IContextParams.cs
index cd18d5dbf..f80759c8a 100644
--- a/LLama/Abstractions/IContextParams.cs
+++ b/LLama/Abstractions/IContextParams.cs
@@ -109,8 +109,7 @@ public interface IContextParams
bool FlashAttention { get; }
///
- /// defragment the KV cache if holes/size > defrag_threshold, Set to < 0 to disable (default)
- /// defragment the KV cache if holes/size > defrag_threshold, Set to or < 0 to disable (default)
+ /// defragment the KV cache if holes/size > defrag_threshold, Set to <= 0 to disable (default)
///
float? DefragThreshold { get; }
@@ -123,4 +122,25 @@ public interface IContextParams
/// Attention type to use for embeddings
///
LLamaAttentionType AttentionType { get; }
+
+ ///
+ /// Offload host tensor operations to device
+ ///
+ bool? OpOffload { get; }
+
+ ///
+ /// use a unified buffer across the input sequences when computing the attention.
+ /// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+ ///
+ /// ref: https://github.com/ggml-org/llama.cpp/pull/14363
+ ///
+ bool? KVUnified { get; }
+
+ ///
+ /// Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+ ///
+ /// Setting to false when n_seq_max > 1 can cause bad performance in some cases
+ /// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+ ///
+ bool? SwaFull { get; }
}
\ No newline at end of file
diff --git a/LLama/Batched/Conversation.cs b/LLama/Batched/Conversation.cs
index 7dbf1f8c3..fcc94ae8f 100644
--- a/LLama/Batched/Conversation.cs
+++ b/LLama/Batched/Conversation.cs
@@ -84,7 +84,7 @@ public void Dispose()
_disposed = true;
// Remove this conversation from the KV cache
- Executor.Context.NativeHandle.KvCacheRemove(ConversationId, -1, -1);
+ Executor.Context.NativeHandle.MemorySequenceRemove(ConversationId, -1, -1);
// Prevent finalizer from running
GC.SuppressFinalize(this);
@@ -129,7 +129,7 @@ public Conversation Fork()
_forked = true;
// Assign tokens to the new sequence
- Executor.Context.NativeHandle.KvCacheSequenceCopy(ConversationId, c.ConversationId, 0, _end);
+ Executor.Context.NativeHandle.MemorySequenceCopy(ConversationId, c.ConversationId, 0, _end);
return c;
}
@@ -406,7 +406,7 @@ internal KvAccessor(Conversation conversation)
/// End position (exclusive)
public void Remove(LLamaPos start, LLamaPos end)
{
- _conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end);
+ _conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end);
}
///
@@ -420,7 +420,7 @@ public void Remove(LLamaPos start, int count)
return;
var end = start.Value + count;
- _conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end);
+ _conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end);
}
#endregion
@@ -435,7 +435,7 @@ public void Remove(LLamaPos start, int count)
/// Amount to add on to each token position
public void Add(LLamaPos start, LLamaPos end, int delta)
{
- _conversation.Executor.Context.NativeHandle.KvCacheSequenceAdd(_conversation.ConversationId, start, end, delta);
+ _conversation.Executor.Context.NativeHandle.MemorySequenceAdd(_conversation.ConversationId, start, end, delta);
}
#endregion
@@ -452,7 +452,7 @@ public void Divide(LLamaPos start, LLamaPos end, int divisor)
if (divisor <= 0)
throw new ArgumentOutOfRangeException(nameof(divisor));
- _conversation.Executor.Context.NativeHandle.KvCacheSequenceDivide(_conversation.ConversationId, start, end, divisor);
+ _conversation.Executor.Context.NativeHandle.MemorySequenceDivide(_conversation.ConversationId, start, end, divisor);
}
#endregion
}
diff --git a/LLama/ChatSession.cs b/LLama/ChatSession.cs
index bb1f91437..90119d4fe 100644
--- a/LLama/ChatSession.cs
+++ b/LLama/ChatSession.cs
@@ -199,7 +199,7 @@ public void LoadSession(SessionState state, bool loadTransforms = true)
}
if (state.ContextState is null)
{
- Executor.Context.NativeHandle.KvCacheClear();
+ Executor.Context.NativeHandle.MemoryClear();
}
else
{
diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs
index 23f5681be..89737faa7 100644
--- a/LLama/Common/ModelParams.cs
+++ b/LLama/Common/ModelParams.cs
@@ -112,6 +112,15 @@ public record ModelParams
///
public bool VocabOnly { get; set; }
+ ///
+ public bool? OpOffload { get; set; }
+
+ ///
+ public bool? SwaFull { get; set; }
+
+ ///
+ public bool? KVUnified { get; set; }
+
///
/// `Encoding` cannot be directly JSON serialized, instead store the name as a string which can
///
diff --git a/LLama/Extensions/IContextParamsExtensions.cs b/LLama/Extensions/IContextParamsExtensions.cs
index 54dd9873b..85e40f7ad 100644
--- a/LLama/Extensions/IContextParamsExtensions.cs
+++ b/LLama/Extensions/IContextParamsExtensions.cs
@@ -55,6 +55,13 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
result.n_threads = Threads(@params.Threads);
result.n_threads_batch = Threads(@params.BatchThreads);
+
+ if (@params.SwaFull.HasValue)
+ result.swa_full = @params.SwaFull.Value;
+ if (@params.OpOffload.HasValue)
+ result.op_offload = @params.OpOffload.Value;
+ if (@params.KVUnified.HasValue)
+ result.kv_unified = @params.KVUnified.Value;
}
private static int Threads(int? value)
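
The three new flags are nullable on the managed side, so llama.cpp defaults apply unless they are explicitly set. A sketch of opting in through ModelParams (the model path and values are illustrative):

    var parameters = new ModelParams("model.gguf")
    {
        ContextSize = 4096,
        OpOffload = true,   // offload host tensor operations to the device
        SwaFull = true,     // full-size SWA cache
        KVUnified = false,  // separate per-sequence buffers; see the IContextParams notes above
    };
    using var weights = LLamaWeights.LoadFromFile(parameters);
    using var context = weights.CreateContext(parameters);
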
diff --git a/LLama/LLamaExecutorBase.cs b/LLama/LLamaExecutorBase.cs
index 995cb3e4e..36989006e 100644
--- a/LLama/LLamaExecutorBase.cs
+++ b/LLama/LLamaExecutorBase.cs
@@ -128,7 +128,8 @@ public StatefulExecutorBase WithSessionFile(string filename)
}
if (File.Exists(filename))
{
- _logger?.LogInformation($"[LLamaExecutor] Attempting to load saved session from {filename}");
+ _logger?.LogInformation("[LLamaExecutor] Attempting to load saved session from {0}", filename);
+
var session_tokens = new LLamaToken[Context.ContextSize];
if (!NativeApi.llama_state_load_file(Context.NativeHandle, _pathSession, session_tokens, (ulong)Context.ContextSize, out var n_token_count_out))
{
@@ -136,7 +137,7 @@ public StatefulExecutorBase WithSessionFile(string filename)
throw new RuntimeError($"Failed to load session file {_pathSession}");
}
_session_tokens = session_tokens.Take((int)n_token_count_out).ToList();
- _logger?.LogInformation($"[LLamaExecutor] Loaded a session with prompt size of {session_tokens.Length} tokens");
+ _logger?.LogInformation("[LLamaExecutor] Loaded a session with prompt size of {0} tokens", session_tokens.Length);
}
else
{
@@ -190,11 +191,11 @@ protected virtual void HandleRunOutOfContext(int tokensToKeep)
// if we run out of context:
// - take the tokensToKeep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - tokensToKeep) tokens and recompute the logits in batches
- int n_left = _pastTokensCount - tokensToKeep;
- int n_discard = n_left / 2;
+ var n_left = _pastTokensCount - tokensToKeep;
+ var n_discard = n_left / 2;
- NativeApi.llama_kv_self_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensToKeep, tokensToKeep + n_discard);
- NativeApi.llama_kv_self_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensToKeep + n_discard, _pastTokensCount, -n_discard);
+ Context.NativeHandle.MemorySequenceRemove(LLamaSeqId.Zero, tokensToKeep, tokensToKeep + n_discard);
+ Context.NativeHandle.MemorySequenceAdd(LLamaSeqId.Zero, tokensToKeep + n_discard, _pastTokensCount, -n_discard);
_pastTokensCount -= n_discard;
// stop saving session if we run out of context
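
The arithmetic keeps the first tokensToKeep tokens, discards half of the remainder, and shifts the surviving positions back so generation can continue. A small worked sketch with illustrative numbers:

    int pastTokensCount = 4096, tokensToKeep = 64;   // illustrative values
    var n_left = pastTokensCount - tokensToKeep;     // 4032 tokens eligible for eviction
    var n_discard = n_left / 2;                      // 2016 tokens actually discarded
    // Remove positions [64, 2080) from sequence 0, then shift [2080, 4096) back by 2016,
    // leaving 2080 tokens in the cache and room for 2016 new ones.
    Console.WriteLine($"discard {n_discard}, keep {pastTokensCount - n_discard}");
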
diff --git a/LLama/LLamaReranker.cs b/LLama/LLamaReranker.cs
index fa42d7f35..16a206c40 100644
--- a/LLama/LLamaReranker.cs
+++ b/LLama/LLamaReranker.cs
@@ -114,7 +114,7 @@ public async Task> GetRelevanceScores(string input, IReadOn
batch.Add(tokens[i], i, LLamaSeqId.Zero, true);
// clear previous kv_cache values
- Context.NativeHandle.KvCacheClear();
+ Context.NativeHandle.MemoryClear();
// Check if we should cancel the work, just before doing anything expensive (encode/decode)
cancellationToken.ThrowIfCancellationRequested();
@@ -144,7 +144,7 @@ public async Task> GetRelevanceScores(string input, IReadOn
var score = Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero)[0];
- Context.NativeHandle.KvCacheClear();
+ Context.NativeHandle.MemoryClear();
return (normalize ? Sigmoid(score) : score, tokens.Length);
}
@@ -155,7 +155,7 @@ private async Task> CalcRelevanceScores(LLamaBatch batch, b
var seqNum = logicCap.Value + 1;
List scores = new List(seqNum);
// clear previous kv_cache values
- Context.NativeHandle.KvCacheClear();
+ Context.NativeHandle.MemoryClear();
// Check if we should cancel the work, just before doing anything expensive (encode/decode)
cancellationToken.ThrowIfCancellationRequested();
@@ -189,7 +189,7 @@ private async Task> CalcRelevanceScores(LLamaBatch batch, b
scores.Add(normalize ? Sigmoid(score) : score);
}
- Context.NativeHandle.KvCacheClear();
+ Context.NativeHandle.MemoryClear();
return scores;
}
diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets
index 0f67303dc..e4fb7c89a 100644
--- a/LLama/LLamaSharp.Runtime.targets
+++ b/LLama/LLamaSharp.Runtime.targets
@@ -76,37 +76,19 @@
-
- PreserveNewest
- runtimes/win-x64/native/cuda11/llama.dll
-
-
- PreserveNewest
- runtimes/win-x64/native/cuda11/ggml-base.dll
-
-
- PreserveNewest
- runtimes/win-x64/native/cuda11/ggml.dll
-
-
- PreserveNewest
- runtimes/win-x64/native/cuda11/ggml-cuda.dll
-
-
-
-
+ PreserveNewest
runtimes/win-x64/native/cuda12/llama.dll
-
+ PreserveNewest
runtimes/win-x64/native/cuda12/ggml-base.dll
-
+ PreserveNewest
runtimes/win-x64/native/cuda12/ggml.dll
-
+ PreserveNewest
runtimes/win-x64/native/cuda12/ggml-cuda.dll
@@ -130,6 +112,29 @@
+
+
+ PreserveNewest
+ runtimes/win-arm64/native/llama.dll
+
+
+ PreserveNewest
+ runtimes/win-arm64/native/ggml.dll
+
+
+ PreserveNewest
+ runtimes/win-arm64/native/ggml-base.dll
+
+
+ PreserveNewest
+ runtimes/win-arm64/native/ggml-cpu.dll
+
+
+ PreserveNewest
+ runtimes/win-arm64/native/llava_shared.dll
+
+
+
PreserveNewest
runtimes/linux-x64/native/noavx/libllama.so
@@ -218,43 +223,25 @@
PreserveNewest
runtimes/linux-arm64/native/libggml-cpu.so
-
+ PreserveNewest
- runtimes/linux-arm64/native/libllava_shared.so
+ runtimes/linux-arm64/native/libmtmd.so
-
- PreserveNewest
- runtimes/linux-x64/native/cuda11/libllama.so
-
-
- PreserveNewest
- runtimes/linux-x64/native/cuda11/libggml.so
-
-
- PreserveNewest
- runtimes/linux-x64/native/cuda11/libggml-base.so
-
-
- PreserveNewest
- runtimes/linux-x64/native/cuda11/libggml-cuda.so
-
-
-
-
+ PreserveNewest
runtimes/linux-x64/native/cuda12/libllama.so
-
+ PreserveNewest
runtimes/linux-x64/native/cuda12/libggml.so
-
+ PreserveNewest
runtimes/linux-x64/native/cuda12/libggml-base.so
-
+ PreserveNewest
runtimes/linux-x64/native/cuda12/libggml-cuda.so
@@ -371,9 +358,9 @@
PreserveNewest
runtimes/osx-arm64/native/libllama.dylib
-
+ PreserveNewest
- runtimes/osx-arm64/native/libllava_shared.dylib
+ runtimes/osx-arm64/native/libmtmd.dylib
PreserveNewest
@@ -400,9 +387,9 @@
PreserveNewest
runtimes/osx-x64/native/libllama.dylib
-
+ PreserveNewest
- runtimes/osx-x64/native/libllava_shared.dylib
+ runtimes/osx-x64/native/libmtmd.dylib
@@ -425,67 +412,63 @@
PreserveNewest
runtimes/osx-x64/native/rosetta2/libllama.dylib
-
+ PreserveNewest
- runtimes/osx-x64/native/rosetta2/libllava_shared.dylib
+ runtimes/osx-x64/native/rosetta2/libmtmd.dylib
-
+ PreserveNewest
- runtimes/win-x64/native/noavx/llava_shared.dll
+ runtimes/win-x64/native/noavx/libmtmd.dll
-
+ PreserveNewest
- runtimes/win-x64/native/avx/llava_shared.dll
+ runtimes/win-x64/native/avx/libmtmd.dll
-
+ PreserveNewest
- runtimes/win-x64/native/avx2/llava_shared.dll
+ runtimes/win-x64/native/avx2/libmtmd.dll
-
+ PreserveNewest
- runtimes/win-x64/native/avx512/llava_shared.dll
+ runtimes/win-x64/native/avx512/libmtmd.dll
-
+ PreserveNewest
- runtimes/win-x64/native/cuda11/llava_shared.dll
+ runtimes/win-x64/native/cuda12/libmtmd.dll
-
+ PreserveNewest
- runtimes/win-x64/native/cuda12/llava_shared.dll
+ runtimes/win-x64/native/vulkan/libmtmd.dll
-
+ PreserveNewest
- runtimes/win-x64/native/vulkan/llava_shared.dll
+ runtimes/win-arm64/native/libmtmd.dll
-
- PreserveNewest
- runtimes/linux-x64/native/noavx/libllava_shared.so
-
-
+ PreserveNewest
- runtimes/linux-x64/native/avx/libllava_shared.so
+ runtimes/linux-x64/native/noavx/libmtmd.so
-
+ PreserveNewest
- runtimes/linux-x64/native/avx2/libllava_shared.so
+ runtimes/linux-x64/native/avx/libmtmd.so
-
+ PreserveNewest
- runtimes/linux-x64/native/avx512/libllava_shared.so
+ runtimes/linux-x64/native/avx2/libmtmd.so
-
+ PreserveNewest
- runtimes/linux-x64/native/cuda11/libllava_shared.so
+ runtimes/linux-x64/native/avx512/libmtmd.so
-
+ PreserveNewest
- runtimes/linux-x64/native/cuda12/libllava_shared.so
+ runtimes/linux-x64/native/cuda12/libmtmd.so
-
+ PreserveNewest
- runtimes/linux-x64/native/vulkan/libllava_shared.so
+ runtimes/linux-x64/native/vulkan/libmtmd.so
@@ -513,8 +496,8 @@
x86
- runtimes/android-x86/native/libllava_shared.so
+ Include="$(MSBuildThisFileDirectory)runtimes/deps/android-x86/libmtmd.so">
+ runtimes/android-x86/native/libmtmd.so
x86
@@ -542,8 +525,8 @@
x86_64
- lib/x86_64/libllava_shared.so
+ Include="$(MSBuildThisFileDirectory)runtimes/deps/android-x86_64/libmtmd.so">
+ lib/x86_64/libmtmd.so
x86_64
@@ -571,8 +554,8 @@
arm64-v8a
- lib/arm64-v8a/libllava_shared.so
+ Include="$(MSBuildThisFileDirectory)runtimes/deps/android-arm64-v8a/libmtmd.so">
+ lib/arm64-v8a/libmtmd.so
arm64-v8a
diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj
index 10476a121..15278427f 100644
--- a/LLama/LLamaSharp.csproj
+++ b/LLama/LLamaSharp.csproj
@@ -57,7 +57,7 @@
- ceda28ef8e310_v2
+ 11dd5a44eb180e
diff --git a/LLama/LLamaStatelessExecutor.cs b/LLama/LLamaStatelessExecutor.cs
index 817738895..8f9b40cc3 100644
--- a/LLama/LLamaStatelessExecutor.cs
+++ b/LLama/LLamaStatelessExecutor.cs
@@ -158,8 +158,8 @@ public async IAsyncEnumerable InferAsync(string prompt, IInferenceParams
var n_left = n_past - tokensKeep;
var n_discard = n_left / 2;
- NativeApi.llama_kv_self_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep , tokensKeep + n_discard);
- NativeApi.llama_kv_self_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard);
+ Context.NativeHandle.MemorySequenceRemove(LLamaSeqId.Zero, tokensKeep, tokensKeep + n_discard);
+ Context.NativeHandle.MemorySequenceAdd(LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard);
n_past -= n_discard;
}
diff --git a/LLama/Native/DecodeResult.cs b/LLama/Native/DecodeResult.cs
index 8bf72c046..b0548b43e 100644
--- a/LLama/Native/DecodeResult.cs
+++ b/LLama/Native/DecodeResult.cs
@@ -1,4 +1,4 @@
-namespace LLama.Native;
+namespace LLama.Native;
///
/// Return codes from llama_decode
@@ -6,9 +6,9 @@
public enum DecodeResult
{
///
- /// An unspecified error
+ /// Input batch was invalid
///
- Error = -1,
+ InvalidInputBatch = -1,
///
/// Ok.
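
Because -1 is renamed from Error to InvalidInputBatch, code that matched on the old name needs updating. A small sketch (the mapping text is illustrative):

    static string Describe(DecodeResult result) => result switch
    {
        DecodeResult.Ok => "success",
        DecodeResult.InvalidInputBatch => "the input batch was invalid",
        _ => $"other result: {result}",
    };
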
diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs
index 75b6be4bd..76f5d6c77 100644
--- a/LLama/Native/LLamaContextParams.cs
+++ b/LLama/Native/LLamaContextParams.cs
@@ -101,7 +101,7 @@ public struct LLamaContextParams
public uint yarn_orig_ctx;
///
- /// defragment the KV cache if holes/size > defrag_threshold, Set to < 0 to disable (default)
+ /// defragment the KV cache if holes/size > defrag_threshold, Set to <= 0 to disable (default)
///
public float defrag_threshold;
@@ -127,10 +127,17 @@ public struct LLamaContextParams
///
public GGMLType type_v;
+ //todo: implement abort callback support
+ ///
+ /// ggml_abort_callback
+ ///
+ public IntPtr abort_callback;
+
+ //todo: implement abort callback support
///
- /// Deprecated!
+ /// User data passed into abort_callback
///
- private sbyte _logits_all;
+ public IntPtr abort_callback_user_data;
///
/// if true, extract embeddings (together with logits)
@@ -172,17 +179,40 @@ public bool no_perf
}
private sbyte _no_perf;
- //todo: implement abort callback support
///
- /// ggml_abort_callback
+ /// offload host tensor operations to device
///
- public IntPtr abort_callback;
+ public bool op_offload
+ {
+ readonly get => Convert.ToBoolean(_op_offload);
+ set => _op_offload = Convert.ToSByte(value);
+ }
+ private sbyte _op_offload;
- //todo: implement abort callback support
///
- /// User data passed into abort_callback
+ /// use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+ /// NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
+ /// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
///
- public IntPtr abort_callback_user_data;
+ public bool swa_full
+ {
+ readonly get => Convert.ToBoolean(_swa_full);
+ set => _swa_full = Convert.ToSByte(value);
+ }
+ private sbyte _swa_full;
+
+ ///
+ /// use a unified buffer across the input sequences when computing the attention.
+ /// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+ ///
+ /// ref: https://github.com/ggml-org/llama.cpp/pull/14363
+ ///
+ public bool kv_unified
+ {
+ readonly get => Convert.ToBoolean(_kv_unified);
+ set => _kv_unified = Convert.ToSByte(value);
+ }
+ private sbyte _kv_unified;
///
/// Get the default LLamaContextParams
diff --git a/LLama/Native/LLamaKvCache.cs b/LLama/Native/LLamaKvCache.cs
deleted file mode 100644
index 4a402f9ed..000000000
--- a/LLama/Native/LLamaKvCache.cs
+++ /dev/null
@@ -1,10 +0,0 @@
-namespace LLama.Native;
-
-///
-/// C# representation of llama_kv_cache
-///
-/// llama_kv_cache
-internal struct LLamaKvCacheNative
-{
-
-}
\ No newline at end of file
diff --git a/LLama/Native/LLamaKvCacheView.cs b/LLama/Native/LLamaKvCacheView.cs
deleted file mode 100644
index 2fa513324..000000000
--- a/LLama/Native/LLamaKvCacheView.cs
+++ /dev/null
@@ -1,241 +0,0 @@
-using System;
-
-namespace LLama.Native;
-
-///
-/// A safe handle for a LLamaKvCacheView
-///
-public sealed class LLamaKvCacheViewSafeHandle
- : SafeLLamaHandleBase
-{
- private readonly SafeLLamaContextHandle _ctx;
- private NativeLLamaKvCacheView _view;
-
- ///
- /// Number of KV cache cells. This will be the same as the context size.
- ///
- public int CellCount => GetNativeView().n_cells;
-
- ///
- /// Get the total number of tokens in the KV cache.
- ///
- /// For example, if there are two populated
- /// cells, the first with 1 sequence id in it and the second with 2 sequence
- /// ids then you'll have 3 tokens.
- ///
- public int TokenCount => GetNativeView().token_count;
-
- ///
- /// Maximum number of sequences visible for a cell. There may be more sequences than this
- /// in reality, this is simply the maximum number this view can see.
- ///
- public int MaxSequenceCount => GetNativeView().n_seq_max;
-
- ///
- /// Number of populated cache cells
- ///
- public int UsedCellCount => GetNativeView().used_cells;
-
- ///
- /// Maximum contiguous empty slots in the cache.
- ///
- public int MaxContiguous => GetNativeView().max_contiguous;
-
- ///
- /// Index to the start of the MaxContiguous slot range. Can be negative when cache is full.
- ///
- public int MaxContiguousIdx => GetNativeView().max_contiguous;
-
- ///
- /// Initialize a LLamaKvCacheViewSafeHandle which will call `llama_kv_cache_view_free` when disposed
- ///
- ///
- ///
- private LLamaKvCacheViewSafeHandle(SafeLLamaContextHandle ctx, NativeLLamaKvCacheView view)
- : base((IntPtr)1, true)
- {
- _ctx = ctx;
- _view = view;
- }
-
- ///
- /// Allocate a new KV cache view which can be used to inspect the KV cache
- ///
- ///
- /// The maximum number of sequences visible in this view per cell
- ///
- public static LLamaKvCacheViewSafeHandle Allocate(SafeLLamaContextHandle ctx, int maxSequences)
- {
- // Allocate the view
- var view = llama_kv_cache_view_init(ctx, maxSequences);
- var handle = new LLamaKvCacheViewSafeHandle(ctx, view);
-
- // Update the view so it has valid data after allocation.
- handle.Update();
-
- return handle;
- }
-
- ///
- protected override bool ReleaseHandle()
- {
- llama_kv_cache_view_free(ref _view);
- SetHandle(IntPtr.Zero);
-
- return true;
- }
-
- ///
- /// Read the current KV cache state into this view.
- ///
- public void Update()
- {
- llama_kv_cache_view_update(_ctx, ref _view);
- }
-
- ///
- /// Get the raw KV cache view
- ///
- ///
- private ref NativeLLamaKvCacheView GetNativeView()
- {
- if (IsClosed)
- throw new ObjectDisposedException("Cannot access LLamaKvCacheViewSafeHandle after is has been disposed");
-
- return ref _view;
- }
-
- ///
- /// Get the cell at the given index
- ///
- /// The index of the cell [0, CellCount)
- /// Data about the cell at the given index
- /// Thrown if index is out of range (0 <= index < CellCount)
- public LLamaPos GetCell(int index)
- {
- var view = GetNativeView();
-
- if (index < 0)
- throw new ArgumentOutOfRangeException(nameof(index), "Cell index must be >= 0");
- if (index >= view.n_cells)
- throw new ArgumentOutOfRangeException(nameof(index), "Cell index must be < CellCount");
-
- unsafe
- {
- return view.cells[index].pos;
- }
- }
-
- ///
- /// Get all of the sequences assigned to the cell at the given index. This will contain entries
- /// sequences even if the cell actually has more than that many sequences, allocate a new view with a larger maxSequences parameter
- /// if necessary. Invalid sequences will be negative values.
- ///
- /// The index of the cell [0, CellCount)
- /// A span containing the sequences assigned to this cell
- /// Thrown if index is out of range (0 <= index < CellCount)
- public Span GetCellSequences(int index)
- {
- var view = GetNativeView();
-
- if (index < 0)
- throw new ArgumentOutOfRangeException(nameof(index), "Cell index must be >= 0");
- if (index >= view.n_cells)
- throw new ArgumentOutOfRangeException(nameof(index), "Cell index must be < CellCount");
-
- unsafe
- {
- return new Span(&view.cells_sequences[index * view.n_seq_max], view.n_seq_max);
- }
- }
-
- #region native API
- ///
- /// Create an empty KV cache view. (use only for debugging purposes)
- ///
- ///
- ///
- ///
- [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
- private static extern NativeLLamaKvCacheView llama_kv_cache_view_init(SafeLLamaContextHandle ctx, int n_seq_max);
-
- ///
- /// Free a KV cache view. (use only for debugging purposes)
- ///
- [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
- private static extern void llama_kv_cache_view_free(ref NativeLLamaKvCacheView view);
-
- ///
- /// Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
- ///
- ///
- ///
- [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
- private static extern void llama_kv_cache_view_update(SafeLLamaContextHandle ctx, ref NativeLLamaKvCacheView view);
-
- ///
- /// Information associated with an individual cell in the KV cache view (llama_kv_cache_view_cell)
- ///
- [StructLayout(LayoutKind.Sequential)]
- private struct NativeLLamaKvCacheViewCell
- {
- ///
- /// The position for this cell. Takes KV cache shifts into account.
- /// May be negative if the cell is not populated.
- ///
- public LLamaPos pos;
- }
-
- ///
- /// An updateable view of the KV cache (llama_kv_cache_view)
- ///
- [StructLayout(LayoutKind.Sequential)]
- private unsafe struct NativeLLamaKvCacheView
- {
- ///
- /// Number of KV cache cells. This will be the same as the context size.
- ///
- public int n_cells;
-
- ///
- /// Maximum number of sequences that can exist in a cell. It's not an error
- /// if there are more sequences in a cell than this value, however they will
- /// not be visible in the view cells_sequences.
- ///
- public int n_seq_max;
-
- ///
- /// Number of tokens in the cache. For example, if there are two populated
- /// cells, the first with 1 sequence id in it and the second with 2 sequence
- /// ids then you'll have 3 tokens.
- ///
- public int token_count;
-
- ///
- /// Number of populated cache cells.
- ///
- public int used_cells;
-
- ///
- /// Maximum contiguous empty slots in the cache.
- ///
- public int max_contiguous;
-
- ///
- /// Index to the start of the max_contiguous slot range. Can be negative
- /// when cache is full.
- ///
- public int max_contiguous_idx;
-
- ///
- /// Information for an individual cell.
- ///
- public NativeLLamaKvCacheViewCell* cells;
-
- ///
- /// The sequences for each cell. There will be n_seq_max items per cell.
- ///
- public LLamaSeqId* cells_sequences;
- }
- #endregion
-}
\ No newline at end of file
diff --git a/LLama/Native/LLamaModelQuantizeParams.cs b/LLama/Native/LLamaModelQuantizeParams.cs
index d31b1bbc8..857f0cfb9 100644
--- a/LLama/Native/LLamaModelQuantizeParams.cs
+++ b/LLama/Native/LLamaModelQuantizeParams.cs
@@ -94,6 +94,11 @@ public bool keep_split
///
public IntPtr tensor_types;
+ ///
+ /// Pointer to vector containing layer indices to prune
+ ///
+ public IntPtr prune_layers;
+
///
/// Create a LLamaModelQuantizeParams with default values
///
diff --git a/LLama/Native/LLamaNativeBatch.cs b/LLama/Native/LLamaNativeBatch.cs
index 41817604a..e65fb5000 100644
--- a/LLama/Native/LLamaNativeBatch.cs
+++ b/LLama/Native/LLamaNativeBatch.cs
@@ -1,7 +1,7 @@
namespace LLama.Native;
///
-/// Input data for llama_decode
+/// Input data for llama_encode/llama_decode
/// A llama_batch object can contain input about one or many sequences
/// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
///
@@ -25,7 +25,7 @@ public unsafe struct LLamaNativeBatch
///
/// the positions of the respective token in the sequence
- /// (if set to NULL, the token position will be tracked automatically by llama_decode)
+ /// (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
///
public LLamaPos* pos;
@@ -41,8 +41,12 @@ public unsafe struct LLamaNativeBatch
public LLamaSeqId** seq_id;
///
- /// if zero, the logits for the respective token will not be output
- /// (if set to NULL, only the logits for last token will be returned)
+ /// if zero, the logits for the respective token will not be output.
+ /// If set to NULL:
+ ///
+ /// If embeddings: all tokens are output
+ /// If not: only the last token is output
+ ///
///
public byte* logits;
}
\ No newline at end of file
diff --git a/LLama/Native/LLamaTimings.cs b/LLama/Native/LLamaTimings.cs
index 25384cca4..24ab925e7 100644
--- a/LLama/Native/LLamaTimings.cs
+++ b/LLama/Native/LLamaTimings.cs
@@ -38,6 +38,11 @@ public struct LLamaPerfContextTimings
/// number of eval calls
///
private int n_eval;
+
+ ///
+ /// number of times a ggml compute graph has been reused
+ ///
+ private int n_reused;
///
/// Timestamp when reset was last called
diff --git a/LLama/Native/LLamaVocabNative.cs b/LLama/Native/LLamaVocabNative.cs
index d4f990a81..05347aa4e 100644
--- a/LLama/Native/LLamaVocabNative.cs
+++ b/LLama/Native/LLamaVocabNative.cs
@@ -94,6 +94,14 @@ internal struct LLamaVocabNative
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern unsafe LLamaToken llama_vocab_pad(LLamaVocabNative* vocab);
+ ///
+ /// mask
+ ///
+ ///
+ ///
+ [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ public static extern unsafe LLamaToken llama_vocab_mask(LLamaVocabNative* vocab);
+
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern unsafe LLamaToken llama_vocab_fim_pre(LLamaVocabNative* vocab);
@@ -119,4 +127,8 @@ internal struct LLamaVocabNative
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.U1)]
public static extern unsafe bool llama_vocab_get_add_eos(LLamaVocabNative* vocab);
+
+ [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ [return: MarshalAs(UnmanagedType.U1)]
+ public static extern unsafe bool llama_vocab_get_add_sep(LLamaVocabNative* vocab);
}
\ No newline at end of file
diff --git a/LLama/Native/LLamaVocabPreType.cs b/LLama/Native/LLamaVocabPreType.cs
deleted file mode 100644
index 48ab5585b..000000000
--- a/LLama/Native/LLamaVocabPreType.cs
+++ /dev/null
@@ -1,47 +0,0 @@
-namespace LLama.Native;
-
-///
-///
-///
-/// llama_vocab_pre_type
-// ReSharper disable InconsistentNaming
-internal enum LLamaVocabPreType
-{
- Default = 0,
-
- LLAMA3 = 1,
- DEEPSEEK_LLM = 2,
- DEEPSEEK_CODER = 3,
- FALCON = 4,
- MPT = 5,
- STARCODER = 6,
- GPT2 = 7,
- REFACT = 8,
- COMMAND_R = 9,
- STABLELM2 = 10,
- QWEN2 = 11,
- OLMO = 12,
- DBRX = 13,
- SMAUG = 14,
- PORO = 15,
- CHATGLM3 = 16,
- CHATGLM4 = 17,
- VIKING = 18,
- JAIS = 19,
- TEKKEN = 20,
- SMOLLM = 21,
- CODESHELL = 22,
- BLOOM = 23,
- GPT3_FINNISH = 24,
- EXAONE = 25,
- CHAMELEON = 26,
- MINERVA = 27,
- DEEPSEEK3_LLM = 28,
- GPT4O = 29,
- SUPERBPE = 30,
- TRILLION = 31,
- BAILINGMOE = 32,
- LLAMA4 = 33,
- PIXTRAL = 34,
-}
-// ReSharper restore InconsistentNaming
\ No newline at end of file
diff --git a/LLama/Native/LLamaVocabType.cs b/LLama/Native/LLamaVocabType.cs
index bd7d704d9..1b5c6b970 100644
--- a/LLama/Native/LLamaVocabType.cs
+++ b/LLama/Native/LLamaVocabType.cs
@@ -35,4 +35,9 @@ public enum LLamaVocabType
/// RWKV tokenizer based on greedy tokenization
///
RWKV = 5,
+
+ ///
+ /// PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
+ ///
+ PLAMO2 = 6
}
\ No newline at end of file
diff --git a/LLama/Native/Load/NativeLibraryUtils.cs b/LLama/Native/Load/NativeLibraryUtils.cs
index 9f6457cd1..9ec996a20 100644
--- a/LLama/Native/Load/NativeLibraryUtils.cs
+++ b/LLama/Native/Load/NativeLibraryUtils.cs
@@ -219,7 +219,9 @@ public static void GetPlatformPathParts(OSPlatform platform, out string os, out
{
if (platform == OSPlatform.Windows)
{
- os = "win-x64";
+ os = System.Runtime.Intrinsics.Arm.ArmBase.Arm64.IsSupported
+ ? "win-arm64"
+ : "win-x64";
fileExtension = ".dll";
libPrefix = "";
return;
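
Library resolution on Windows now probes the CPU instead of assuming x64. The same probe can be run standalone to see which runtimes folder will be picked (folder layout as in LLamaSharp.Runtime.targets):

    using System;
    using System.Runtime.Intrinsics.Arm;

    // ArmBase.Arm64.IsSupported is the check used by GetPlatformPathParts above.
    var os = ArmBase.Arm64.IsSupported ? "win-arm64" : "win-x64";
    Console.WriteLine($"native libraries resolve under runtimes/{os}/native");
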
diff --git a/LLama/Native/Load/NativeLibraryWithAvx.cs b/LLama/Native/Load/NativeLibraryWithAvx.cs
index e6cbd86f3..3296fac0f 100644
--- a/LLama/Native/Load/NativeLibraryWithAvx.cs
+++ b/LLama/Native/Load/NativeLibraryWithAvx.cs
@@ -50,7 +50,7 @@ public IEnumerable Prepare(SystemInfo systemInfo, NativeLogConfig.LLamaL
private string? GetAvxPath(SystemInfo systemInfo, AvxLevel avxLevel, NativeLogConfig.LLamaLogCallback? logCallback)
{
NativeLibraryUtils.GetPlatformPathParts(systemInfo.OSPlatform, out var os, out var fileExtension, out var libPrefix);
- if (os != "linux-arm64"){
+ if (os != "linux-arm64" && os != "win-arm64"){
var avxStr = NativeLibraryConfig.AvxLevelToString(avxLevel);
if (!string.IsNullOrEmpty(avxStr))
avxStr += "/";
diff --git a/LLama/Native/NativeApi.Memory.cs b/LLama/Native/NativeApi.Memory.cs
new file mode 100644
index 000000000..24a406ab2
--- /dev/null
+++ b/LLama/Native/NativeApi.Memory.cs
@@ -0,0 +1,104 @@
+using System;
+
+namespace LLama.Native;
+
+public static partial class NativeApi
+{
+ ///
+ /// Clear the memory contents. If data == true, the data buffers will also be cleared together with the metadata
+ ///
+ ///
+ ///
+ [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+ public static extern void llama_memory_clear(IntPtr /* llama_memory_t */ mem, [MarshalAs(UnmanagedType.U1)] bool data);
+
+ ///
+ /// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+ ///
+ ///
+ ///
+ ///
+ ///
+ /// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
+ [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+ [return: MarshalAs(UnmanagedType.U1)]
+ public static extern bool llama_memory_seq_rm(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq, LLamaPos p0, LLamaPos p1);
+
+ ///
+ /// Copy all tokens that belong to the specified sequence to another sequence
+ /// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
+ ///
+ ///
+ ///
+ ///
+ /// p0 < 0 : [0, p1]
+ /// p1 < 0 : [p0, inf)
+ [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+ internal static extern void llama_memory_seq_cp(IntPtr /* llama_memory_t */ mem, LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1);
+
+ ///
+ /// Removes all tokens that do not belong to the specified sequence
+ ///
+ ///
+ ///
+ [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+ internal static extern void llama_memory_seq_keep(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq);
+
+ ///
+ /// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+ ///
+ ///
+ ///
+ /// p0 < 0 : [0, p1]
+ /// p1 < 0 : [p0, inf)
+ ///
+ [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+ internal static extern void llama_memory_seq_add(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta);
+
+ ///
+ /// Integer division of the positions by factor of `d > 1`
+ ///
+ /// p0 < 0 : [0, p1]
+ ///
+ /// p1 < 0 : [p0, inf)
+ ///
+ ///
+ ///
+ /// p0 < 0 : [0, p1]
+ /// p1 < 0 : [p0, inf)
+ ///
+ [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+ internal static extern void llama_memory_seq_div(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d);
+
+ ///
+ /// Returns the smallest position present in the memory for the specified sequence.
+ /// This is typically non-zero only for SWA caches.
+ /// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory.
+ /// Returns -1 if the sequence is empty.
+ ///
+ ///
+ ///
+ ///
+ [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+ internal static extern LLamaPos llama_memory_seq_pos_min(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq);
+
+ ///
+ /// Returns the largest position present in the memory for the specified sequence.
+ /// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory.
+ /// Returns -1 if the sequence is empty.
+ ///
+ ///
+ ///
+ ///
+ [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+ internal static extern LLamaPos llama_memory_seq_pos_max(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq);
+
+ ///
+ /// Check if the memory supports shifting
+ ///
+ ///
+ ///
+ [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+ [return: MarshalAs(UnmanagedType.U1)]
+ internal static extern bool llama_memory_can_shift(IntPtr /* llama_memory_t */ mem);
+}
\ No newline at end of file
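
These declarations back the MemoryClear/MemorySequence* wrappers used elsewhere in this patch. A hedged sketch of the managed surface (assumes an existing context; positions are illustrative):

    var handle = context.NativeHandle;

    // Remove positions [0, 32) of sequence 0, then shift the remainder back by 32
    // (p1 = -1 means "to the end of the sequence", per the docs above).
    handle.MemorySequenceRemove(LLamaSeqId.Zero, 0, 32);
    handle.MemorySequenceAdd(LLamaSeqId.Zero, 32, -1, -32);

    // Clear the memory entirely, as the executors do when resetting a session.
    handle.MemoryClear();
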
diff --git a/LLama/Native/NativeApi.Training.cs b/LLama/Native/NativeApi.Training.cs
new file mode 100644
index 000000000..ea1370b57
--- /dev/null
+++ b/LLama/Native/NativeApi.Training.cs
@@ -0,0 +1,35 @@
+namespace LLama.Native;
+
+public static partial class NativeApi
+{
+ /////
+ ///// function that returns whether or not a given tensor contains trainable parameters
+ /////
+ /////
+ /////
+ /////
+ //[return: MarshalAs(UnmanagedType.U1)]
+ //private unsafe delegate bool llama_opt_param_filter(void* ggml_tensor, void* userdata);
+
+ //private unsafe struct llama_opt_params
+ //{
+ // uint n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0
+
+ // llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
+ // void* param_filter_ud; // userdata for determining which tensors contain trainable parameters
+
+ // ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+
+ // void* get_opt_pars_ud; // userdata for calculating optimizer parameters
+ //};
+
+ //internal static extern void llama_opt_init(SafeLLamaContextHandle ctx, SafeLLamaContextHandle model, llama_opt_params @params);
+
+ //internal static extern void llama_opt_epoch(SafeLLamaContextHandle ct,
+ // ggml_opt_dataset_t dataset,
+ // ggml_opt_result_t result_train,
+ // ggml_opt_result_t result_eval,
+ // int64_t idata_split,
+ // ggml_opt_epoch_callback callback_train,
+ // ggml_opt_epoch_callback callback_eval);
+}
\ No newline at end of file
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index 87cf02c78..db9e928bd 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -32,6 +32,13 @@ public static void llama_empty_call()
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern long llama_max_devices();
+ ///
+ /// Maximum number of parallel sequences
+ ///
+ ///
+ [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+ public static extern long llama_max_parallel_sequences();
+
///
/// Check if memory mapping is supported
///
@@ -125,7 +132,7 @@ public static void llama_empty_call()
public static extern void llama_set_causal_attn(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool causalAttn);
///
- /// Set whether the model is in embeddings mode or not.
+ /// Set whether the context outputs embeddings or not
///
///
/// If true, embeddings will be returned but logits will not
@@ -237,7 +244,7 @@ public static int llama_token_to_piece(SafeLlamaModelHandle.Vocabulary vocab, LL
/// add_special Allow to add BOS and EOS tokens if model is configured to do so.
/// Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space.
/// Returns the number of tokens on success, no more than n_max_tokens.
- /// Returns a negative number on failure - the number of tokens that would have been returned
+ /// Returns a negative number on failure - the number of tokens that would have been returned. Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
///
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
internal static extern unsafe int llama_tokenize(LLamaVocabNative* model, byte* text, int text_len, LLamaToken* tokens, int n_max_tokens, [MarshalAs(UnmanagedType.U1)] bool add_special, [MarshalAs(UnmanagedType.U1)] bool parse_special);
@@ -266,111 +273,6 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
NativeLogConfig.llama_log_set(logCallback);
}
- ///
- /// Returns the number of tokens in the KV cache (slow, use only for debug)
- /// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
- ///
- ///
- ///
- [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- internal static extern int llama_kv_self_n_tokens(SafeLLamaContextHandle ctx);
-
- ///
- /// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
- ///
- ///
- ///
- [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- internal static extern int llama_kv_self_used_cells(SafeLLamaContextHandle ctx);
-
- ///
- /// Clear the KV cache. Both cell info is erased and KV data is zeroed
- ///
- ///
- [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- internal static extern void llama_kv_self_clear(SafeLLamaContextHandle ctx);
-
- [Obsolete("Use `llama_kv_self_clear` instead")]
- ///
- /// Clear the KV cache. Both cell info is erased and KV data is zeroed
- ///
- ///
- [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- internal static extern void llama_kv_cache_clear(SafeLLamaContextHandle ctx);
-
- ///
- /// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
- ///
- ///
- ///
- ///
- ///
- /// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
- [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- [return: MarshalAs(UnmanagedType.U1)]
- public static extern bool llama_kv_self_seq_rm(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1);
-
- ///
- /// Copy all tokens that belong to the specified sequence to another sequence
- /// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
- ///
- ///
- ///
- ///
- ///
- ///
- [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- internal static extern void llama_kv_self_seq_cp(SafeLLamaContextHandle ctx, LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1);
-
- ///
- /// Removes all tokens that do not belong to the specified sequence
- ///
- ///
- ///
- [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- internal static extern void llama_kv_self_seq_keep(SafeLLamaContextHandle ctx, LLamaSeqId seq);
-
- ///
- /// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
- /// If the KV cache is RoPEd, the KV data is updated accordingly:
- /// - lazily on next llama_decode()
- /// - explicitly with llama_kv_self_update()
- ///
- ///
- ///
- ///
- ///
- ///
- [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- internal static extern void llama_kv_self_seq_add(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta);
-
- ///
- /// Integer division of the positions by factor of `d > 1`
- /// If the KV cache is RoPEd, the KV data is updated accordingly:
- /// - lazily on next llama_decode()
- /// - explicitly with llama_kv_self_update()
- ///
- /// p0 < 0 : [0, p1]
- ///
- /// p1 < 0 : [p0, inf)
- ///
- ///
- ///
- ///
- ///
- ///
- [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- internal static extern void llama_kv_self_seq_div(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d);
-
- ///
- /// Returns the largest position present in the KV cache for the specified sequence
- ///
- ///
- ///
- ///
- [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- internal static extern LLamaPos llama_kv_self_seq_pos_max(SafeLLamaContextHandle ctx, LLamaSeqId seq);
-
///
/// Allocates a batch of tokens on the heap
/// Each token can be assigned up to n_seq_max sequence ids
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index 467dd98e7..e26619b26 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -147,7 +147,9 @@ static SafeLLamaContextHandle()
///
///
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ // ReSharper disable InconsistentNaming
private static extern unsafe void llama_set_abort_callback(SafeLLamaContextHandle ctx, GgmlAbortCallback abort_callback, void* abort_callback_data);
+ // ReSharper restore InconsistentNaming
///
/// If this returns true computation is cancelled
@@ -157,20 +159,27 @@ static SafeLLamaContextHandle()
private unsafe delegate bool GgmlAbortCallback(void* data);
///
+ /// Process a batch of tokens.
+ /// Requires the context to have a memory.
+ /// For encoder-decoder contexts, processes the batch using the decoder.
+ /// Positive return values do not mean a fatal error, but rather a warning.
+ /// Upon fatal error or abort, the ubatches that managed to be processed will remain in the memory state of the context
+ /// To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
+ /// Upon other return values, the memory state is restored to the state before this call
+ /// 0 - success
+ /// 1 - could not find a memory slot for the batch (try reducing the size of the batch or increasing the context)
+ /// 2 - aborted (processed ubatches will remain in the context's memory)
+ /// -1 - invalid input batch
+ /// < -1 - fatal error (processed ubatches will remain in the context's memory)
///
- ///
- ///
- /// Positive return values does not mean a fatal error, but rather a warning:
- /// - 0: success
- /// - 1: could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
- /// - < 0: error
- ///
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern int llama_decode(SafeLLamaContextHandle ctx, LLamaNativeBatch batch);
///
- /// Processes a batch of tokens with the encoder part of the encoder-decoder model. Stores the encoder output
- /// internally for later use by the decoder cross-attention layers.
+ /// Process a batch of tokens.
+ /// In contrast to llama_decode() - this call does not use KV cache.
+ /// For encoder-decoder contexts, processes the batch using the encoder.
+ /// Can store the encoder output internally for later use by the decoder's cross-attention layers.
///
///
///
@@ -186,7 +195,9 @@ static SafeLLamaContextHandle()
/// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
///
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ // ReSharper disable InconsistentNaming
private static extern void llama_set_n_threads(SafeLLamaContextHandle ctx, int n_threads, int n_threads_batch);
+ // ReSharper restore InconsistentNaming
///
/// Get the number of threads used for generation of a single token.
@@ -250,7 +261,7 @@ static SafeLLamaContextHandle()
private static extern uint llama_n_ubatch(SafeLLamaContextHandle ctx);
///
- /// Returns the **actual** size in bytes of the state (logits, embedding and kv_cache).
+ /// Returns the **actual** size in bytes of the state (logits, embedding and memory).
/// Only use when saving the state, not when restoring it, otherwise the size may be too small.
///
///
@@ -280,13 +291,13 @@ static SafeLLamaContextHandle()
private static extern unsafe nuint llama_state_set_data(SafeLLamaContextHandle ctx, byte* src, nuint size);
///
- /// Get the exact size needed to copy the KV cache of a single sequence
+ /// Get the exact size needed to copy the state of a single sequence
///
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern nuint llama_state_seq_get_size(SafeLLamaContextHandle ctx, LLamaSeqId seqId);
///
- /// Copy the KV cache of a single sequence into the specified buffer
+ /// Copy the state of a single sequence into the specified buffer
///
///
///
@@ -310,31 +321,6 @@ static SafeLLamaContextHandle()
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern unsafe nuint llama_state_seq_set_data(SafeLLamaContextHandle ctx, byte* src, nuint size, LLamaSeqId destSeqId);
- ///
- /// Defragment the KV cache. This will be applied:
- /// - lazily on next llama_decode()
- /// - explicitly with llama_kv_self_update()
- ///
- ///
- ///
- [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
- private static extern void llama_kv_self_defrag(SafeLLamaContextHandle ctx);
-
- ///
- /// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
- ///
- ///
- [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
- private static extern void llama_kv_self_update(SafeLLamaContextHandle ctx);
-
- ///
- /// Check if the context supports KV cache shifting
- ///
- ///
- ///
- [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
- private static extern bool llama_kv_self_can_shift(SafeLLamaContextHandle ctx);
-
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern LLamaPerfContextTimings llama_perf_context(SafeLLamaContextHandle ctx);
@@ -372,7 +358,7 @@ static SafeLLamaContextHandle()
///
/// Get the embeddings for a sequence id.
/// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
- /// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
+ /// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
/// otherwise: float[n_embd] (1-dimensional)
///
/// A pointer to the first float in an embedding, length = ctx.EmbeddingSize
@@ -388,7 +374,7 @@ static SafeLLamaContextHandle()
private static extern unsafe float* llama_get_embeddings_ith(SafeLLamaContextHandle ctx, int i);
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
- private static extern LLamaKvCacheNative llama_get_kv_self(SafeLLamaContextHandle ctx);
+ private static extern IntPtr llama_get_memory(SafeLLamaContextHandle ctx);
///
/// Set whether the model is in warmup mode or not
@@ -580,7 +566,7 @@ public void Synchronize()
/// internally for later use by the decoder cross-attention layers.
///
///
- /// 0 = success < 0 = error (the KV cache state is restored to the state before this call)
+ /// 0 = success < 0 = error (the memory state is restored to the state before this call)
public DecodeResult Encode(LLamaBatch batch)
{
if (batch.TokenCount == 0)
@@ -592,13 +578,19 @@ public DecodeResult Encode(LLamaBatch batch)
}
///
+ /// Process a batch of tokens.
+ /// Requires the context to have a memory.
+ /// For encoder-decoder contexts, processes the batch using the decoder.
+ /// Positive return values do not indicate a fatal error, but rather a warning.
+ /// Upon fatal error or abort, the ubatches that were already processed will remain in the memory state of the context.
+ /// To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max().
+ /// Upon other return values, the memory state is restored to the state before this call.
+ /// 0 - success
+ /// 1 - could not find a memory slot for the batch (try reducing the size of the batch or increasing the context size)
+ /// 2 - aborted (processed ubatches will remain in the context's memory)
+ /// -1 - invalid input batch
+ /// < -1 - fatal error (processed ubatches will remain in the context's memory)
///
- ///
- /// Positive return values does not mean a fatal error, but rather a warning:
- /// - 0: success
- /// - 1: could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
- /// - < 0: error (the KV cache state is restored to the state before this call)
- ///
public DecodeResult Decode(LLamaBatch batch)
{
if (batch.TokenCount == 0)
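As an illustrative, hedged sketch (not part of the patch): a caller could react to the new return-code semantics roughly as follows, using the Memory* wrappers introduced later in this diff. The helper method, its parameters, and the recovery strategy are assumptions for illustration; DecodeResult.Ok, MemorySequenceMinPosition, MemorySequenceMaxPosition and MemoryClear all appear in this patch.

using System;
using LLama.Native;

static class DecodeRecoveryExample
{
    // Hedged sketch: recovering from a failed Decode() under the new memory semantics.
    public static void DecodeWithRecovery(SafeLLamaContextHandle ctx, LLamaBatch batch, LLamaSeqId seq)
    {
        var result = ctx.Decode(batch);
        if (result == DecodeResult.Ok)
            return;

        // On abort or fatal error the ubatches that were already processed stay in the
        // context's memory, so inspect the surviving position range before recovering.
        var minPos = ctx.MemorySequenceMinPosition(seq);
        var maxPos = ctx.MemorySequenceMaxPosition(seq);
        Console.WriteLine($"Decode failed ({result}); memory still holds positions [{minPos}, {maxPos}]");

        // One blunt recovery option: wipe the memory entirely and re-submit the prompt.
        ctx.MemoryClear();
    }
}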
@@ -617,6 +609,7 @@ public DecodeResult Decode(LLamaBatch batch)
///
///
/// A tuple, containing the decode result and the number of tokens that have not been decoded yet.
+ // ReSharper disable once InconsistentNaming
internal (DecodeResult, int) Decode(List tokens, LLamaSeqId id, LLamaBatch batch, ref int n_past)
{
if (tokens.Count == 0)
@@ -645,15 +638,21 @@ public DecodeResult Decode(LLamaBatch batch)
return (DecodeResult.Ok, 0);
}
-
+
///
+ /// Process a batch of tokens.
+ /// Requires the context to have a memory.
+ /// For encoder-decoder contexts, processes the batch using the decoder.
+ /// Positive return values do not indicate a fatal error, but rather a warning.
+ /// Upon fatal error or abort, the ubatches that were already processed will remain in the memory state of the context.
+ /// To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max().
+ /// Upon other return values, the memory state is restored to the state before this call.
+ /// 0 - success
+ /// 1 - could not find a memory slot for the batch (try reducing the size of the batch or increasing the context size)
+ /// 2 - aborted (processed ubatches will remain in the context's memory)
+ /// -1 - invalid input batch
+ /// < -1 - fatal error (processed ubatches will remain in the context's memory)
///
- ///
- /// Positive return values does not mean a fatal error, but rather a warning:
- /// - 0: success
- /// - 1: could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
- /// - < 0: error
- ///
public DecodeResult Decode(LLamaBatchEmbeddings batch)
{
if (batch.EmbeddingsCount == 0)
@@ -675,7 +674,7 @@ public nuint GetStateSize()
}
///
- /// Get the size of the KV cache for a single sequence ID, when saved as bytes
+ /// Get the size of the memory state for a single sequence ID, when saved as bytes
///
///
///
@@ -759,66 +758,20 @@ public void ResetTimings()
}
#endregion
- #region KV Cache Management
- ///
- /// Check if the context supports KV cache shifting
- ///
- public bool KvCacheCanShift => llama_kv_self_can_shift(this);
+ #region Memory Management
///
- /// Apply KV cache updates (such as K-shifts, defragmentation, etc.)
+ /// Check if the context supports memory shifting
///
- public void KvCacheUpdate()
- {
- llama_kv_self_update(this);
- }
+ public bool MemoryCanShift => NativeApi.llama_memory_can_shift(llama_get_memory(this));
///
- /// Defragment the KV cache. This will be applied:
- /// - lazily on next llama_decode()
- /// - explicitly with llama_kv_self_update()
+ /// Clear the memory
///
- ///
- public void KvCacheDefrag()
- {
- llama_kv_self_defrag(this);
- }
-
- ///
- /// Get a new KV cache view that can be used to debug the KV cache
- ///
- ///
- ///
- public LLamaKvCacheViewSafeHandle KvCacheGetDebugView(int maxSequences = 4)
- {
- return LLamaKvCacheViewSafeHandle.Allocate(this, maxSequences);
- }
-
- ///
- /// Count the number of used cells in the KV cache (i.e. have at least one sequence assigned to them)
- ///
- ///
- public int KvCacheCountCells()
+ /// If true, the data buffers will also be cleared together with the metadata
+ public void MemoryClear(bool data = true)
{
- return NativeApi.llama_kv_self_used_cells(this);
- }
-
- ///
- /// Returns the number of tokens in the KV cache (slow, use only for debug)
- /// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
- ///
- ///
- public int KvCacheCountTokens()
- {
- return NativeApi.llama_kv_self_n_tokens(this);
- }
-
- ///
- /// Clear the KV cache - both cell info is erased and KV data is zeroed
- ///
- public void KvCacheClear()
- {
- NativeApi.llama_kv_self_clear(this);
+ NativeApi.llama_memory_clear(llama_get_memory(this), data);
}
///
@@ -827,54 +780,52 @@ public void KvCacheClear()
///
///
///
- public void KvCacheRemove(LLamaSeqId seq, LLamaPos p0, LLamaPos p1)
+ public void MemorySequenceRemove(LLamaSeqId seq, LLamaPos p0, LLamaPos p1)
{
- NativeApi.llama_kv_self_seq_rm(this, seq, p0, p1);
+ NativeApi.llama_memory_seq_rm(llama_get_memory(this), seq, p0, p1);
}
///
/// Copy all tokens that belong to the specified sequence to another sequence. Note that
- /// this does not allocate extra KV cache memory - it simply assigns the tokens to the
+ /// this does not allocate extra memory - it simply assigns the tokens to the
/// new sequence
///
///
///
///
///
- public void KvCacheSequenceCopy(LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1)
+ public void MemorySequenceCopy(LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1)
{
- NativeApi.llama_kv_self_seq_cp(this, src, dest, p0, p1);
+ NativeApi.llama_memory_seq_cp(llama_get_memory(this), src, dest, p0, p1);
}
///
/// Removes all tokens that do not belong to the specified sequence
///
///
- public void KvCacheSequenceKeep(LLamaSeqId seq)
+ public void MemorySequenceKeep(LLamaSeqId seq)
{
- NativeApi.llama_kv_self_seq_keep(this, seq);
+ NativeApi.llama_memory_seq_keep(llama_get_memory(this), seq);
}
///
/// Adds relative position "delta" to all tokens that belong to the specified sequence
- /// and have positions in [p0, p1. If the KV cache is RoPEd, the KV data is updated
- /// accordingly
+ /// and have positions in [p0, p1)
///
///
///
///
///
- public void KvCacheSequenceAdd(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta)
+ public void MemorySequenceAdd(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta)
{
- if (!KvCacheCanShift)
- throw new InvalidOperationException("Cannot shift KV cache (KvCacheCanShift=False)");
+ if (!MemoryCanShift)
+ throw new InvalidOperationException("Cannot shift memory (MemoryCanShift == false)");
- NativeApi.llama_kv_self_seq_add(this, seq, p0, p1, delta);
+ NativeApi.llama_memory_seq_add(llama_get_memory(this), seq, p0, p1, delta);
}
///
- /// Integer division of the positions by factor of `d > 1`.
- /// If the KV cache is RoPEd, the KV data is updated accordingly.
+ /// Integer division of the positions by factor of `d > 1`.
/// p0 < 0 : [0, p1]
/// p1 < 0 : [p0, inf)
///
@@ -882,22 +833,32 @@ public void KvCacheSequenceAdd(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int del
///
///
///
- public void KvCacheSequenceDivide(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int divisor)
+ public void MemorySequenceDivide(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int divisor)
{
- if (!KvCacheCanShift)
- throw new InvalidOperationException("Cannot shift KV cache (KvCacheCanShift=False)");
+ if (!MemoryCanShift)
+ throw new InvalidOperationException("Cannot shift memory (MemoryCanShift == false)");
- NativeApi.llama_kv_self_seq_div(this, seq, p0, p1, divisor);
+ NativeApi.llama_memory_seq_div(llama_get_memory(this), seq, p0, p1, divisor);
+ }
+
+ ///
+ /// Returns the smallest position present in memory for the specified sequence
+ ///
+ ///
+ ///
+ public LLamaPos MemorySequenceMinPosition(LLamaSeqId seq)
+ {
+ return NativeApi.llama_memory_seq_pos_min(llama_get_memory(this), seq);
}
///
- /// Returns the largest position present in the KV cache for the specified sequence
+ /// Returns the largest position present in memory for the specified sequence
///
///
///
- public LLamaPos KvCacheMaxPosition(LLamaSeqId seq)
+ public LLamaPos MemorySequenceMaxPosition(LLamaSeqId seq)
{
- return NativeApi.llama_kv_self_seq_pos_max(this, seq);
+ return NativeApi.llama_memory_seq_pos_max(llama_get_memory(this), seq);
}
#endregion
}
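A hedged usage sketch of the renamed Memory* surface (not part of the patch): the usual "drop the oldest tokens and shift the rest back" trick, expressed with the new method names from this diff. The numeric values and the int-to-LLamaPos/LLamaSeqId conversions are assumptions for illustration; p1 = -1 is assumed to mean "to the end of the sequence", as in llama.cpp.

using System;
using LLama.Native;

static class ContextShiftExample
{
    // Hedged sketch: shrink a full context by discarding old tokens, using the new Memory* API.
    public static void ShiftContext(SafeLLamaContextHandle ctx, LLamaSeqId seq, int keep, int discard)
    {
        if (!ctx.MemoryCanShift)
            throw new InvalidOperationException("This context's memory does not support shifting");

        // Remove `discard` tokens that follow the kept prefix...
        ctx.MemorySequenceRemove(seq, (LLamaPos)keep, (LLamaPos)(keep + discard));

        // ...then shift everything after the gap back so positions stay contiguous.
        ctx.MemorySequenceAdd(seq, (LLamaPos)(keep + discard), (LLamaPos)(-1), -discard);
    }
}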
diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
index 16336f706..d335a1209 100644
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -2,6 +2,7 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
+using System.Runtime.CompilerServices;
using System.Text;
using CommunityToolkit.HighPerformance.Buffers;
using LLama.Exceptions;
@@ -58,7 +59,12 @@ public sealed class SafeLlamaModelHandle
///
/// Get the number of KV heads in this model
///
- public int KVHeadCount => llama_model_n_head(this);
+ public int KVHeadCount => llama_model_n_head_kv(this);
+
+ ///
+ /// Get the sliding window attention (SWA) window size for this model
+ ///
+ public int SWACount => llama_model_n_swa(this);
///
/// Returns true if the model contains an encoder that requires llama_encode() call
@@ -140,6 +146,20 @@ public static SafeLlamaModelHandle LoadFromFile(string modelPath, LLamaModelPara
return handle;
}
+ ///
+ /// Save this model to a file
+ ///
+ ///
+ public void SaveToFile(string modelPath)
+ {
+ // If the file already exists, delete it. llama.cpp would overwrite it anyway, but deleting it here first
+ // gives clearer C# exceptions for inaccessible or read-only files.
+ if (File.Exists(modelPath))
+ File.Delete(modelPath);
+
+ llama_model_save_to_file(this, modelPath);
+ }
+
#region native API
static SafeLlamaModelHandle()
{
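A hedged sketch of the new SaveToFile helper in use (not part of the patch). The file paths are placeholders, and LLamaModelParams.Default() is assumed to be the usual way to obtain default native model parameters.

using LLama.Native;

static class SaveModelExample
{
    // Hedged sketch: load a GGUF model and write a copy back out via the new SaveToFile helper.
    public static void CopyModel(string sourcePath, string destinationPath)
    {
        var modelParams = LLamaModelParams.Default();
        using var model = SafeLlamaModelHandle.LoadFromFile(sourcePath, modelParams);
        model.SaveToFile(destinationPath);
    }
}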
@@ -324,6 +344,14 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern int llama_model_n_head_kv(SafeLlamaModelHandle model);
+ ///
+ ///
+ ///
+ ///
+ ///
+ [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ private static extern int llama_model_n_swa(SafeLlamaModelHandle model);
+
///
/// Get a string describing the model type
///
@@ -398,6 +426,25 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern unsafe LLamaVocabNative* llama_model_get_vocab(SafeLlamaModelHandle model);
+
+ [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ private static extern void llama_model_save_to_file(SafeLlamaModelHandle model, string path);
+
+ ///
+ /// Returns the number of classifier outputs (only valid for classifier models)
+ /// Undefined behavior for non-classifier models
+ ///
+ ///
+ ///
+ [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ private static extern uint llama_model_n_cls_out(SafeLlamaModelHandle model);
+
+ ///
+ /// Returns the label of the classifier output at the given index (i < n_cls_out). Returns null if no label is provided
+ ///
+ ///
+ ///
+ ///
+ [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ private static extern string? llama_model_cls_label(SafeLlamaModelHandle model, uint i);
#endregion
#region LoRA
@@ -771,6 +818,20 @@ public LLamaToken? Pad
}
}
+ ///
+ /// Get the masking token for this model
+ ///
+ public LLamaToken? Mask
+ {
+ get
+ {
+ unsafe
+ {
+ return Normalize(LLamaVocabNative.llama_vocab_mask(VocabNative));
+ }
+ }
+ }
+
///
/// Get the sentence separator token for this model
///
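A hedged sketch of consuming the new mask token (not part of the patch). The exact property path is an assumption: the new Mask property only needs to be reachable wherever the existing Pad property already lives, written here as model.Vocabulary.Mask for illustration.

using System;
using LLama.Native;

static class MaskTokenExample
{
    // Hedged sketch: check whether the loaded vocabulary defines a mask token before
    // building a fill-mask style prompt. The property path is assumed for illustration.
    public static bool HasMaskToken(SafeLlamaModelHandle model)
    {
        LLamaToken? mask = model.Vocabulary.Mask;
        if (mask is null)
        {
            Console.WriteLine("This model's vocabulary does not define a mask token.");
            return false;
        }
        return true;
    }
}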
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
index aeef403eb..db2693270 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
@@ -42,6 +42,13 @@
+
+
+
+
+
+
+
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec
deleted file mode 100644
index 6abd16ccc..000000000
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec
+++ /dev/null
@@ -1,34 +0,0 @@
-
-
-
- LLamaSharp.Backend.Cuda11.Linux
- $version$
- LLamaSharp.Backend.Cuda11.Linux
- llama.cpp Authors
- false
- MIT
- icon512.png
- https://github.com/SciSharp/LLamaSharp
- LLamaSharp.Backend.Cuda11.Linux contains the Linux binaries for LLamaSharp with Cuda11 support.
-
- Copyright 2023 The llama.cpp Authors. All rights reserved.
- LLamaSharp LLama LLM GPT AI ChatBot SciSharp
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec
deleted file mode 100644
index a412e2e6f..000000000
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec
+++ /dev/null
@@ -1,34 +0,0 @@
-
-
-
- LLamaSharp.Backend.Cuda11.Windows
- $version$
- LLamaSharp.Backend.Cuda11.Windows
- llama.cpp Authors
- false
- MIT
- icon512.png
- https://github.com/SciSharp/LLamaSharp
- LLamaSharp.Backend.Cuda11.Windows contains the Windows binaries for LLamaSharp with Cuda11 support.
-
- Copyright 2023 The llama.cpp Authors. All rights reserved.
- LLamaSharp LLama LLM GPT AI ChatBot SciSharp
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec
deleted file mode 100644
index 5ac473914..000000000
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec
+++ /dev/null
@@ -1,28 +0,0 @@
-
-
-
- LLamaSharp.Backend.Cuda11
- $version$
- LLamaSharp.Backend.Cuda11, the backend for LLamaSharp
- llama.cpp Authors
- false
- MIT
- icon512.png
- https://github.com/SciSharp/LLamaSharp
- LLamaSharp.Backend.Cuda11 is a backend for LLamaSharp to use with Cuda11.
-
- Copyright 2023 The llama.cpp Authors. All rights reserved.
- LLamaSharp LLama LLM GPT AI ChatBot SciSharp
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec
index 687283221..b372f1e1d 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec
@@ -22,12 +22,12 @@
-
-
-
+
+
+
-
-
+
+
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec
index 1fd01edb9..38c003236 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec
@@ -22,12 +22,12 @@
-
-
-
+
+
+
-
-
+
+
diff --git a/llama.cpp b/llama.cpp
index ceda28ef8..11dd5a44e 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit ceda28ef8e310a8dee60bf275077a3eedae8e36c
+Subproject commit 11dd5a44eb180e1d69fac24d3852b5222d66fb7f