diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index ccc013653..5dff9b7b0 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -160,12 +160,16 @@ jobs: include: - build: 'noavx' defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF' + arch: 'x64' - build: 'avx2' defines: '' + arch: 'x64' - build: 'avx' defines: '-DGGML_AVX2=OFF' + arch: 'x64' - build: 'avx512' defines: '-DGGML_AVX512=ON -DGGML_AVX512_VBMI=ON -DGGML_AVX512_VNNI=ON' + arch: 'x64' runs-on: windows-latest steps: - uses: actions/checkout@v4 @@ -187,31 +191,89 @@ jobs: uses: actions/upload-artifact@v4 with: path: .\build\bin\Release\llama.dll - name: llama-bin-win-${{ matrix.build }}-x64.dll + name: llama-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll if-no-files-found: error - name: Upload artifacts (ggml) uses: actions/upload-artifact@v4 with: path: .\build\bin\Release\ggml.dll - name: ggml-bin-win-${{ matrix.build }}-x64.dll + name: ggml-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll if-no-files-found: error - name: Upload artifacts (ggml-base) uses: actions/upload-artifact@v4 with: path: .\build\bin\Release\ggml-base.dll - name: ggml-base-bin-win-${{ matrix.build }}-x64.dll + name: ggml-base-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll if-no-files-found: error - name: Upload artifacts (ggml-cpu) uses: actions/upload-artifact@v4 with: path: .\build\bin\Release\ggml-cpu.dll - name: ggml-cpu-bin-win-${{ matrix.build }}-x64.dll + name: ggml-cpu-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll if-no-files-found: error - name: Upload artifacts (mtmd) uses: actions/upload-artifact@v4 with: - path: .\build\bin\Release\mtmd.dll - name: mtmd-bin-win-${{ matrix.build }}-x64.dll + path: .\build\bin\Release\llava_shared.dll + name: llava-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll + if-no-files-found: error + + compile-windows-arm64: + name: Compile (Windows ARM64) + strategy: + fail-fast: true + matrix: + include: + - build: 'arm64' + defines: '-DCMAKE_GENERATOR_PLATFORM=ARM64 -DGGML_NATIVE=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF' + arch: 'arm64' + runs-on: windows-latest + steps: + - uses: actions/checkout@v4 + with: + repository: ggerganov/llama.cpp + fetch-depth: 0 + ref: '${{ github.event.inputs.llama_cpp_commit }}' + + - name: Build + id: cmake_build + run: | + mkdir build + cd build + cmake .. ${{ env.COMMON_DEFINE }} ${{ matrix.defines }} + cmake --build . 
--config Release -j ${env:NUMBER_OF_PROCESSORS} + tree /f + + - name: Upload artifacts (llama) + uses: actions/upload-artifact@v4 + with: + path: .\build\bin\Release\llama.dll + name: llama-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll + if-no-files-found: error + - name: Upload artifacts (ggml) + uses: actions/upload-artifact@v4 + with: + path: .\build\bin\Release\ggml.dll + name: ggml-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll + if-no-files-found: error + - name: Upload artifacts (ggml-base) + uses: actions/upload-artifact@v4 + with: + path: .\build\bin\Release\ggml-base.dll + name: ggml-base-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll + if-no-files-found: error + - name: Upload artifacts (ggml-cpu) + uses: actions/upload-artifact@v4 + with: + path: .\build\bin\Release\ggml-cpu.dll + name: ggml-cpu-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll + if-no-files-found: error + + - name: Upload artifacts (llava) + uses: actions/upload-artifact@v4 + with: + path: .\build\bin\Release\llava_shared.dll + name: llava-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll if-no-files-found: error compile-vulkan: @@ -603,6 +665,7 @@ jobs: "compile-linux", "compile-musl", "compile-windows", + "compile-windows-arm64", "compile-vulkan", "compile-cublas", "compile-macos", @@ -617,7 +680,7 @@ jobs: - name: Rearrange Files run: | # Make all directories at once - mkdir --parents deps/{noavx,avx,avx2,avx512,linux-arm64,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu12.4.0,vulkan,android-arm64-v8a,android-x86,android-x86_64} + mkdir --parents deps/{noavx,avx,avx2,avx512,linux-arm64,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu11.7.1,cu12.2.0,vulkan,android-arm64-v8a,android-x86,android-x86_64,win-arm64} # Linux cp artifacts/ggml-bin-linux-noavx-x64.so/libggml.so deps/noavx/libggml.so @@ -701,6 +764,13 @@ jobs: cp artifacts/llama-bin-win-avx512-x64.dll/llama.dll deps/avx512/llama.dll cp artifacts/mtmd-bin-win-avx512-x64.dll/mtmd.dll deps/avx512/mtmd.dll + # Windows ARM64 + cp artifacts/ggml-bin-win-arm64-arm64.dll/ggml.dll deps/win-arm64/ggml.dll + cp artifacts/ggml-base-bin-win-arm64-arm64.dll/ggml-base.dll deps/win-arm64/ggml-base.dll + cp artifacts/ggml-cpu-bin-win-arm64-arm64.dll/ggml-cpu.dll deps/win-arm64/ggml-cpu.dll + cp artifacts/llama-bin-win-arm64-arm64.dll/llama.dll deps/win-arm64/llama.dll + cp artifacts/llava-bin-win-arm64-arm64.dll/llava_shared.dll deps/win-arm64/llava_shared.dll + # MacOS cp artifacts/ggml-bin-osx-arm64.dylib/libggml.dylib deps/osx-arm64/libggml.dylib cp artifacts/ggml-base-bin-osx-arm64.dylib/libggml-base.dylib deps/osx-arm64/libggml-base.dylib diff --git a/LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs b/LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs index 33b399ec9..084821f0b 100644 --- a/LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs +++ b/LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs @@ -119,7 +119,7 @@ public void GlobalCleanup() { if(ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property { - Executor.Context.NativeHandle.KvCacheClear(); + Executor.Context.NativeHandle.MemoryClear(); } } diff --git a/LLama.Examples/Examples/BatchedExecutorSimple.cs b/LLama.Examples/Examples/BatchedExecutorSimple.cs index 5e532ff6a..9f8e6b6c7 100644 --- a/LLama.Examples/Examples/BatchedExecutorSimple.cs +++ b/LLama.Examples/Examples/BatchedExecutorSimple.cs @@ -97,8 +97,8 @@ await AnsiConsole.Live(table).StartAsync(async ctx => // A generic error, 
this is fatal and the batch can no longer be used. This should never occur and generally indicates // a bug in LLamaSharp, llama.cpp or a hardware error. - if (decodeResult == DecodeResult.Error) - throw new Exception("Unknown error occurred while inferring."); + if (decodeResult != DecodeResult.Ok) + throw new Exception($"Error occurred while inferring: {decodeResult}"); // After inference all of the conversations must be sampled before running inference again. foreach (var conversationData in conversations) diff --git a/LLama.Examples/Examples/LlavaInteractiveModeExecute.cs b/LLama.Examples/Examples/LlavaInteractiveModeExecute.cs index dc2dee06e..8cbf58dcd 100644 --- a/LLama.Examples/Examples/LlavaInteractiveModeExecute.cs +++ b/LLama.Examples/Examples/LlavaInteractiveModeExecute.cs @@ -79,7 +79,7 @@ public static async Task Run() // When the prompt contains images we clear KV_CACHE to restart conversation // See: // https://github.com/ggerganov/llama.cpp/discussions/3620 - ex.Context.NativeHandle.KvCacheRemove( LLamaSeqId.Zero, -1, -1 ); + ex.Context.NativeHandle.MemorySequenceRemove( LLamaSeqId.Zero, -1, -1 ); int index = 0; foreach (var path in imagePathsWithCurlyBraces) diff --git a/LLama.Unittest/LLavaWeightsTests.cs b/LLama.Unittest/LLavaWeightsTests.cs deleted file mode 100644 index 25a5f996a..000000000 --- a/LLama.Unittest/LLavaWeightsTests.cs +++ /dev/null @@ -1,54 +0,0 @@ -using LLama.Common; -using LLama.Native; - -namespace LLama.Unittest -{ - // Test the same things as llama model + image embedings - // - public sealed class LLavaWeightTests - : IDisposable - { - private readonly LLamaWeights _llamaWeights; - private readonly LLavaWeights _lLavaWeights; - private readonly LLamaContext _context; - - public LLavaWeightTests() - { - var @params = new ModelParams(Constants.LLavaModelPath) - { - // Llava models requires big context - ContextSize = 4096, - GpuLayerCount = Constants.CIGpuLayerCount, - }; - _llamaWeights = LLamaWeights.LoadFromFile(@params); - _lLavaWeights = LLavaWeights.LoadFromFile(Constants.LLavaMmpPath); - - _context = _llamaWeights.CreateContext(@params); - - } - - public void Dispose() - { - _llamaWeights.Dispose(); - _lLavaWeights.Dispose(); - } - - [Fact,Trait("Category", "NoCI")] - public void EmbedImageAsFileName() - { - int n_past = 0; - SafeLlavaImageEmbedHandle emb = _lLavaWeights.CreateImageEmbeddings(_context, Constants.LLavaImage); - Assert.True( _lLavaWeights.EvalImageEmbed( _context, emb, ref n_past ) ); - } - - [Fact,Trait("Category", "NoCI")] - public void EmbedImageAsBinary() - { - int n_past = 0; - byte[] image = System.IO.File.ReadAllBytes(Constants.LLavaImage); - SafeLlavaImageEmbedHandle emb = _lLavaWeights.CreateImageEmbeddings(_context, image); - Assert.True( _lLavaWeights.EvalImageEmbed( _context, emb, ref n_past ) ); - } - - } -} diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs index 9824c0922..c453aeddf 100644 --- a/LLama.Web/Common/ModelOptions.cs +++ b/LLama.Web/Common/ModelOptions.cs @@ -110,6 +110,15 @@ public class ModelOptions /// public bool VocabOnly { get; set; } + /// + public bool? OpOffload { get; set; } + + /// + public bool? SwaFull { get; set; } + + /// + public bool? KVUnified { get; set; } + /// public float? 
DefragThreshold { get; set; } diff --git a/LLama/Abstractions/IContextParams.cs b/LLama/Abstractions/IContextParams.cs index cd18d5dbf..f80759c8a 100644 --- a/LLama/Abstractions/IContextParams.cs +++ b/LLama/Abstractions/IContextParams.cs @@ -109,8 +109,7 @@ public interface IContextParams bool FlashAttention { get; } /// - /// defragment the KV cache if holes/size > defrag_threshold, Set to < 0 to disable (default) - /// defragment the KV cache if holes/size > defrag_threshold, Set to or < 0 to disable (default) + /// defragment the KV cache if holes/size > defrag_threshold, Set to <= 0 to disable (default) /// float? DefragThreshold { get; } @@ -123,4 +122,25 @@ public interface IContextParams /// Attention type to use for embeddings /// LLamaAttentionType AttentionType { get; } + + /// + /// Offload host tensor operations to device + /// + bool? OpOffload { get; } + + /// + /// use a unified buffer across the input sequences when computing the attention. + /// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix + ///
+ /// ref: https://github.com/ggml-org/llama.cpp/pull/14363 + ///
+ bool? KVUnified { get; } + + /// + /// Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) + /// + /// Setting to false when n_seq_max > 1 can cause bad performance in some cases + /// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 + /// + bool? SwaFull { get; } } \ No newline at end of file diff --git a/LLama/Batched/Conversation.cs b/LLama/Batched/Conversation.cs index 7dbf1f8c3..fcc94ae8f 100644 --- a/LLama/Batched/Conversation.cs +++ b/LLama/Batched/Conversation.cs @@ -84,7 +84,7 @@ public void Dispose() _disposed = true; // Remove this conversation from the KV cache - Executor.Context.NativeHandle.KvCacheRemove(ConversationId, -1, -1); + Executor.Context.NativeHandle.MemorySequenceRemove(ConversationId, -1, -1); // Prevent finalizer from running GC.SuppressFinalize(this); @@ -129,7 +129,7 @@ public Conversation Fork() _forked = true; // Assign tokens to the new sequence - Executor.Context.NativeHandle.KvCacheSequenceCopy(ConversationId, c.ConversationId, 0, _end); + Executor.Context.NativeHandle.MemorySequenceCopy(ConversationId, c.ConversationId, 0, _end); return c; } @@ -406,7 +406,7 @@ internal KvAccessor(Conversation conversation) /// End position (exclusive) public void Remove(LLamaPos start, LLamaPos end) { - _conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end); + _conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end); } /// @@ -420,7 +420,7 @@ public void Remove(LLamaPos start, int count) return; var end = start.Value + count; - _conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end); + _conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end); } #endregion @@ -435,7 +435,7 @@ public void Remove(LLamaPos start, int count) /// Amount to add on to each token position public void Add(LLamaPos start, LLamaPos end, int delta) { - _conversation.Executor.Context.NativeHandle.KvCacheSequenceAdd(_conversation.ConversationId, start, end, delta); + _conversation.Executor.Context.NativeHandle.MemorySequenceAdd(_conversation.ConversationId, start, end, delta); } #endregion @@ -452,7 +452,7 @@ public void Divide(LLamaPos start, LLamaPos end, int divisor) if (divisor <= 0) throw new ArgumentOutOfRangeException(nameof(divisor)); - _conversation.Executor.Context.NativeHandle.KvCacheSequenceDivide(_conversation.ConversationId, start, end, divisor); + _conversation.Executor.Context.NativeHandle.MemorySequenceDivide(_conversation.ConversationId, start, end, divisor); } #endregion } diff --git a/LLama/ChatSession.cs b/LLama/ChatSession.cs index bb1f91437..90119d4fe 100644 --- a/LLama/ChatSession.cs +++ b/LLama/ChatSession.cs @@ -199,7 +199,7 @@ public void LoadSession(SessionState state, bool loadTransforms = true) } if (state.ContextState is null) { - Executor.Context.NativeHandle.KvCacheClear(); + Executor.Context.NativeHandle.MemoryClear(); } else { diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs index 23f5681be..89737faa7 100644 --- a/LLama/Common/ModelParams.cs +++ b/LLama/Common/ModelParams.cs @@ -112,6 +112,15 @@ public record ModelParams /// public bool VocabOnly { get; set; } + /// + public bool? OpOffload { get; set; } + + /// + public bool? SwaFull { get; set; } + + /// + public bool? 
KVUnified { get; set; } + /// /// `Encoding` cannot be directly JSON serialized, instead store the name as a string which can /// diff --git a/LLama/Extensions/IContextParamsExtensions.cs b/LLama/Extensions/IContextParamsExtensions.cs index 54dd9873b..85e40f7ad 100644 --- a/LLama/Extensions/IContextParamsExtensions.cs +++ b/LLama/Extensions/IContextParamsExtensions.cs @@ -55,6 +55,13 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo result.n_threads = Threads(@params.Threads); result.n_threads_batch = Threads(@params.BatchThreads); + + if (@params.SwaFull.HasValue) + result.swa_full = @params.SwaFull.Value; + if (@params.OpOffload.HasValue) + result.op_offload = @params.OpOffload.Value; + if (@params.KVUnified.HasValue) + result.kv_unified = @params.KVUnified.Value; } private static int Threads(int? value) diff --git a/LLama/LLamaExecutorBase.cs b/LLama/LLamaExecutorBase.cs index 995cb3e4e..36989006e 100644 --- a/LLama/LLamaExecutorBase.cs +++ b/LLama/LLamaExecutorBase.cs @@ -128,7 +128,8 @@ public StatefulExecutorBase WithSessionFile(string filename) } if (File.Exists(filename)) { - _logger?.LogInformation($"[LLamaExecutor] Attempting to load saved session from {filename}"); + _logger?.LogInformation("[LLamaExecutor] Attempting to load saved session from {0}", filename); + var session_tokens = new LLamaToken[Context.ContextSize]; if (!NativeApi.llama_state_load_file(Context.NativeHandle, _pathSession, session_tokens, (ulong)Context.ContextSize, out var n_token_count_out)) { @@ -136,7 +137,7 @@ public StatefulExecutorBase WithSessionFile(string filename) throw new RuntimeError($"Failed to load session file {_pathSession}"); } _session_tokens = session_tokens.Take((int)n_token_count_out).ToList(); - _logger?.LogInformation($"[LLamaExecutor] Loaded a session with prompt size of {session_tokens.Length} tokens"); + _logger?.LogInformation("[LLamaExecutor] Loaded a session with prompt size of {0} tokens", session_tokens.Length); } else { @@ -190,11 +191,11 @@ protected virtual void HandleRunOutOfContext(int tokensToKeep) // if we run out of context: // - take the tokensToKeep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - tokensToKeep) tokens and recompute the logits in batches - int n_left = _pastTokensCount - tokensToKeep; - int n_discard = n_left / 2; + var n_left = _pastTokensCount - tokensToKeep; + var n_discard = n_left / 2; - NativeApi.llama_kv_self_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensToKeep, tokensToKeep + n_discard); - NativeApi.llama_kv_self_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensToKeep + n_discard, _pastTokensCount, -n_discard); + Context.NativeHandle.MemorySequenceRemove(LLamaSeqId.Zero, tokensToKeep, tokensToKeep + n_discard); + Context.NativeHandle.MemorySequenceAdd(LLamaSeqId.Zero, tokensToKeep + n_discard, _pastTokensCount, -n_discard); _pastTokensCount -= n_discard; // stop saving session if we run out of context diff --git a/LLama/LLamaReranker.cs b/LLama/LLamaReranker.cs index fa42d7f35..16a206c40 100644 --- a/LLama/LLamaReranker.cs +++ b/LLama/LLamaReranker.cs @@ -114,7 +114,7 @@ public async Task> GetRelevanceScores(string input, IReadOn batch.Add(tokens[i], i, LLamaSeqId.Zero, true); // clear previous kv_cache values - Context.NativeHandle.KvCacheClear(); + Context.NativeHandle.MemoryClear(); // Check if we should cancel the work, just before doing anything expensive (encode/decode) cancellationToken.ThrowIfCancellationRequested(); @@ -144,7 +144,7 @@ public async Task> 
GetRelevanceScores(string input, IReadOn var score = Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero)[0]; - Context.NativeHandle.KvCacheClear(); + Context.NativeHandle.MemoryClear(); return (normalize ? Sigmoid(score) : score, tokens.Length); } @@ -155,7 +155,7 @@ private async Task> CalcRelevanceScores(LLamaBatch batch, b var seqNum = logicCap.Value + 1; List scores = new List(seqNum); // clear previous kv_cache values - Context.NativeHandle.KvCacheClear(); + Context.NativeHandle.MemoryClear(); // Check if we should cancel the work, just before doing anything expensive (encode/decode) cancellationToken.ThrowIfCancellationRequested(); @@ -189,7 +189,7 @@ private async Task> CalcRelevanceScores(LLamaBatch batch, b scores.Add(normalize ? Sigmoid(score) : score); } - Context.NativeHandle.KvCacheClear(); + Context.NativeHandle.MemoryClear(); return scores; } diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets index 0f67303dc..e4fb7c89a 100644 --- a/LLama/LLamaSharp.Runtime.targets +++ b/LLama/LLamaSharp.Runtime.targets @@ -76,37 +76,19 @@ - - PreserveNewest - runtimes/win-x64/native/cuda11/llama.dll - - - PreserveNewest - runtimes/win-x64/native/cuda11/ggml-base.dll - - - PreserveNewest - runtimes/win-x64/native/cuda11/ggml.dll - - - PreserveNewest - runtimes/win-x64/native/cuda11/ggml-cuda.dll - - - - + PreserveNewest runtimes/win-x64/native/cuda12/llama.dll - + PreserveNewest runtimes/win-x64/native/cuda12/ggml-base.dll - + PreserveNewest runtimes/win-x64/native/cuda12/ggml.dll - + PreserveNewest runtimes/win-x64/native/cuda12/ggml-cuda.dll @@ -130,6 +112,29 @@ + + + PreserveNewest + runtimes/win-arm64/native/llama.dll + + + PreserveNewest + runtimes/win-arm64/native/ggml.dll + + + PreserveNewest + runtimes/win-arm64/native/ggml-base.dll + + + PreserveNewest + runtimes/win-arm64/native/ggml-cpu.dll + + + PreserveNewest + runtimes/win-arm64/native/llava_shared.dll + + + PreserveNewest runtimes/linux-x64/native/noavx/libllama.so @@ -218,43 +223,25 @@ PreserveNewest runtimes/linux-arm64/native/libggml-cpu.so - + PreserveNewest - runtimes/linux-arm64/native/libllava_shared.so + runtimes/linux-arm64/native/libmtmd.so - - PreserveNewest - runtimes/linux-x64/native/cuda11/libllama.so - - - PreserveNewest - runtimes/linux-x64/native/cuda11/libggml.so - - - PreserveNewest - runtimes/linux-x64/native/cuda11/libggml-base.so - - - PreserveNewest - runtimes/linux-x64/native/cuda11/libggml-cuda.so - - - - + PreserveNewest runtimes/linux-x64/native/cuda12/libllama.so - + PreserveNewest runtimes/linux-x64/native/cuda12/libggml.so - + PreserveNewest runtimes/linux-x64/native/cuda12/libggml-base.so - + PreserveNewest runtimes/linux-x64/native/cuda12/libggml-cuda.so @@ -371,9 +358,9 @@ PreserveNewest runtimes/osx-arm64/native/libllama.dylib - + PreserveNewest - runtimes/osx-arm64/native/libllava_shared.dylib + runtimes/osx-arm64/native/libmtmd.dylib PreserveNewest @@ -400,9 +387,9 @@ PreserveNewest runtimes/osx-x64/native/libllama.dylib - + PreserveNewest - runtimes/osx-x64/native/libllava_shared.dylib + runtimes/osx-x64/native/libmtmd.dylib @@ -425,67 +412,63 @@ PreserveNewest runtimes/osx-x64/native/rosetta2/libllama.dylib - + PreserveNewest - runtimes/osx-x64/native/rosetta2/libllava_shared.dylib + runtimes/osx-x64/native/rosetta2/libmtmd.dylib - + PreserveNewest - runtimes/win-x64/native/noavx/llava_shared.dll + runtimes/win-x64/native/noavx/libmtmd.dll - + PreserveNewest - runtimes/win-x64/native/avx/llava_shared.dll + runtimes/win-x64/native/avx/libmtmd.dll - + 
PreserveNewest - runtimes/win-x64/native/avx2/llava_shared.dll + runtimes/win-x64/native/avx2/libmtmd.dll - + PreserveNewest - runtimes/win-x64/native/avx512/llava_shared.dll + runtimes/win-x64/native/avx512/libmtmd.dll - + PreserveNewest - runtimes/win-x64/native/cuda11/llava_shared.dll + runtimes/win-x64/native/cuda12/libmtmd.dll - + PreserveNewest - runtimes/win-x64/native/cuda12/llava_shared.dll + runtimes/win-x64/native/vulkan/libmtmd.dll - + PreserveNewest - runtimes/win-x64/native/vulkan/llava_shared.dll + runtimes/win-arm64/native/libmtmd.dll - - PreserveNewest - runtimes/linux-x64/native/noavx/libllava_shared.so - - + PreserveNewest - runtimes/linux-x64/native/avx/libllava_shared.so + runtimes/linux-x64/native/noavx/libmtmd.so - + PreserveNewest - runtimes/linux-x64/native/avx2/libllava_shared.so + runtimes/linux-x64/native/avx/libmtmd.so - + PreserveNewest - runtimes/linux-x64/native/avx512/libllava_shared.so + runtimes/linux-x64/native/avx2/libmtmd.so - + PreserveNewest - runtimes/linux-x64/native/cuda11/libllava_shared.so + runtimes/linux-x64/native/avx512/libmtmd.so - + PreserveNewest - runtimes/linux-x64/native/cuda12/libllava_shared.so + runtimes/linux-x64/native/cuda12/libmtmd.so - + PreserveNewest - runtimes/linux-x64/native/vulkan/libllava_shared.so + runtimes/linux-x64/native/vulkan/libmtmd.so @@ -513,8 +496,8 @@ x86 - runtimes/android-x86/native/libllava_shared.so + Include="$(MSBuildThisFileDirectory)runtimes/deps/android-x86/libmtmd.so"> + runtimes/android-x86/native/libmtmd.so x86 @@ -542,8 +525,8 @@ x86_64 - lib/x86_64/libllava_shared.so + Include="$(MSBuildThisFileDirectory)runtimes/deps/android-x86_64/libmtmd.so"> + lib/x86_64/libmtmd.so x86_64 @@ -571,8 +554,8 @@ arm64-v8a - lib/arm64-v8a/libllava_shared.so + Include="$(MSBuildThisFileDirectory)runtimes/deps/android-arm64-v8a/libmtmd.so"> + lib/arm64-v8a/libmtmd.so arm64-v8a diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index 10476a121..15278427f 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -57,7 +57,7 @@ - ceda28ef8e310_v2 + 11dd5a44eb180e diff --git a/LLama/LLamaStatelessExecutor.cs b/LLama/LLamaStatelessExecutor.cs index 817738895..8f9b40cc3 100644 --- a/LLama/LLamaStatelessExecutor.cs +++ b/LLama/LLamaStatelessExecutor.cs @@ -158,8 +158,8 @@ public async IAsyncEnumerable InferAsync(string prompt, IInferenceParams var n_left = n_past - tokensKeep; var n_discard = n_left / 2; - NativeApi.llama_kv_self_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep , tokensKeep + n_discard); - NativeApi.llama_kv_self_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard); + Context.NativeHandle.MemorySequenceRemove(LLamaSeqId.Zero, tokensKeep, tokensKeep + n_discard); + Context.NativeHandle.MemorySequenceAdd(LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard); n_past -= n_discard; } diff --git a/LLama/Native/DecodeResult.cs b/LLama/Native/DecodeResult.cs index 8bf72c046..b0548b43e 100644 --- a/LLama/Native/DecodeResult.cs +++ b/LLama/Native/DecodeResult.cs @@ -1,4 +1,4 @@ -namespace LLama.Native; +namespace LLama.Native; /// /// Return codes from llama_decode @@ -6,9 +6,9 @@ public enum DecodeResult { /// - /// An unspecified error + /// Input batch was invalid /// - Error = -1, + InvalidInputBatch = -1, /// /// Ok. 
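Not part of the patch: an illustrative C# sketch of how calling code can react to the renamed result value, mirroring the BatchedExecutorSimple change above; the `executor.Infer()` call and surrounding scope are assumed here, not defined by this diff.

    // Treat anything other than Ok as an error; the old catch-all Error (-1) is now InvalidInputBatch.
    var decodeResult = await executor.Infer();
    if (decodeResult == DecodeResult.InvalidInputBatch)
        throw new ArgumentException("The input batch passed to llama_decode was invalid.");
    if (decodeResult != DecodeResult.Ok)
        throw new Exception($"Error occurred while inferring: {decodeResult}");
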
diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs index 75b6be4bd..76f5d6c77 100644 --- a/LLama/Native/LLamaContextParams.cs +++ b/LLama/Native/LLamaContextParams.cs @@ -101,7 +101,7 @@ public struct LLamaContextParams public uint yarn_orig_ctx; /// - /// defragment the KV cache if holes/size > defrag_threshold, Set to < 0 to disable (default) + /// defragment the KV cache if holes/size > defrag_threshold, Set to <= 0 to disable (default) /// public float defrag_threshold; @@ -127,10 +127,17 @@ public struct LLamaContextParams /// public GGMLType type_v; + //todo: implement abort callback support + /// + /// ggml_abort_callback + /// + public IntPtr abort_callback; + + //todo: implement abort callback support /// - /// Deprecated! + /// User data passed into abort_callback /// - private sbyte _logits_all; + public IntPtr abort_callback_user_data; /// /// if true, extract embeddings (together with logits) @@ -172,17 +179,40 @@ public bool no_perf } private sbyte _no_perf; - //todo: implement abort callback support /// - /// ggml_abort_callback + /// offload host tensor operations to device /// - public IntPtr abort_callback; + public bool op_offload + { + readonly get => Convert.ToBoolean(_op_offload); + set => _op_offload = Convert.ToSByte(value); + } + private sbyte _op_offload; - //todo: implement abort callback support /// - /// User data passed into abort_callback + /// use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) + /// NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases + /// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 /// - public IntPtr abort_callback_user_data; + public bool swa_full + { + readonly get => Convert.ToBoolean(_swa_full); + set => _swa_full = Convert.ToSByte(value); + } + private sbyte _swa_full; + + /// + /// use a unified buffer across the input sequences when computing the attention. + /// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix + ///
+ /// ref: https://github.com/ggml-org/llama.cpp/pull/14363 + ///
+ public bool kv_unified + { + readonly get => Convert.ToBoolean(_kv_unified); + set => _kv_unified = Convert.ToSByte(value); + } + private sbyte _kv_unified; /// /// Get the default LLamaContextParams diff --git a/LLama/Native/LLamaKvCache.cs b/LLama/Native/LLamaKvCache.cs deleted file mode 100644 index 4a402f9ed..000000000 --- a/LLama/Native/LLamaKvCache.cs +++ /dev/null @@ -1,10 +0,0 @@ -namespace LLama.Native; - -/// -/// C# representation of llama_kv_cache -/// -/// llama_kv_cache -internal struct LLamaKvCacheNative -{ - -} \ No newline at end of file diff --git a/LLama/Native/LLamaKvCacheView.cs b/LLama/Native/LLamaKvCacheView.cs deleted file mode 100644 index 2fa513324..000000000 --- a/LLama/Native/LLamaKvCacheView.cs +++ /dev/null @@ -1,241 +0,0 @@ -using System; - -namespace LLama.Native; - -/// -/// A safe handle for a LLamaKvCacheView -/// -public sealed class LLamaKvCacheViewSafeHandle - : SafeLLamaHandleBase -{ - private readonly SafeLLamaContextHandle _ctx; - private NativeLLamaKvCacheView _view; - - /// - /// Number of KV cache cells. This will be the same as the context size. - /// - public int CellCount => GetNativeView().n_cells; - - /// - /// Get the total number of tokens in the KV cache. - /// - /// For example, if there are two populated - /// cells, the first with 1 sequence id in it and the second with 2 sequence - /// ids then you'll have 3 tokens. - /// - public int TokenCount => GetNativeView().token_count; - - /// - /// Maximum number of sequences visible for a cell. There may be more sequences than this - /// in reality, this is simply the maximum number this view can see. - /// - public int MaxSequenceCount => GetNativeView().n_seq_max; - - /// - /// Number of populated cache cells - /// - public int UsedCellCount => GetNativeView().used_cells; - - /// - /// Maximum contiguous empty slots in the cache. - /// - public int MaxContiguous => GetNativeView().max_contiguous; - - /// - /// Index to the start of the MaxContiguous slot range. Can be negative when cache is full. - /// - public int MaxContiguousIdx => GetNativeView().max_contiguous; - - /// - /// Initialize a LLamaKvCacheViewSafeHandle which will call `llama_kv_cache_view_free` when disposed - /// - /// - /// - private LLamaKvCacheViewSafeHandle(SafeLLamaContextHandle ctx, NativeLLamaKvCacheView view) - : base((IntPtr)1, true) - { - _ctx = ctx; - _view = view; - } - - /// - /// Allocate a new KV cache view which can be used to inspect the KV cache - /// - /// - /// The maximum number of sequences visible in this view per cell - /// - public static LLamaKvCacheViewSafeHandle Allocate(SafeLLamaContextHandle ctx, int maxSequences) - { - // Allocate the view - var view = llama_kv_cache_view_init(ctx, maxSequences); - var handle = new LLamaKvCacheViewSafeHandle(ctx, view); - - // Update the view so it has valid data after allocation. - handle.Update(); - - return handle; - } - - /// - protected override bool ReleaseHandle() - { - llama_kv_cache_view_free(ref _view); - SetHandle(IntPtr.Zero); - - return true; - } - - /// - /// Read the current KV cache state into this view. 
- /// - public void Update() - { - llama_kv_cache_view_update(_ctx, ref _view); - } - - /// - /// Get the raw KV cache view - /// - /// - private ref NativeLLamaKvCacheView GetNativeView() - { - if (IsClosed) - throw new ObjectDisposedException("Cannot access LLamaKvCacheViewSafeHandle after is has been disposed"); - - return ref _view; - } - - /// - /// Get the cell at the given index - /// - /// The index of the cell [0, CellCount) - /// Data about the cell at the given index - /// Thrown if index is out of range (0 <= index < CellCount) - public LLamaPos GetCell(int index) - { - var view = GetNativeView(); - - if (index < 0) - throw new ArgumentOutOfRangeException(nameof(index), "Cell index must be >= 0"); - if (index >= view.n_cells) - throw new ArgumentOutOfRangeException(nameof(index), "Cell index must be < CellCount"); - - unsafe - { - return view.cells[index].pos; - } - } - - /// - /// Get all of the sequences assigned to the cell at the given index. This will contain entries - /// sequences even if the cell actually has more than that many sequences, allocate a new view with a larger maxSequences parameter - /// if necessary. Invalid sequences will be negative values. - /// - /// The index of the cell [0, CellCount) - /// A span containing the sequences assigned to this cell - /// Thrown if index is out of range (0 <= index < CellCount) - public Span GetCellSequences(int index) - { - var view = GetNativeView(); - - if (index < 0) - throw new ArgumentOutOfRangeException(nameof(index), "Cell index must be >= 0"); - if (index >= view.n_cells) - throw new ArgumentOutOfRangeException(nameof(index), "Cell index must be < CellCount"); - - unsafe - { - return new Span(&view.cells_sequences[index * view.n_seq_max], view.n_seq_max); - } - } - - #region native API - /// - /// Create an empty KV cache view. (use only for debugging purposes) - /// - /// - /// - /// - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern NativeLLamaKvCacheView llama_kv_cache_view_init(SafeLLamaContextHandle ctx, int n_seq_max); - - /// - /// Free a KV cache view. (use only for debugging purposes) - /// - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern void llama_kv_cache_view_free(ref NativeLLamaKvCacheView view); - - /// - /// Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) - /// - /// - /// - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern void llama_kv_cache_view_update(SafeLLamaContextHandle ctx, ref NativeLLamaKvCacheView view); - - /// - /// Information associated with an individual cell in the KV cache view (llama_kv_cache_view_cell) - /// - [StructLayout(LayoutKind.Sequential)] - private struct NativeLLamaKvCacheViewCell - { - /// - /// The position for this cell. Takes KV cache shifts into account. - /// May be negative if the cell is not populated. - /// - public LLamaPos pos; - } - - /// - /// An updateable view of the KV cache (llama_kv_cache_view) - /// - [StructLayout(LayoutKind.Sequential)] - private unsafe struct NativeLLamaKvCacheView - { - /// - /// Number of KV cache cells. This will be the same as the context size. - /// - public int n_cells; - - /// - /// Maximum number of sequences that can exist in a cell. It's not an error - /// if there are more sequences in a cell than this value, however they will - /// not be visible in the view cells_sequences. 
- /// - public int n_seq_max; - - /// - /// Number of tokens in the cache. For example, if there are two populated - /// cells, the first with 1 sequence id in it and the second with 2 sequence - /// ids then you'll have 3 tokens. - /// - public int token_count; - - /// - /// Number of populated cache cells. - /// - public int used_cells; - - /// - /// Maximum contiguous empty slots in the cache. - /// - public int max_contiguous; - - /// - /// Index to the start of the max_contiguous slot range. Can be negative - /// when cache is full. - /// - public int max_contiguous_idx; - - /// - /// Information for an individual cell. - /// - public NativeLLamaKvCacheViewCell* cells; - - /// - /// The sequences for each cell. There will be n_seq_max items per cell. - /// - public LLamaSeqId* cells_sequences; - } - #endregion -} \ No newline at end of file diff --git a/LLama/Native/LLamaModelQuantizeParams.cs b/LLama/Native/LLamaModelQuantizeParams.cs index d31b1bbc8..857f0cfb9 100644 --- a/LLama/Native/LLamaModelQuantizeParams.cs +++ b/LLama/Native/LLamaModelQuantizeParams.cs @@ -94,6 +94,11 @@ public bool keep_split /// public IntPtr tensor_types; + /// + /// Pointer to vector containing layer indices to prune + /// + public IntPtr prune_layers; + /// /// Create a LLamaModelQuantizeParams with default values /// diff --git a/LLama/Native/LLamaNativeBatch.cs b/LLama/Native/LLamaNativeBatch.cs index 41817604a..e65fb5000 100644 --- a/LLama/Native/LLamaNativeBatch.cs +++ b/LLama/Native/LLamaNativeBatch.cs @@ -1,7 +1,7 @@ namespace LLama.Native; /// -/// Input data for llama_decode +/// Input data for llama_encode/llama_decode /// A llama_batch object can contain input about one or many sequences /// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens /// @@ -25,7 +25,7 @@ public unsafe struct LLamaNativeBatch /// /// the positions of the respective token in the sequence - /// (if set to NULL, the token position will be tracked automatically by llama_decode) + /// (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode) /// public LLamaPos* pos; @@ -41,8 +41,12 @@ public unsafe struct LLamaNativeBatch public LLamaSeqId** seq_id; /// - /// if zero, the logits for the respective token will not be output - /// (if set to NULL, only the logits for last token will be returned) + /// if zero, the logits for the respective token will not be output. + /// If set to NULL: + /// + /// If embeddings: all tokens are output + /// If not: only the last token is output + /// /// public byte* logits; } \ No newline at end of file diff --git a/LLama/Native/LLamaTimings.cs b/LLama/Native/LLamaTimings.cs index 25384cca4..24ab925e7 100644 --- a/LLama/Native/LLamaTimings.cs +++ b/LLama/Native/LLamaTimings.cs @@ -38,6 +38,11 @@ public struct LLamaPerfContextTimings /// number of eval calls ///
private int n_eval; + + /// + /// number of times a ggml compute graph had been reused + /// + private int n_reused; /// /// Timestamp when reset was last called diff --git a/LLama/Native/LLamaVocabNative.cs b/LLama/Native/LLamaVocabNative.cs index d4f990a81..05347aa4e 100644 --- a/LLama/Native/LLamaVocabNative.cs +++ b/LLama/Native/LLamaVocabNative.cs @@ -94,6 +94,14 @@ internal struct LLamaVocabNative [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern unsafe LLamaToken llama_vocab_pad(LLamaVocabNative* vocab); + /// + /// mask + /// + /// + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern unsafe LLamaToken llama_vocab_mask(LLamaVocabNative* vocab); + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern unsafe LLamaToken llama_vocab_fim_pre(LLamaVocabNative* vocab); @@ -119,4 +127,8 @@ internal struct LLamaVocabNative [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] [return: MarshalAs(UnmanagedType.U1)] public static extern unsafe bool llama_vocab_get_add_eos(LLamaVocabNative* vocab); + + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + [return: MarshalAs(UnmanagedType.U1)] + public static extern unsafe bool llama_vocab_get_add_sep(LLamaVocabNative* vocab); } \ No newline at end of file diff --git a/LLama/Native/LLamaVocabPreType.cs b/LLama/Native/LLamaVocabPreType.cs deleted file mode 100644 index 48ab5585b..000000000 --- a/LLama/Native/LLamaVocabPreType.cs +++ /dev/null @@ -1,47 +0,0 @@ -namespace LLama.Native; - -/// -/// -/// -/// llama_vocab_pre_type -// ReSharper disable InconsistentNaming -internal enum LLamaVocabPreType -{ - Default = 0, - - LLAMA3 = 1, - DEEPSEEK_LLM = 2, - DEEPSEEK_CODER = 3, - FALCON = 4, - MPT = 5, - STARCODER = 6, - GPT2 = 7, - REFACT = 8, - COMMAND_R = 9, - STABLELM2 = 10, - QWEN2 = 11, - OLMO = 12, - DBRX = 13, - SMAUG = 14, - PORO = 15, - CHATGLM3 = 16, - CHATGLM4 = 17, - VIKING = 18, - JAIS = 19, - TEKKEN = 20, - SMOLLM = 21, - CODESHELL = 22, - BLOOM = 23, - GPT3_FINNISH = 24, - EXAONE = 25, - CHAMELEON = 26, - MINERVA = 27, - DEEPSEEK3_LLM = 28, - GPT4O = 29, - SUPERBPE = 30, - TRILLION = 31, - BAILINGMOE = 32, - LLAMA4 = 33, - PIXTRAL = 34, -} -// ReSharper restore InconsistentNaming \ No newline at end of file diff --git a/LLama/Native/LLamaVocabType.cs b/LLama/Native/LLamaVocabType.cs index bd7d704d9..1b5c6b970 100644 --- a/LLama/Native/LLamaVocabType.cs +++ b/LLama/Native/LLamaVocabType.cs @@ -35,4 +35,9 @@ public enum LLamaVocabType /// RWKV tokenizer based on greedy tokenization /// RWKV = 5, + + /// + /// PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming + /// + PLAMO2 = 6 } \ No newline at end of file diff --git a/LLama/Native/Load/NativeLibraryUtils.cs b/LLama/Native/Load/NativeLibraryUtils.cs index 9f6457cd1..9ec996a20 100644 --- a/LLama/Native/Load/NativeLibraryUtils.cs +++ b/LLama/Native/Load/NativeLibraryUtils.cs @@ -219,7 +219,9 @@ public static void GetPlatformPathParts(OSPlatform platform, out string os, out { if (platform == OSPlatform.Windows) { - os = "win-x64"; + os = System.Runtime.Intrinsics.Arm.ArmBase.Arm64.IsSupported + ? 
"win-arm64" + : "win-x64"; fileExtension = ".dll"; libPrefix = ""; return; diff --git a/LLama/Native/Load/NativeLibraryWithAvx.cs b/LLama/Native/Load/NativeLibraryWithAvx.cs index e6cbd86f3..3296fac0f 100644 --- a/LLama/Native/Load/NativeLibraryWithAvx.cs +++ b/LLama/Native/Load/NativeLibraryWithAvx.cs @@ -50,7 +50,7 @@ public IEnumerable Prepare(SystemInfo systemInfo, NativeLogConfig.LLamaL private string? GetAvxPath(SystemInfo systemInfo, AvxLevel avxLevel, NativeLogConfig.LLamaLogCallback? logCallback) { NativeLibraryUtils.GetPlatformPathParts(systemInfo.OSPlatform, out var os, out var fileExtension, out var libPrefix); - if (os != "linux-arm64"){ + if (os != "linux-arm64" && os != "win-arm64"){ var avxStr = NativeLibraryConfig.AvxLevelToString(avxLevel); if (!string.IsNullOrEmpty(avxStr)) avxStr += "/"; diff --git a/LLama/Native/NativeApi.Memory.cs b/LLama/Native/NativeApi.Memory.cs new file mode 100644 index 000000000..24a406ab2 --- /dev/null +++ b/LLama/Native/NativeApi.Memory.cs @@ -0,0 +1,104 @@ +using System; + +namespace LLama.Native; + +public static partial class NativeApi +{ + /// + /// Clear the memory contents. If data == true, the data buffers will also be cleared together with the metadata + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern void llama_memory_clear(IntPtr /* llama_memory_t */ mem, [MarshalAs(UnmanagedType.U1)] bool data); + + /// + /// Removes all tokens that belong to the specified sequence and have positions in [p0, p1) + /// + /// + /// + /// + /// + /// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + [return: MarshalAs(UnmanagedType.U1)] + public static extern bool llama_memory_seq_rm(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq, LLamaPos p0, LLamaPos p1); + + /// + /// Copy all tokens that belong to the specified sequence to another sequence + /// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence + /// + /// + /// + /// + /// p0 < 0 : [0, p1] + /// p1 < 0 : [p0, inf) + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + internal static extern void llama_memory_seq_cp(IntPtr /* llama_memory_t */ mem, LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1); + + /// + /// Removes all tokens that do not belong to the specified sequence + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + internal static extern void llama_memory_seq_keep(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq); + + /// + /// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) + /// + /// + /// + /// p0 < 0 : [0, p1] + /// p1 < 0 : [p0, inf) + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + internal static extern void llama_memory_seq_add(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta); + + /// + /// Integer division of the positions by factor of `d > 1` + ///
+ /// p0 < 0 : [0, p1] + ///
+ /// p1 < 0 : [p0, inf) + ///
+ /// + /// + /// p0 < 0 : [0, p1] + /// p1 < 0 : [p0, inf) + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + internal static extern void llama_memory_seq_div(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d); + + /// + /// Returns the smallest position present in the memory for the specified sequence. + /// This is typically non-zero only for SWA caches. + /// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory. + /// Return -1 if the sequence is empty. + /// + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + internal static extern LLamaPos llama_memory_seq_pos_min(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq); + + /// + /// Returns the largest position present in the memory for the specified sequence. + /// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory. + /// Return -1 if the sequence is empty. + /// + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + internal static extern LLamaPos llama_memory_seq_pos_max(IntPtr /* llama_memory_t */ mem, LLamaSeqId seq); + + /// + /// Check if the memory supports shifting + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + [return: MarshalAs(UnmanagedType.U1)] + internal static extern bool llama_memory_can_shift(IntPtr /* llama_memory_t */ mem); +} \ No newline at end of file diff --git a/LLama/Native/NativeApi.Training.cs b/LLama/Native/NativeApi.Training.cs new file mode 100644 index 000000000..ea1370b57 --- /dev/null +++ b/LLama/Native/NativeApi.Training.cs @@ -0,0 +1,35 @@ +namespace LLama.Native; + +public static partial class NativeApi +{ + ///// + ///// function that returns whether or not a given tensor contains trainable parameters + ///// + ///// + ///// + ///// + //[return: MarshalAs(UnmanagedType.U1)] + //private unsafe delegate bool llama_opt_param_filter(void* ggml_tensor, void* userdata); + + //private unsafe struct llama_opt_params + //{ + // uint n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0 + + // llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters + // void* param_filter_ud; // userdata for determining which tensors contain trainable parameters + + // ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters + + // void* get_opt_pars_ud; // userdata for calculating optimizer parameters + //}; + + //internal static extern void llama_opt_init(SafeLLamaContextHandle ctx, SafeLLamaContextHandle model, llama_opt_params @params); + + //internal static extern void llama_opt_epoch(SafeLLamaContextHandle ct, + // ggml_opt_dataset_t dataset, + // ggml_opt_result_t result_train, + // ggml_opt_result_t result_eval, + // int64_t idata_split, + // ggml_opt_epoch_callback callback_train, + // ggml_opt_epoch_callback callback_eval); +} \ No newline at end of file diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index 87cf02c78..db9e928bd 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -32,6 +32,13 @@ public static void llama_empty_call() [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern long llama_max_devices(); + /// + /// Maximum number of parallel sequences + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] 
+ public static extern long llama_max_parallel_sequences(); + /// /// Check if memory mapping is supported /// @@ -125,7 +132,7 @@ public static void llama_empty_call() public static extern void llama_set_causal_attn(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool causalAttn); /// - /// Set whether the model is in embeddings mode or not. + /// Set whether the context outputs embeddings or not /// /// /// If true, embeddings will be returned but logits will not @@ -237,7 +244,7 @@ public static int llama_token_to_piece(SafeLlamaModelHandle.Vocabulary vocab, LL /// add_special Allow to add BOS and EOS tokens if model is configured to do so. /// Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space. /// Returns the number of tokens on success, no more than n_max_tokens. - /// Returns a negative number on failure - the number of tokens that would have been returned + /// Returns a negative number on failure - the number of tokens that would have been returned. Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit) /// [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] internal static extern unsafe int llama_tokenize(LLamaVocabNative* model, byte* text, int text_len, LLamaToken* tokens, int n_max_tokens, [MarshalAs(UnmanagedType.U1)] bool add_special, [MarshalAs(UnmanagedType.U1)] bool parse_special); @@ -266,111 +273,6 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback) NativeLogConfig.llama_log_set(logCallback); } - /// - /// Returns the number of tokens in the KV cache (slow, use only for debug) - /// If a KV cell has multiple sequences assigned to it, it will be counted multiple times - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern int llama_kv_self_n_tokens(SafeLLamaContextHandle ctx); - - /// - /// Returns the number of used KV cells (i.e. have at least one sequence assigned to them) - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern int llama_kv_self_used_cells(SafeLLamaContextHandle ctx); - - /// - /// Clear the KV cache. Both cell info is erased and KV data is zeroed - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern void llama_kv_self_clear(SafeLLamaContextHandle ctx); - - [Obsolete("Use `llama_kv_self_clear` instead")] - /// - /// Clear the KV cache. Both cell info is erased and KV data is zeroed - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern void llama_kv_cache_clear(SafeLLamaContextHandle ctx); - - /// - /// Removes all tokens that belong to the specified sequence and have positions in [p0, p1) - /// - /// - /// - /// - /// - /// Returns false if a partial sequence cannot be removed. 
Removing a whole sequence never fails - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - [return: MarshalAs(UnmanagedType.U1)] - public static extern bool llama_kv_self_seq_rm(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1); - - /// - /// Copy all tokens that belong to the specified sequence to another sequence - /// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence - /// - /// - /// - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern void llama_kv_self_seq_cp(SafeLLamaContextHandle ctx, LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1); - - /// - /// Removes all tokens that do not belong to the specified sequence - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern void llama_kv_self_seq_keep(SafeLLamaContextHandle ctx, LLamaSeqId seq); - - /// - /// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) - /// If the KV cache is RoPEd, the KV data is updated accordingly: - /// - lazily on next llama_decode() - /// - explicitly with llama_kv_self_update() - /// - /// - /// - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern void llama_kv_self_seq_add(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta); - - /// - /// Integer division of the positions by factor of `d > 1` - /// If the KV cache is RoPEd, the KV data is updated accordingly: - /// - lazily on next llama_decode() - /// - explicitly with llama_kv_self_update() - ///
- /// p0 < 0 : [0, p1] - ///
- /// p1 < 0 : [p0, inf) - ///
- /// - /// - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern void llama_kv_self_seq_div(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d); - - /// - /// Returns the largest position present in the KV cache for the specified sequence - /// - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - internal static extern LLamaPos llama_kv_self_seq_pos_max(SafeLLamaContextHandle ctx, LLamaSeqId seq); - /// /// Allocates a batch of tokens on the heap /// Each token can be assigned up to n_seq_max sequence ids diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs index 467dd98e7..e26619b26 100644 --- a/LLama/Native/SafeLLamaContextHandle.cs +++ b/LLama/Native/SafeLLamaContextHandle.cs @@ -147,7 +147,9 @@ static SafeLLamaContextHandle() /// /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + // ReSharper disable InconsistentNaming private static extern unsafe void llama_set_abort_callback(SafeLLamaContextHandle ctx, GgmlAbortCallback abort_callback, void* abort_callback_data); + // ReSharper restore InconsistentNaming /// /// If this returns true computation is cancelled @@ -157,20 +159,27 @@ static SafeLLamaContextHandle() private unsafe delegate bool GgmlAbortCallback(void* data); /// + /// Process a batch of tokens. + /// Requires the context to have a memory. + /// For encode-decoder contexts, processes the batch using the decoder. + /// Positive return values does not mean a fatal error, but rather a warning. + /// Upon fatal-error or abort, the ubatches that managed to be been processed will remain in the memory state of the context + /// To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max() + /// Upon other return values, the memory state is restored to the state before this call + /// 0 - success + /// 1 - could not find a memory slot for the batch (try reducing the size of the batch or increase the context) + /// 2 - aborted (processed ubatches will remain in the context's memory) + /// -1 - invalid input batch + /// < -1 - fatal error (processed ubatches will remain in the context's memory) /// - /// - /// - /// Positive return values does not mean a fatal error, but rather a warning:
- /// - 0: success
- /// - 1: could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
- /// - < 0: error
- ///
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern int llama_decode(SafeLLamaContextHandle ctx, LLamaNativeBatch batch); /// - /// Processes a batch of tokens with the encoder part of the encoder-decoder model. Stores the encoder output - /// internally for later use by the decoder cross-attention layers. + /// Process a batch of tokens. + /// In contrast to llama_decode() - this call does not use KV cache. + /// For encode-decoder contexts, processes the batch using the encoder. + /// Can store the encoder output internally for later use by the decoder's cross-attention layers. /// /// /// @@ -186,7 +195,9 @@ static SafeLLamaContextHandle() /// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + // ReSharper disable InconsistentNaming private static extern void llama_set_n_threads(SafeLLamaContextHandle ctx, int n_threads, int n_threads_batch); + // ReSharper restore InconsistentNaming /// /// Get the number of threads used for generation of a single token. @@ -250,7 +261,7 @@ static SafeLLamaContextHandle() private static extern uint llama_n_ubatch(SafeLLamaContextHandle ctx); /// - /// Returns the **actual** size in bytes of the state (logits, embedding and kv_cache). + /// Returns the **actual** size in bytes of the state (logits, embedding and memory). /// Only use when saving the state, not when restoring it, otherwise the size may be too small. /// /// @@ -280,13 +291,13 @@ static SafeLLamaContextHandle() private static extern unsafe nuint llama_state_set_data(SafeLLamaContextHandle ctx, byte* src, nuint size); /// - /// Get the exact size needed to copy the KV cache of a single sequence + /// Get the exact size needed to copy the state of a single sequence /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern nuint llama_state_seq_get_size(SafeLLamaContextHandle ctx, LLamaSeqId seqId); /// - /// Copy the KV cache of a single sequence into the specified buffer + /// Copy the state of a single sequence into the specified buffer /// /// /// @@ -310,31 +321,6 @@ static SafeLLamaContextHandle() [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern unsafe nuint llama_state_seq_set_data(SafeLLamaContextHandle ctx, byte* src, nuint size, LLamaSeqId destSeqId); - /// - /// Defragment the KV cache. This will be applied: - /// - lazily on next llama_decode() - /// - explicitly with llama_kv_self_update() - /// - /// - /// - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern void llama_kv_self_defrag(SafeLLamaContextHandle ctx); - - /// - /// Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
- /// - /// - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern void llama_kv_self_update(SafeLLamaContextHandle ctx); - - /// - /// Check if the context supports KV cache shifting - /// - /// - /// - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern bool llama_kv_self_can_shift(SafeLLamaContextHandle ctx); - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern LLamaPerfContextTimings llama_perf_context(SafeLLamaContextHandle ctx); @@ -372,7 +358,7 @@ static SafeLLamaContextHandle() /// /// Get the embeddings for a sequence id. /// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE - /// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence + /// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence /// otherwise: float[n_embd] (1-dimensional) /// /// A pointer to the first float in an embedding, length = ctx.EmbeddingSize @@ -388,7 +374,7 @@ static SafeLLamaContextHandle() private static extern unsafe float* llama_get_embeddings_ith(SafeLLamaContextHandle ctx, int i); [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern LLamaKvCacheNative llama_get_kv_self(SafeLLamaContextHandle ctx); + private static extern IntPtr llama_get_memory(SafeLLamaContextHandle ctx); /// /// Set whether the model is in warmup mode or not @@ -580,7 +566,7 @@ public void Synchronize() /// internally for later use by the decoder cross-attention layers. /// /// - /// 0 = success
< 0 = error (the KV cache state is restored to the state before this call)
+ /// 0 = success
< 0 = error (the memory state is restored to the state before this call)
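For illustration only (not taken from this patch), the encoder pass described above runs once over the prompt batch of an encoder-decoder model before any decoding; this minimal sketch only assumes the Encode method and DecodeResult.Ok shown elsewhere in this file:

    // Minimal sketch: run the encoder pass and fail loudly on any non-success result.
    static void EncodePrompt(SafeLLamaContextHandle ctx, LLamaBatch promptBatch)
    {
        // Stores the encoder output internally for the decoder's cross-attention.
        var result = ctx.Encode(promptBatch);
        if (result != DecodeResult.Ok)
            throw new InvalidOperationException($"llama_encode failed with {result}");
    }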
public DecodeResult Encode(LLamaBatch batch) { if (batch.TokenCount == 0) @@ -592,13 +578,19 @@ public DecodeResult Encode(LLamaBatch batch) } /// + /// Process a batch of tokens. + /// Requires the context to have a memory. + /// For encoder-decoder contexts, processes the batch using the decoder. + /// Positive return values do not mean a fatal error, but rather a warning. + /// Upon fatal error or abort, the ubatches that managed to be processed will remain in the memory state of the context. + /// To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max(). + /// Upon other return values, the memory state is restored to the state before this call. + /// 0 - success + /// 1 - could not find a memory slot for the batch (try reducing the size of the batch or increase the context) + /// 2 - aborted (processed ubatches will remain in the context's memory) + /// -1 - invalid input batch + /// < -1 - fatal error (processed ubatches will remain in the context's memory) /// - /// - /// Positive return values does not mean a fatal error, but rather a warning:
- /// - 0: success
- /// - 1: could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
- /// - < 0: error (the KV cache state is restored to the state before this call)
- ///
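By way of illustration (not part of the patch), the return contract documented above could be consumed roughly like this; DecodeResult.NoKvSlot is assumed to be the enum member corresponding to return value 1 and may differ in the real enum:

    // Minimal sketch of handling the three classes of llama_decode outcomes.
    static void DecodeChecked(SafeLLamaContextHandle ctx, LLamaBatch batch)
    {
        var result = ctx.Decode(batch);
        switch (result)
        {
            case DecodeResult.Ok:
                // Logits for the batch entries that requested them are now available.
                break;
            case DecodeResult.NoKvSlot:
                // A warning rather than an error: no memory slot was found for the batch.
                // Retry with a smaller batch, or create the context with a larger n_ctx.
                break;
            default:
                // Negative values are errors (see the return values documented above).
                throw new InvalidOperationException($"llama_decode failed with {result}");
        }
    }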
public DecodeResult Decode(LLamaBatch batch) { if (batch.TokenCount == 0) @@ -617,6 +609,7 @@ public DecodeResult Decode(LLamaBatch batch) /// /// /// A tuple, containing the decode result and the number of tokens that have not been decoded yet. + // ReSharper disable once InconsistentNaming internal (DecodeResult, int) Decode(List<LLamaToken> tokens, LLamaSeqId id, LLamaBatch batch, ref int n_past) { if (tokens.Count == 0) @@ -645,15 +638,21 @@ public DecodeResult Decode(LLamaBatch batch) return (DecodeResult.Ok, 0); } - + /// + /// Process a batch of tokens. + /// Requires the context to have a memory. + /// For encoder-decoder contexts, processes the batch using the decoder. + /// Positive return values do not mean a fatal error, but rather a warning. + /// Upon fatal error or abort, the ubatches that managed to be processed will remain in the memory state of the context. + /// To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max(). + /// Upon other return values, the memory state is restored to the state before this call. + /// 0 - success + /// 1 - could not find a memory slot for the batch (try reducing the size of the batch or increase the context) + /// 2 - aborted (processed ubatches will remain in the context's memory) + /// -1 - invalid input batch + /// < -1 - fatal error (processed ubatches will remain in the context's memory) /// - /// - /// Positive return values does not mean a fatal error, but rather a warning:
- /// - 0: success
- /// - 1: could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
- /// - < 0: error
- ///
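Again purely illustrative (not from the patch): when a decode aborts or fails fatally, the MemorySequenceMinPosition/MemorySequenceMaxPosition helpers introduced below can show how far a sequence actually got before rolling back. The LLamaPos.Value field and the implicit int-to-LLamaPos conversions used here are assumptions about the surrounding types:

    // Minimal rollback sketch after an aborted or fatal decode.
    static void DecodeWithRollback(SafeLLamaContextHandle ctx, LLamaBatch batch, LLamaSeqId seq, LLamaPos lastCommitted)
    {
        var result = ctx.Decode(batch);
        if ((int)result == 2 || (int)result < -1)
        {
            // Abort or fatal error: already-processed ubatches remain in the context's
            // memory, so inspect what is actually there before deciding what to resubmit.
            var minPos = ctx.MemorySequenceMinPosition(seq);
            var maxPos = ctx.MemorySequenceMaxPosition(seq);
            System.Console.WriteLine($"sequence now spans [{minPos.Value}, {maxPos.Value}] in memory");

            // Drop everything after the last position the caller knows was fully committed
            // (a negative p1 removes to the end of the sequence).
            ctx.MemorySequenceRemove(seq, lastCommitted.Value + 1, -1);
        }
    }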
public DecodeResult Decode(LLamaBatchEmbeddings batch) { if (batch.EmbeddingsCount == 0) @@ -675,7 +674,7 @@ public nuint GetStateSize() } /// - /// Get the size of the KV cache for a single sequence ID, when saved as bytes + /// Get the size of the memory state for a single sequence ID, when saved as bytes /// /// /// @@ -759,66 +758,20 @@ public void ResetTimings() } #endregion - #region KV Cache Management - /// - /// Check if the context supports KV cache shifting - /// - public bool KvCacheCanShift => llama_kv_self_can_shift(this); + #region Memory Management /// - /// Apply KV cache updates (such as K-shifts, defragmentation, etc.) + /// Check if the context supports memory shifting /// - public void KvCacheUpdate() - { - llama_kv_self_update(this); - } + public bool MemoryCanShift => NativeApi.llama_memory_can_shift(llama_get_memory(this)); /// - /// Defragment the KV cache. This will be applied: - /// - lazily on next llama_decode() - /// - explicitly with llama_kv_self_update() + /// Clear the memory /// - /// - public void KvCacheDefrag() - { - llama_kv_self_defrag(this); - } - - /// - /// Get a new KV cache view that can be used to debug the KV cache - /// - /// - /// - public LLamaKvCacheViewSafeHandle KvCacheGetDebugView(int maxSequences = 4) - { - return LLamaKvCacheViewSafeHandle.Allocate(this, maxSequences); - } - - /// - /// Count the number of used cells in the KV cache (i.e. have at least one sequence assigned to them) - /// - /// - public int KvCacheCountCells() + /// If true, the data buffers will also be cleared together with the metadata + public void MemoryClear(bool data = true) { - return NativeApi.llama_kv_self_used_cells(this); - } - - /// - /// Returns the number of tokens in the KV cache (slow, use only for debug) - /// If a KV cell has multiple sequences assigned to it, it will be counted multiple times - /// - /// - public int KvCacheCountTokens() - { - return NativeApi.llama_kv_self_n_tokens(this); - } - - /// - /// Clear the KV cache - both cell info is erased and KV data is zeroed - /// - public void KvCacheClear() - { - NativeApi.llama_kv_self_clear(this); + NativeApi.llama_memory_clear(llama_get_memory(this), data); } /// @@ -827,54 +780,52 @@ public void KvCacheClear() /// /// /// - public void KvCacheRemove(LLamaSeqId seq, LLamaPos p0, LLamaPos p1) + public void MemorySequenceRemove(LLamaSeqId seq, LLamaPos p0, LLamaPos p1) { - NativeApi.llama_kv_self_seq_rm(this, seq, p0, p1); + NativeApi.llama_memory_seq_rm(llama_get_memory(this), seq, p0, p1); } /// /// Copy all tokens that belong to the specified sequence to another sequence. 
Note that - /// this does not allocate extra KV cache memory - it simply assigns the tokens to the + /// this does not allocate extra memory - it simply assigns the tokens to the /// new sequence /// /// /// /// /// - public void KvCacheSequenceCopy(LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1) + public void MemorySequenceCopy(LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1) { - NativeApi.llama_kv_self_seq_cp(this, src, dest, p0, p1); + NativeApi.llama_memory_seq_cp(llama_get_memory(this), src, dest, p0, p1); } /// /// Removes all tokens that do not belong to the specified sequence /// /// - public void KvCacheSequenceKeep(LLamaSeqId seq) + public void MemorySequenceKeep(LLamaSeqId seq) { - NativeApi.llama_kv_self_seq_keep(this, seq); + NativeApi.llama_memory_seq_keep(llama_get_memory(this), seq); } /// /// Adds relative position "delta" to all tokens that belong to the specified sequence - /// and have positions in [p0, p1. If the KV cache is RoPEd, the KV data is updated - /// accordingly + /// and have positions in [p0, p1) /// /// /// /// /// - public void KvCacheSequenceAdd(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta) + public void MemorySequenceAdd(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta) { - if (!KvCacheCanShift) - throw new InvalidOperationException("Cannot shift KV cache (KvCacheCanShift=False)"); + if (!MemoryCanShift) + throw new InvalidOperationException("Cannot shift memory (MemoryCanShift == false)"); - NativeApi.llama_kv_self_seq_add(this, seq, p0, p1, delta); + NativeApi.llama_memory_seq_add(llama_get_memory(this), seq, p0, p1, delta); } /// - /// Integer division of the positions by factor of `d > 1`. - /// If the KV cache is RoPEd, the KV data is updated accordingly.
+ /// Integer division of the positions by factor of `d > 1`.
/// p0 < 0 : [0, p1]
/// p1 < 0 : [p0, inf) ///
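As a sketch of how the shift-related members in this region (MemoryCanShift, MemorySequenceRemove, MemorySequenceAdd) fit together for a classic context shift; illustrative only, with the implicit int-to-LLamaPos conversions assumed:

    // Drop the oldest `discard` positions of a sequence (keeping the first `keepFirst`)
    // and slide the surviving tail back so new tokens fit in the context.
    static void ShiftContext(SafeLLamaContextHandle ctx, LLamaSeqId seq, int keepFirst, int discard)
    {
        if (!ctx.MemoryCanShift)
            throw new NotSupportedException("This context's memory does not support shifting");

        var end = ctx.MemorySequenceMaxPosition(seq).Value + 1;

        // Remove [keepFirst, keepFirst + discard) ...
        ctx.MemorySequenceRemove(seq, keepFirst, keepFirst + discard);
        // ... then shift the tail [keepFirst + discard, end) left by `discard` positions.
        ctx.MemorySequenceAdd(seq, keepFirst + discard, end, -discard);
    }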
@@ -882,22 +833,32 @@ public void KvCacheSequenceAdd(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int del /// /// /// - public void KvCacheSequenceDivide(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int divisor) + public void MemorySequenceDivide(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int divisor) { - if (!KvCacheCanShift) - throw new InvalidOperationException("Cannot shift KV cache (KvCacheCanShift=False)"); + if (!MemoryCanShift) + throw new InvalidOperationException("Cannot shift memory (MemoryCanShift == false)"); - NativeApi.llama_kv_self_seq_div(this, seq, p0, p1, divisor); + NativeApi.llama_memory_seq_div(llama_get_memory(this), seq, p0, p1, divisor); + } + + /// + /// Returns the smallest position present in memory for the specified sequence + /// + /// + /// + public LLamaPos MemorySequenceMinPosition(LLamaSeqId seq) + { + return NativeApi.llama_memory_seq_pos_min(llama_get_memory(this), seq); } /// - /// Returns the largest position present in the KV cache for the specified sequence + /// Returns the largest position present in memory for the specified sequence /// /// /// - public LLamaPos KvCacheMaxPosition(LLamaSeqId seq) + public LLamaPos MemorySequenceMaxPosition(LLamaSeqId seq) { - return NativeApi.llama_kv_self_seq_pos_max(this, seq); + return NativeApi.llama_memory_seq_pos_max(llama_get_memory(this), seq); } #endregion } diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index 16336f706..d335a1209 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -2,6 +2,7 @@ using System.Collections.Generic; using System.Diagnostics; using System.IO; +using System.Runtime.CompilerServices; using System.Text; using CommunityToolkit.HighPerformance.Buffers; using LLama.Exceptions; @@ -58,7 +59,12 @@ public sealed class SafeLlamaModelHandle /// /// Get the number of KV heads in this model /// - public int KVHeadCount => llama_model_n_head(this); + public int KVHeadCount => llama_model_n_head_kv(this); + + /// + /// Get the sliding window attention (SWA) window size of this model + /// + public int SWACount => llama_model_n_swa(this); /// /// Returns true if the model contains an encoder that requires llama_encode() call /// @@ -140,6 +146,20 @@ public static SafeLlamaModelHandle LoadFromFile(string modelPath, LLamaModelPara return handle; } + /// + /// Save this model to a file + /// + /// + public void SaveToFile(string modelPath) + { + // If the file already exists, delete it. llama.cpp would overwrite, but doing this in C# has better errors in + // case of inaccessible/read-only files.
if (File.Exists(modelPath)) + File.Delete(modelPath); + + llama_model_save_to_file(this, modelPath); + } + #region native API static SafeLlamaModelHandle() { @@ -324,6 +344,14 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern int llama_model_n_head_kv(SafeLlamaModelHandle model); + /// + /// + /// + /// + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern int llama_model_n_swa(SafeLlamaModelHandle model); + /// /// Get a string describing the model type /// @@ -398,6 +426,27 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern unsafe LLamaVocabNative* llama_model_get_vocab(SafeLlamaModelHandle model); + + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern void llama_model_save_to_file(SafeLlamaModelHandle model, string path); + + /// + /// Returns the number of classifier outputs (only valid for classifier models) + /// Undefined behavior for non-classifier models + /// + /// + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern uint llama_model_n_cls_out(SafeLlamaModelHandle model); + + /// + /// Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided + /// + /// + /// + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern string? llama_model_cls_label(SafeLlamaModelHandle model, uint i); #endregion #region LoRA @@ -771,6 +818,20 @@ public LLamaToken? Pad } } + /// + /// Get the masking token for this model + /// + public LLamaToken? Mask + { + get + { + unsafe + { + return Normalize(LLamaVocabNative.llama_vocab_mask(VocabNative)); + } + } + } + /// /// Get the sentence separator token for this model /// diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec index aeef403eb..db2693270 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec @@ -42,6 +42,13 @@ + + + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec deleted file mode 100644 index 6abd16ccc..000000000 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec +++ /dev/null @@ -1,34 +0,0 @@ - - - - LLamaSharp.Backend.Cuda11.Linux - $version$ - LLamaSharp.Backend.Cuda11.Linux - llama.cpp Authors - false - MIT - icon512.png - https://github.com/SciSharp/LLamaSharp - LLamaSharp.Backend.Cuda11.Linux contains the Linux binaries for LLamaSharp with Cuda11 support. - - Copyright 2023 The llama.cpp Authors. All rights reserved. - LLamaSharp LLama LLM GPT AI ChatBot SciSharp - - - - - - - - - - - - - - - - - - - diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec deleted file mode 100644 index a412e2e6f..000000000 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec +++ /dev/null @@ -1,34 +0,0 @@ - - - - LLamaSharp.Backend.Cuda11.Windows - $version$ - LLamaSharp.Backend.Cuda11.Windows - llama.cpp Authors - false - MIT - icon512.png - https://github.com/SciSharp/LLamaSharp - LLamaSharp.Backend.Cuda11.Windows contains the Windows binaries for LLamaSharp with Cuda11 support. - - Copyright 2023 The llama.cpp Authors. All rights reserved.
- LLamaSharp LLama LLM GPT AI ChatBot SciSharp - - - - - - - - - - - - - - - - - - - diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec deleted file mode 100644 index 5ac473914..000000000 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec +++ /dev/null @@ -1,28 +0,0 @@ - - - - LLamaSharp.Backend.Cuda11 - $version$ - LLamaSharp.Backend.Cuda11, the backend for LLamaSharp - llama.cpp Authors - false - MIT - icon512.png - https://github.com/SciSharp/LLamaSharp - LLamaSharp.Backend.Cuda11 is a backend for LLamaSharp to use with Cuda11. - - Copyright 2023 The llama.cpp Authors. All rights reserved. - LLamaSharp LLama LLM GPT AI ChatBot SciSharp - - - - - - - - - - - - - diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec index 687283221..b372f1e1d 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec @@ -22,12 +22,12 @@ - - - + + + - - + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec index 1fd01edb9..38c003236 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec @@ -22,12 +22,12 @@ - - - + + + - - + + diff --git a/llama.cpp b/llama.cpp index ceda28ef8..11dd5a44e 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit ceda28ef8e310a8dee60bf275077a3eedae8e36c +Subproject commit 11dd5a44eb180e1d69fac24d3852b5222d66fb7f
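Finally, an illustrative sketch (not part of the patch) of how the renamed sequence helpers compose; negative positions follow the "to the end" convention documented above, and the implicit int-to-LLamaPos conversions are assumed:

    // Fork a sequence, prune back to the main one, and optionally wipe the memory.
    static void ForkAndPrune(SafeLLamaContextHandle ctx, LLamaSeqId main, LLamaSeqId fork)
    {
        // Share the main sequence's tokens with a second sequence without copying any
        // data (0 and -1 span the whole sequence).
        ctx.MemorySequenceCopy(main, fork, 0, -1);

        // ... decode speculatively on `fork` here, then keep only the main sequence ...
        ctx.MemorySequenceKeep(main);

        // Or wipe the whole memory: metadata always, data buffers too by default.
        ctx.MemoryClear();
    }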