Windows arm64 cpu support #1226

Open · wants to merge 7 commits into base: master
84 changes: 77 additions & 7 deletions .github/workflows/compile.yml
@@ -160,12 +160,16 @@ jobs:
include:
- build: 'noavx'
defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
arch: 'x64'
- build: 'avx2'
defines: ''
arch: 'x64'
- build: 'avx'
defines: '-DGGML_AVX2=OFF'
arch: 'x64'
- build: 'avx512'
defines: '-DGGML_AVX512=ON -DGGML_AVX512_VBMI=ON -DGGML_AVX512_VNNI=ON'
arch: 'x64'
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
@@ -187,31 +191,89 @@ jobs:
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\llama.dll
name: llama-bin-win-${{ matrix.build }}-x64.dll
name: llama-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (ggml)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\ggml.dll
name: ggml-bin-win-${{ matrix.build }}-x64.dll
name: ggml-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (ggml-base)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\ggml-base.dll
name: ggml-base-bin-win-${{ matrix.build }}-x64.dll
name: ggml-base-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (ggml-cpu)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\ggml-cpu.dll
name: ggml-cpu-bin-win-${{ matrix.build }}-x64.dll
name: ggml-cpu-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (mtmd)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\mtmd.dll
name: mtmd-bin-win-${{ matrix.build }}-x64.dll
path: .\build\bin\Release\llava_shared.dll
name: llava-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error

compile-windows-arm64:
name: Compile (Windows ARM64)
strategy:
fail-fast: true
matrix:
include:
- build: 'arm64'
defines: '-DCMAKE_GENERATOR_PLATFORM=ARM64 -DGGML_NATIVE=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
arch: 'arm64'
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
with:
repository: ggerganov/llama.cpp
fetch-depth: 0
ref: '${{ github.event.inputs.llama_cpp_commit }}'

- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. ${{ env.COMMON_DEFINE }} ${{ matrix.defines }}
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
tree /f

- name: Upload artifacts (llama)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\llama.dll
name: llama-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (ggml)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\ggml.dll
name: ggml-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (ggml-base)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\ggml-base.dll
name: ggml-base-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (ggml-cpu)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\ggml-cpu.dll
name: ggml-cpu-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error

- name: Upload artifacts (llava)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\llava_shared.dll
name: llava-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error

compile-vulkan:
@@ -603,6 +665,7 @@ jobs:
"compile-linux",
"compile-musl",
"compile-windows",
"compile-windows-arm64",
"compile-vulkan",
"compile-cublas",
"compile-macos",
@@ -617,7 +680,7 @@
- name: Rearrange Files
run: |
# Make all directories at once
mkdir --parents deps/{noavx,avx,avx2,avx512,linux-arm64,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu12.4.0,vulkan,android-arm64-v8a,android-x86,android-x86_64}
mkdir --parents deps/{noavx,avx,avx2,avx512,linux-arm64,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu11.7.1,cu12.2.0,vulkan,android-arm64-v8a,android-x86,android-x86_64,win-arm64}

# Linux
cp artifacts/ggml-bin-linux-noavx-x64.so/libggml.so deps/noavx/libggml.so
@@ -701,6 +764,13 @@ jobs:
cp artifacts/llama-bin-win-avx512-x64.dll/llama.dll deps/avx512/llama.dll
cp artifacts/mtmd-bin-win-avx512-x64.dll/mtmd.dll deps/avx512/mtmd.dll

# Windows ARM64
cp artifacts/ggml-bin-win-arm64-arm64.dll/ggml.dll deps/win-arm64/ggml.dll
cp artifacts/ggml-base-bin-win-arm64-arm64.dll/ggml-base.dll deps/win-arm64/ggml-base.dll
cp artifacts/ggml-cpu-bin-win-arm64-arm64.dll/ggml-cpu.dll deps/win-arm64/ggml-cpu.dll
cp artifacts/llama-bin-win-arm64-arm64.dll/llama.dll deps/win-arm64/llama.dll
cp artifacts/llava-bin-win-arm64-arm64.dll/llava_shared.dll deps/win-arm64/llava_shared.dll

# MacOS
cp artifacts/ggml-bin-osx-arm64.dylib/libggml.dylib deps/osx-arm64/libggml.dylib
cp artifacts/ggml-base-bin-osx-arm64.dylib/libggml-base.dylib deps/osx-arm64/libggml-base.dylib
2 changes: 1 addition & 1 deletion LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs
@@ -119,7 +119,7 @@ public void GlobalCleanup()
{
if(ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property
{
Executor.Context.NativeHandle.KvCacheClear();
Executor.Context.NativeHandle.MemoryClear();
}
}

4 changes: 2 additions & 2 deletions LLama.Examples/Examples/BatchedExecutorSimple.cs
@@ -97,8 +97,8 @@ await AnsiConsole.Live(table).StartAsync(async ctx =>

// A generic error, this is fatal and the batch can no longer be used. This should never occur and generally indicates
// a bug in LLamaSharp, llama.cpp or a hardware error.
if (decodeResult == DecodeResult.Error)
throw new Exception("Unknown error occurred while inferring.");
if (decodeResult != DecodeResult.Ok)
throw new Exception($"Error occurred while inferring: {decodeResult}");

// After inference all of the conversations must be sampled before running inference again.
foreach (var conversationData in conversations)
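For reference, a minimal sketch of the stricter pattern this example adopts, assuming an existing BatchedExecutor named `executor` whose Infer() returns a DecodeResult, as in the example above:

    // Treat any non-Ok result as fatal, not just DecodeResult.Error,
    // and include the enum value in the message to aid diagnosis.
    var decodeResult = await executor.Infer();
    if (decodeResult != DecodeResult.Ok)
        throw new Exception($"Error occurred while inferring: {decodeResult}");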
2 changes: 1 addition & 1 deletion LLama.Examples/Examples/LlavaInteractiveModeExecute.cs
@@ -79,7 +79,7 @@ public static async Task Run()
// When the prompt contains images we clear KV_CACHE to restart conversation
// See:
// https://github.com/ggerganov/llama.cpp/discussions/3620
ex.Context.NativeHandle.KvCacheRemove( LLamaSeqId.Zero, -1, -1 );
ex.Context.NativeHandle.MemorySequenceRemove( LLamaSeqId.Zero, -1, -1 );

int index = 0;
foreach (var path in imagePathsWithCurlyBraces)
54 changes: 0 additions & 54 deletions LLama.Unittest/LLavaWeightsTests.cs

This file was deleted.

9 changes: 9 additions & 0 deletions LLama.Web/Common/ModelOptions.cs
@@ -110,6 +110,15 @@ public class ModelOptions
/// <inheritdoc />
public bool VocabOnly { get; set; }

/// <inheritdoc />
public bool? OpOffload { get; set; }

/// <inheritdoc />
public bool? SwaFull { get; set; }

/// <inheritdoc />
public bool? KVUnified { get; set; }

/// <inheritdoc />
public float? DefragThreshold { get; set; }

24 changes: 22 additions & 2 deletions LLama/Abstractions/IContextParams.cs
@@ -109,8 +109,7 @@ public interface IContextParams
bool FlashAttention { get; }

/// <summary>
/// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt; 0 to disable (default)
/// defragment the KV cache if holes/size &gt; defrag_threshold, Set to <see langword="null"/> or &lt; 0 to disable (default)
/// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt;= 0 to disable (default)
/// </summary>
float? DefragThreshold { get; }

@@ -123,4 +122,25 @@ public interface IContextParams
/// Attention type to use for embeddings
/// </summary>
LLamaAttentionType AttentionType { get; }

/// <summary>
/// Offload host tensor operations to device
/// </summary>
bool? OpOffload { get; }

/// <summary>
/// use a unified buffer across the input sequences when computing the attention.
/// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
/// <br />
/// ref: <a href="https://github.com/ggml-org/llama.cpp/pull/14363">https://github.com/ggml-org/llama.cpp/pull/14363</a>
/// </summary>
bool? KVUnified { get; }

/// <summary>
/// Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
/// </summary>
/// <remarks>Setting to false when n_seq_max > 1 can cause bad performance in some cases
/// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
/// </remarks>
bool? SwaFull { get; }
}
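These three optional flags map straight onto llama.cpp context parameters (see the IContextParamsExtensions changes later in this diff); leaving a property null keeps llama.cpp's default. A minimal usage sketch via ModelParams, which gains matching properties in this PR; the model path is a placeholder:

    using LLama;
    using LLama.Common;

    // Minimal sketch, assuming a local GGUF model at this (placeholder) path.
    var parameters = new ModelParams("models/example.gguf")
    {
        OpOffload = true,   // offload host tensor operations to the device
        KVUnified = false,  // separate per-sequence KV buffers
        SwaFull = true,     // full-size sliding-window-attention cache
    };

    using var weights = LLamaWeights.LoadFromFile(parameters);
    using var context = weights.CreateContext(parameters);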
12 changes: 6 additions & 6 deletions LLama/Batched/Conversation.cs
@@ -84,7 +84,7 @@ public void Dispose()
_disposed = true;

// Remove this conversation from the KV cache
Executor.Context.NativeHandle.KvCacheRemove(ConversationId, -1, -1);
Executor.Context.NativeHandle.MemorySequenceRemove(ConversationId, -1, -1);

// Prevent finalizer from running
GC.SuppressFinalize(this);
@@ -129,7 +129,7 @@ public Conversation Fork()
_forked = true;

// Assign tokens to the new sequence
Executor.Context.NativeHandle.KvCacheSequenceCopy(ConversationId, c.ConversationId, 0, _end);
Executor.Context.NativeHandle.MemorySequenceCopy(ConversationId, c.ConversationId, 0, _end);

return c;
}
@@ -406,7 +406,7 @@ internal KvAccessor(Conversation conversation)
/// <param name="end">End position (exclusive)</param>
public void Remove(LLamaPos start, LLamaPos end)
{
_conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end);
_conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end);
}

/// <summary>
@@ -420,7 +420,7 @@ public void Remove(LLamaPos start, int count)
return;

var end = start.Value + count;
_conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end);
_conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end);
}
#endregion

@@ -435,7 +435,7 @@ public void Remove(LLamaPos start, int count)
/// <param name="delta">Amount to add on to each token position</param>
public void Add(LLamaPos start, LLamaPos end, int delta)
{
_conversation.Executor.Context.NativeHandle.KvCacheSequenceAdd(_conversation.ConversationId, start, end, delta);
_conversation.Executor.Context.NativeHandle.MemorySequenceAdd(_conversation.ConversationId, start, end, delta);
}
#endregion

@@ -452,7 +452,7 @@ public void Divide(LLamaPos start, LLamaPos end, int divisor)
if (divisor <= 0)
throw new ArgumentOutOfRangeException(nameof(divisor));

_conversation.Executor.Context.NativeHandle.KvCacheSequenceDivide(_conversation.ConversationId, start, end, divisor);
_conversation.Executor.Context.NativeHandle.MemorySequenceDivide(_conversation.ConversationId, start, end, divisor);
}
#endregion
}
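The same KvCache* to Memory* rename recurs throughout this file and the rest of the PR, apparently tracking llama.cpp's move from KV-cache-specific entry points to its generic memory API. A rough before/after sketch, assuming an existing LLamaContext `ctx` and sequence id `seq`:

    using LLama;
    using LLama.Native;

    static void ResetSequence(LLamaContext ctx, LLamaSeqId seq)
    {
        // was: ctx.NativeHandle.KvCacheRemove(seq, -1, -1);
        ctx.NativeHandle.MemorySequenceRemove(seq, -1, -1);

        // was: ctx.NativeHandle.KvCacheClear();
        ctx.NativeHandle.MemoryClear();
    }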
2 changes: 1 addition & 1 deletion LLama/ChatSession.cs
@@ -199,7 +199,7 @@ public void LoadSession(SessionState state, bool loadTransforms = true)
}
if (state.ContextState is null)
{
Executor.Context.NativeHandle.KvCacheClear();
Executor.Context.NativeHandle.MemoryClear();
}
else
{
9 changes: 9 additions & 0 deletions LLama/Common/ModelParams.cs
@@ -112,6 +112,15 @@ public record ModelParams
/// <inheritdoc />
public bool VocabOnly { get; set; }

/// <inheritdoc />
public bool? OpOffload { get; set; }

/// <inheritdoc />
public bool? SwaFull { get; set; }

/// <inheritdoc />
public bool? KVUnified { get; set; }

/// <summary>
/// `Encoding` cannot be directly JSON serialized, instead store the name as a string which can
/// </summary>
7 changes: 7 additions & 0 deletions LLama/Extensions/IContextParamsExtensions.cs
@@ -55,6 +55,13 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo

result.n_threads = Threads(@params.Threads);
result.n_threads_batch = Threads(@params.BatchThreads);

if (@params.SwaFull.HasValue)
result.swa_full = @params.SwaFull.Value;
if (@params.OpOffload.HasValue)
result.op_offload = @params.OpOffload.Value;
if (@params.KVUnified.HasValue)
result.kv_unified = @params.KVUnified.Value;
}

private static int Threads(int? value)