Windows arm64 cpu support #1226

Open · wants to merge 7 commits into base: master
84 changes: 77 additions & 7 deletions .github/workflows/compile.yml
@@ -160,12 +160,16 @@ jobs:
include:
- build: 'noavx'
defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
arch: 'x64'
- build: 'avx2'
defines: ''
arch: 'x64'
- build: 'avx'
defines: '-DGGML_AVX2=OFF'
arch: 'x64'
- build: 'avx512'
defines: '-DGGML_AVX512=ON -DGGML_AVX512_VBMI=ON -DGGML_AVX512_VNNI=ON'
arch: 'x64'
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
@@ -187,31 +191,89 @@ jobs:
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\llama.dll
name: llama-bin-win-${{ matrix.build }}-x64.dll
name: llama-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (ggml)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\ggml.dll
name: ggml-bin-win-${{ matrix.build }}-x64.dll
name: ggml-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (ggml-base)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\ggml-base.dll
name: ggml-base-bin-win-${{ matrix.build }}-x64.dll
name: ggml-base-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (ggml-cpu)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\ggml-cpu.dll
name: ggml-cpu-bin-win-${{ matrix.build }}-x64.dll
name: ggml-cpu-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (mtmd)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\mtmd.dll
name: mtmd-bin-win-${{ matrix.build }}-x64.dll
path: .\build\bin\Release\llava_shared.dll
name: llava-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error

compile-windows-arm64:
name: Compile (Windows ARM64)
strategy:
fail-fast: true
matrix:
include:
- build: 'arm64'
defines: '-DCMAKE_GENERATOR_PLATFORM=ARM64 -DGGML_NATIVE=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
arch: 'arm64'
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
with:
repository: ggerganov/llama.cpp
fetch-depth: 0
ref: '${{ github.event.inputs.llama_cpp_commit }}'

- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. ${{ env.COMMON_DEFINE }} ${{ matrix.defines }}
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
tree /f

- name: Upload artifacts (llama)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\llama.dll
name: llama-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (ggml)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\ggml.dll
name: ggml-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (ggml-base)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\ggml-base.dll
name: ggml-base-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error
- name: Upload artifacts (ggml-cpu)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\ggml-cpu.dll
name: ggml-cpu-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error

- name: Upload artifacts (llava)
uses: actions/upload-artifact@v4
with:
path: .\build\bin\Release\llava_shared.dll
name: llava-bin-win-${{ matrix.build }}-${{ matrix.arch }}.dll
if-no-files-found: error

compile-vulkan:
@@ -603,6 +665,7 @@ jobs:
"compile-linux",
"compile-musl",
"compile-windows",
"compile-windows-arm64",
"compile-vulkan",
"compile-cublas",
"compile-macos",
@@ -617,7 +680,7 @@
- name: Rearrange Files
run: |
# Make all directories at once
mkdir --parents deps/{noavx,avx,avx2,avx512,linux-arm64,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu12.4.0,vulkan,android-arm64-v8a,android-x86,android-x86_64}
mkdir --parents deps/{noavx,avx,avx2,avx512,linux-arm64,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu11.7.1,cu12.2.0,vulkan,android-arm64-v8a,android-x86,android-x86_64,win-arm64}

# Linux
cp artifacts/ggml-bin-linux-noavx-x64.so/libggml.so deps/noavx/libggml.so
@@ -701,6 +764,13 @@ jobs:
cp artifacts/llama-bin-win-avx512-x64.dll/llama.dll deps/avx512/llama.dll
cp artifacts/mtmd-bin-win-avx512-x64.dll/mtmd.dll deps/avx512/mtmd.dll

# Windows ARM64
cp artifacts/ggml-bin-win-arm64-arm64.dll/ggml.dll deps/win-arm64/ggml.dll
cp artifacts/ggml-base-bin-win-arm64-arm64.dll/ggml-base.dll deps/win-arm64/ggml-base.dll
cp artifacts/ggml-cpu-bin-win-arm64-arm64.dll/ggml-cpu.dll deps/win-arm64/ggml-cpu.dll
cp artifacts/llama-bin-win-arm64-arm64.dll/llama.dll deps/win-arm64/llama.dll
cp artifacts/llava-bin-win-arm64-arm64.dll/llava_shared.dll deps/win-arm64/llava_shared.dll

# MacOS
cp artifacts/ggml-bin-osx-arm64.dylib/libggml.dylib deps/osx-arm64/libggml.dylib
cp artifacts/ggml-base-bin-osx-arm64.dylib/libggml-base.dylib deps/osx-arm64/libggml-base.dylib
2 changes: 1 addition & 1 deletion LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs
@@ -119,7 +119,7 @@ public void GlobalCleanup()
{
if(ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property
{
Executor.Context.NativeHandle.KvCacheClear();
Executor.Context.NativeHandle.MemoryClear();
}
}

4 changes: 2 additions & 2 deletions LLama.Examples/Examples/BatchedExecutorSimple.cs
@@ -97,8 +97,8 @@ await AnsiConsole.Live(table).StartAsync(async ctx =>

// A generic error, this is fatal and the batch can no longer be used. This should never occur and generally indicates
// a bug in LLamaSharp, llama.cpp or a hardware error.
if (decodeResult == DecodeResult.Error)
throw new Exception("Unknown error occurred while inferring.");
if (decodeResult != DecodeResult.Ok)
throw new Exception($"Error occurred while inferring: {decodeResult}");

// After inference all of the conversations must be sampled before running inference again.
foreach (var conversationData in conversations)
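For reference, a minimal sketch of the stricter pattern this example adopts, assuming an existing BatchedExecutor named `executor` whose Infer() returns a DecodeResult, as in the example above:

    // Treat any non-Ok result as fatal, not just DecodeResult.Error,
    // and include the enum value in the message to aid diagnosis.
    var decodeResult = await executor.Infer();
    if (decodeResult != DecodeResult.Ok)
        throw new Exception($"Error occurred while inferring: {decodeResult}");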
2 changes: 1 addition & 1 deletion LLama.Examples/Examples/LlavaInteractiveModeExecute.cs
@@ -79,7 +79,7 @@ public static async Task Run()
// When the prompt contains images we clear KV_CACHE to restart conversation
// See:
// https://github.com/ggerganov/llama.cpp/discussions/3620
ex.Context.NativeHandle.KvCacheRemove( LLamaSeqId.Zero, -1, -1 );
ex.Context.NativeHandle.MemorySequenceRemove( LLamaSeqId.Zero, -1, -1 );

int index = 0;
foreach (var path in imagePathsWithCurlyBraces)
54 changes: 0 additions & 54 deletions LLama.Unittest/LLavaWeightsTests.cs

This file was deleted.

9 changes: 9 additions & 0 deletions LLama.Web/Common/ModelOptions.cs
@@ -110,6 +110,15 @@ public class ModelOptions
/// <inheritdoc />
public bool VocabOnly { get; set; }

/// <inheritdoc />
public bool? OpOffload { get; set; }

/// <inheritdoc />
public bool? SwaFull { get; set; }

/// <inheritdoc />
public bool? KVUnified { get; set; }

/// <inheritdoc />
public float? DefragThreshold { get; set; }

24 changes: 22 additions & 2 deletions LLama/Abstractions/IContextParams.cs
@@ -109,8 +109,7 @@ public interface IContextParams
bool FlashAttention { get; }

/// <summary>
/// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt; 0 to disable (default)
/// defragment the KV cache if holes/size &gt; defrag_threshold, Set to <see langword="null"/> or &lt; 0 to disable (default)
/// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt;= 0 to disable (default)
/// </summary>
float? DefragThreshold { get; }

@@ -123,4 +122,25 @@ public interface IContextParams
/// Attention type to use for embeddings
/// </summary>
LLamaAttentionType AttentionType { get; }

/// <summary>
/// Offload host tensor operations to device
/// </summary>
bool? OpOffload { get; }

/// <summary>
/// use a unified buffer across the input sequences when computing the attention.
/// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
/// <br />
/// ref: <a href="https://github.com/ggml-org/llama.cpp/pull/14363">https://github.com/ggml-org/llama.cpp/pull/14363</a>
/// </summary>
bool? KVUnified { get; }

/// <summary>
/// Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
/// </summary>
/// <remarks>Setting to false when n_seq_max > 1 can cause bad performance in some cases
/// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
/// </remarks>
bool? SwaFull { get; }
}
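These three optional flags map straight onto llama.cpp context parameters (see the IContextParamsExtensions changes later in this diff); leaving a property null keeps llama.cpp's default. A minimal usage sketch via ModelParams, which gains matching properties in this PR; the model path is a placeholder:

    using LLama;
    using LLama.Common;

    // Minimal sketch, assuming a local GGUF model at this (placeholder) path.
    var parameters = new ModelParams("models/example.gguf")
    {
        OpOffload = true,   // offload host tensor operations to the device
        KVUnified = false,  // separate per-sequence KV buffers
        SwaFull = true,     // full-size sliding-window-attention cache
    };

    using var weights = LLamaWeights.LoadFromFile(parameters);
    using var context = weights.CreateContext(parameters);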
12 changes: 6 additions & 6 deletions LLama/Batched/Conversation.cs
@@ -84,7 +84,7 @@ public void Dispose()
_disposed = true;

// Remove this conversation from the KV cache
Executor.Context.NativeHandle.KvCacheRemove(ConversationId, -1, -1);
Executor.Context.NativeHandle.MemorySequenceRemove(ConversationId, -1, -1);

// Prevent finalizer from running
GC.SuppressFinalize(this);
@@ -129,7 +129,7 @@ public Conversation Fork()
_forked = true;

// Assign tokens to the new sequence
Executor.Context.NativeHandle.KvCacheSequenceCopy(ConversationId, c.ConversationId, 0, _end);
Executor.Context.NativeHandle.MemorySequenceCopy(ConversationId, c.ConversationId, 0, _end);

return c;
}
@@ -406,7 +406,7 @@ internal KvAccessor(Conversation conversation)
/// <param name="end">End position (exclusive)</param>
public void Remove(LLamaPos start, LLamaPos end)
{
_conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end);
_conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end);
}

/// <summary>
@@ -420,7 +420,7 @@ public void Remove(LLamaPos start, int count)
return;

var end = start.Value + count;
_conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end);
_conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end);
}
#endregion

@@ -435,7 +435,7 @@ public void Remove(LLamaPos start, int count)
/// <param name="delta">Amount to add on to each token position</param>
public void Add(LLamaPos start, LLamaPos end, int delta)
{
_conversation.Executor.Context.NativeHandle.KvCacheSequenceAdd(_conversation.ConversationId, start, end, delta);
_conversation.Executor.Context.NativeHandle.MemorySequenceAdd(_conversation.ConversationId, start, end, delta);
}
#endregion

@@ -452,7 +452,7 @@ public void Divide(LLamaPos start, LLamaPos end, int divisor)
if (divisor <= 0)
throw new ArgumentOutOfRangeException(nameof(divisor));

_conversation.Executor.Context.NativeHandle.KvCacheSequenceDivide(_conversation.ConversationId, start, end, divisor);
_conversation.Executor.Context.NativeHandle.MemorySequenceDivide(_conversation.ConversationId, start, end, divisor);
}
#endregion
}
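The same KvCache* to Memory* rename recurs throughout this file and the rest of the PR, apparently tracking llama.cpp's move from KV-cache-specific entry points to its generic memory API. A rough before/after sketch, assuming an existing LLamaContext `ctx` and sequence id `seq`:

    using LLama;
    using LLama.Native;

    static void ResetSequence(LLamaContext ctx, LLamaSeqId seq)
    {
        // was: ctx.NativeHandle.KvCacheRemove(seq, -1, -1);
        ctx.NativeHandle.MemorySequenceRemove(seq, -1, -1);

        // was: ctx.NativeHandle.KvCacheClear();
        ctx.NativeHandle.MemoryClear();
    }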
2 changes: 1 addition & 1 deletion LLama/ChatSession.cs
@@ -199,7 +199,7 @@ public void LoadSession(SessionState state, bool loadTransforms = true)
}
if (state.ContextState is null)
{
Executor.Context.NativeHandle.KvCacheClear();
Executor.Context.NativeHandle.MemoryClear();
}
else
{
9 changes: 9 additions & 0 deletions LLama/Common/ModelParams.cs
@@ -112,6 +112,15 @@ public record ModelParams
/// <inheritdoc />
public bool VocabOnly { get; set; }

/// <inheritdoc />
public bool? OpOffload { get; set; }

/// <inheritdoc />
public bool? SwaFull { get; set; }

/// <inheritdoc />
public bool? KVUnified { get; set; }

/// <summary>
/// `Encoding` cannot be directly JSON serialized, instead store the name as a string which can
/// </summary>
7 changes: 7 additions & 0 deletions LLama/Extensions/IContextParamsExtensions.cs
@@ -55,6 +55,13 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo

result.n_threads = Threads(@params.Threads);
result.n_threads_batch = Threads(@params.BatchThreads);

if (@params.SwaFull.HasValue)
result.swa_full = @params.SwaFull.Value;
if (@params.OpOffload.HasValue)
result.op_offload = @params.OpOffload.Value;
if (@params.KVUnified.HasValue)
result.kv_unified = @params.KVUnified.Value;
}

private static int Threads(int? value)