From b46dc2f2e90792b59c6ceec1e142da41eeb276dc Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sun, 8 Dec 2024 21:09:12 +0100
Subject: [PATCH 1/8] server : fix format_infill

---
 examples/server/server.cpp | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 1c21e55aaa011..1d137a1d66f57 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3482,6 +3482,11 @@ int main(int argc, char ** argv) {
         json data = json::parse(req.body);
 
         // validate input
+        if (data.contains("prompt") && !data.at("prompt").is_string()) {
+            // prompt is optional
+            res_error(res, format_error_response("\"prompt\" must be a string", ERROR_TYPE_INVALID_REQUEST));
+        }
+
         if (!data.contains("input_prefix")) {
             res_error(res, format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST));
         }
@@ -3491,9 +3496,11 @@ int main(int argc, char ** argv) {
         }
 
         if (data.contains("input_extra") && !data.at("input_extra").is_array()) {
+            // input_extra is optional
             res_error(res, format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST));
             return;
         }
+
         json input_extra = json_value(data, "input_extra", json::array());
         for (const auto & chunk : input_extra) {
             // { "text": string, "filename": string }
@@ -3509,6 +3516,21 @@ int main(int argc, char ** argv) {
         }
         data["input_extra"] = input_extra; // default to empty array if it's not exist
 
+        std::string prompt = json_value(data, "prompt", std::string());
+        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
+        SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
+        auto tokens = format_infill(
+            ctx_server.ctx,
+            data.at("input_prefix"),
+            data.at("input_suffix"),
+            data.at("input_extra"),
+            ctx_server.params_base.n_batch,
+            ctx_server.params_base.n_predict,
+            ctx_server.slots[0].n_ctx, // TODO: there should be a better way
+            ctx_server.params_base.spm_infill,
+            tokenized_prompts[0]
+        );
+
         return handle_completions_generic(SERVER_TASK_TYPE_INFILL, data, res);
     };
 

From 6ec3f77a4181cbe2dcad7994e15f315ec10c9019 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sun, 8 Dec 2024 21:11:57 +0100
Subject: [PATCH 2/8] server : store format_infill result in data["prompts"]

---
 examples/server/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 1d137a1d66f57..c58052ef322d2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3519,7 +3519,7 @@ int main(int argc, char ** argv) {
         std::string prompt = json_value(data, "prompt", std::string());
         std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
         SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
-        auto tokens = format_infill(
+        data["prompts"] = format_infill(
             ctx_server.ctx,
             data.at("input_prefix"),
             data.at("input_suffix"),

From a4d25724943b2c4688f433f2c85290328c7e6a68 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sun, 8 Dec 2024 21:12:18 +0100
Subject: [PATCH 3/8] server : rename data["prompts"] to data["prompt"]

---
 examples/server/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c58052ef322d2..c8bee9b430bf4 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3519,7 +3519,7 @@ int main(int argc, char ** argv) {
         std::string prompt = json_value(data, "prompt", std::string());
         std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
         SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
-        data["prompts"] = format_infill(
+        data["prompt"] = format_infill(
             ctx_server.ctx,
             data.at("input_prefix"),
             data.at("input_suffix"),

From ac2ea5382c595e5c444c4d3bfa263e12b3629ba3 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sun, 8 Dec 2024 21:29:05 +0100
Subject: [PATCH 4/8] server : update infill test expectation, add slow Qwen test

---
 examples/server/tests/unit/test_infill.py | 22 +++++++++++++++++++++-
 examples/server/tests/utils.py            |  3 +++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py
index 6a6d40a1cbc8b..4fd2c63c5ad7b 100644
--- a/examples/server/tests/unit/test_infill.py
+++ b/examples/server/tests/unit/test_infill.py
@@ -34,7 +34,7 @@ def test_infill_with_input_extra():
         "input_suffix": "}\n",
     })
     assert res.status_code == 200
-    assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"])
+    assert match_regex("(help|find|band)+", res.body["content"])
 
 
 @pytest.mark.parametrize("input_extra", [
@@ -55,3 +55,23 @@ def test_invalid_input_extra_req(input_extra):
     })
     assert res.status_code == 400
     assert "error" in res.body
+
+
+@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
+def test_with_qwen_model():
+    global server
+    server.model_file = None
+    server.model_hf_repo = "Qwen/CodeQwen1.5-7B-Chat-GGUF"
+    server.model_hf_file = "codeqwen-1_5-7b-chat-q2_k.gguf"
+    server.start(timeout_seconds=600)
+    res = server.make_request("POST", "/infill", data={
+        "prompt": "Complete this",
+        "input_extra": [{
+            "filename": "llama.h",
+            "text": "LLAMA_API int32_t llama_n_threads();\n"
+        }],
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_",
+        "input_suffix": "}\n",
+    })
+    assert res.status_code == 200
+    assert "n_threads" in res.body["content"]
diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
index 69215eaa4ebb7..7c89b9cd37505 100644
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -371,3 +371,6 @@ def match_regex(regex: str, text: str) -> bool:
         ).search(text)
         is not None
     )
+
+def is_slow_test_allowed():
+    return os.environ.get("SLOW_TESTS") == "1" or os.environ.get("SLOW_TESTS") == "ON"

From 5ffc2a02703dd22d612d61c61bd0fc28e2800a14 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sun, 8 Dec 2024 21:36:28 +0100
Subject: [PATCH 5/8] server : use Qwen2.5-Coder-1.5B-Instruct in slow infill test

---
 examples/server/tests/unit/test_infill.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py
index 4fd2c63c5ad7b..af9b00b75d3fe 100644
--- a/examples/server/tests/unit/test_infill.py
+++ b/examples/server/tests/unit/test_infill.py
@@ -61,11 +61,10 @@ def test_invalid_input_extra_req(input_extra):
 def test_with_qwen_model():
     global server
     server.model_file = None
-    server.model_hf_repo = "Qwen/CodeQwen1.5-7B-Chat-GGUF"
-    server.model_hf_file = "codeqwen-1_5-7b-chat-q2_k.gguf"
+    server.model_hf_repo = "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF"
+    server.model_hf_file = "qwen2.5-coder-1.5b-instruct-q2_k.gguf"
     server.start(timeout_seconds=600)
     res = server.make_request("POST", "/infill", data={
-        "prompt": "Complete this",
         "input_extra": [{
             "filename": "llama.h",
             "text": "LLAMA_API int32_t llama_n_threads();\n"
@@ -74,4 +73,4 @@ def test_with_qwen_model():
         "input_suffix": "}\n",
     })
     assert res.status_code == 200
-    assert "n_threads" in res.body["content"]
+    assert res.body["content"].startswith("n_threads")

From d47360e5a284a8e081e91231e232a4eddbedce58 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sun, 8 Dec 2024 22:21:17 +0100
Subject: [PATCH 6/8] server : use ggml-org IQ3_XXS model in infill test, assert exact output

---
 examples/server/tests/unit/test_infill.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py
index af9b00b75d3fe..4b0133406e20f 100644
--- a/examples/server/tests/unit/test_infill.py
+++ b/examples/server/tests/unit/test_infill.py
@@ -61,10 +61,11 @@ def test_invalid_input_extra_req(input_extra):
 def test_with_qwen_model():
     global server
     server.model_file = None
-    server.model_hf_repo = "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF"
-    server.model_hf_file = "qwen2.5-coder-1.5b-instruct-q2_k.gguf"
+    server.model_hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-IQ3_XXS-GGUF"
+    server.model_hf_file = "qwen2.5-coder-1.5b-iq3_xxs-imat.gguf"
     server.start(timeout_seconds=600)
     res = server.make_request("POST", "/infill", data={
+        # "prompt": "Complete this", # FIXME: add more complicated prompt when format_infill is fixed
         "input_extra": [{
             "filename": "llama.h",
             "text": "LLAMA_API int32_t llama_n_threads();\n"
@@ -73,4 +74,4 @@ def test_with_qwen_model():
         "input_suffix": "}\n",
     })
     assert res.status_code == 200
-    assert res.body["content"].startswith("n_threads")
+    assert res.body["content"] == "n_threads();\n    printf(\"Number of threads: %d\\n\", n_threads);\n    return 0;\n"

From 055aa9e2ea16923200244eca138d5c83e983bb89 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sun, 8 Dec 2024 22:53:00 +0100
Subject: [PATCH 7/8] server : pass the line being completed via "prompt" in infill tests

---
 examples/server/tests/unit/test_infill.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py
index 4b0133406e20f..e3527570997ff 100644
--- a/examples/server/tests/unit/test_infill.py
+++ b/examples/server/tests/unit/test_infill.py
@@ -13,28 +13,28 @@ def test_infill_without_input_extra():
     global server
     server.start()
     res = server.make_request("POST", "/infill", data={
-        "prompt": "Complete this",
-        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_",
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
+        "prompt": "    int n_threads = llama_",
         "input_suffix": "}\n",
     })
     assert res.status_code == 200
-    assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"])
+    assert match_regex("(Ann|small|shiny)+", res.body["content"])
 
 
 def test_infill_with_input_extra():
     global server
     server.start()
     res = server.make_request("POST", "/infill", data={
-        "prompt": "Complete this",
         "input_extra": [{
             "filename": "llama.h",
             "text": "LLAMA_API int32_t llama_n_threads();\n"
         }],
-        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_",
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
+        "prompt": "    int n_threads = llama_",
         "input_suffix": "}\n",
     })
     assert res.status_code == 200
-    assert match_regex("(help|find|band)+", res.body["content"])
+    assert match_regex("(Dad|excited|park)+", res.body["content"])
 
 
 @pytest.mark.parametrize("input_extra", [
@@ -65,12 +65,12 @@ def test_with_qwen_model():
     server.model_hf_file = "qwen2.5-coder-1.5b-iq3_xxs-imat.gguf"
     server.start(timeout_seconds=600)
     res = server.make_request("POST", "/infill", data={
-        # "prompt": "Complete this", # FIXME: add more complicated prompt when format_infill is fixed
         "input_extra": [{
             "filename": "llama.h",
             "text": "LLAMA_API int32_t llama_n_threads();\n"
         }],
-        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_",
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
+        "prompt": "    int n_threads = llama_",
         "input_suffix": "}\n",
     })
     assert res.status_code == 200

From 3a81c60698180a516e2b1b894d3a0e2f3bb59177 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sun, 8 Dec 2024 22:58:58 +0100
Subject: [PATCH 8/8] server : pass the line being completed via "prompt" in test_invalid_input_extra_req

---
 examples/server/tests/unit/test_infill.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py
index e3527570997ff..ad4b8192a7875 100644
--- a/examples/server/tests/unit/test_infill.py
+++ b/examples/server/tests/unit/test_infill.py
@@ -48,9 +48,9 @@ def test_invalid_input_extra_req(input_extra):
     global server
     server.start()
     res = server.make_request("POST", "/infill", data={
-        "prompt": "Complete this",
         "input_extra": [input_extra],
-        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_",
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
+        "prompt": "    int n_threads = llama_",
         "input_suffix": "}\n",
     })
     assert res.status_code == 400