From a0a08eedb6a23b31d8783bbb91ede583cbe7933a Mon Sep 17 00:00:00 2001 From: kir-gadjello <111190790+kir-gadjello@users.noreply.github.com> Date: Wed, 22 Nov 2023 02:16:38 -0300 Subject: [PATCH 01/11] Add openai-compatible POST /v1/chat/completions API endpoint to server example --- examples/server/server.cpp | 347 ++++++++++++++++++++++++++++++++++++- 1 file changed, 346 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1f2c55f2dccdf..25c23d30bd65a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -29,6 +29,8 @@ #define SERVER_VERBOSE 1 #endif +#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" + using json = nlohmann::json; struct server_params @@ -63,6 +65,10 @@ static bool server_verbose = false; // base64 utils (TODO: move to common in the future) // +nlohmann::json oaicompat_completion_params_parse( + const nlohmann::json &body); +std::string format_chatml(std::vector messages); + static const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" @@ -377,6 +383,9 @@ struct llama_client_slot bool stopped_eos = false; bool stopped_word = false; bool stopped_limit = false; + + bool oaicompat = false; + std::string oaicompat_model = ""; std::string stopping_word; @@ -676,7 +685,16 @@ struct llama_server_context bool launch_slot_with_data(llama_client_slot* &slot, json data) { slot_params default_params; llama_sampling_params default_sparams; - + + if (data.count("__oaicompat") != 0) { + slot->oaicompat = true; + slot->oaicompat_model = + json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + } else { + slot->oaicompat = false; + slot->oaicompat_model = ""; + } + slot->params.stream = json_value(data, "stream", false); slot->params.cache_prompt = json_value(data, "cache_prompt", false); slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict); @@ -1169,6 +1187,12 @@ struct llama_server_context res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output); } + if (slot.oaicompat) + { + res.result_json["oaicompat_token_ctr"] = slot.n_decoded; + res.result_json["model"] = slot.oaicompat_model; + } + queue_results.push_back(res); } @@ -1216,6 +1240,12 @@ struct llama_server_context res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs); } + if (slot.oaicompat) + { + res.result_json["oaicompat_token_ctr"] = slot.n_decoded; + res.result_json["model"] = slot.oaicompat_model; + } + queue_results.push_back(res); } @@ -2178,6 +2208,249 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } } + +static std::string random_string() { + std::string str( + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + + std::random_device rd; + std::mt19937 generator(rd()); + + std::shuffle(str.begin(), str.end(), generator); + + return str.substr(0, 32); // assumes 32 < number of characters in str +} + +static std::string gen_chatcmplid() { + std::stringstream chatcmplid; + chatcmplid << "chatcmpl-" << random_string(); + return chatcmplid.str(); +} + +std::string format_chatml(std::vector messages) { + + std::ostringstream chatml_msgs; + + // iterate the array + for (auto it = messages.begin(); it != messages.end(); ++it) { + chatml_msgs << "<|im_start|>" + << json_value(*it, "role", std::string("user")) << '\n'; + chatml_msgs << json_value(*it, "content", std::string("")) + << "<|im_end|>\n"; + } + + chatml_msgs << "<|im_start|>assistant" << '\n'; + + return 
chatml_msgs.str(); +} + +/* llama.cpp completion api semantics */ +nlohmann::json oaicompat_completion_params_parse( + const nlohmann::json &body /* openai api json semantics */) { + nlohmann::json llama_params; + + llama_params["__oaicompat"] = true; + + // Map OpenAI parameters to llama.cpp parameters + llama_params["prompt"] = format_chatml( + body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' + llama_params["temperature"] = + json_value(body, "temperature", 0.8); // Default to 0.8 if not provided + llama_params["top_k"] = + json_value(body, "max_tokens", 40); // Default to 40 if not provided + llama_params["top_p"] = + json_value(body, "top_p", 0.95); // Default to 0.95 if not provided + llama_params["n_predict"] = + json_value(body, "max_tokens", -1); // Default to -1 if not provided + llama_params["logit_bias"] = json_value( + body, "logit_bias", + nlohmann::json::object()); // Default to empty object if not provided + llama_params["frequency_penalty"] = json_value( + body, "frequency_penalty", 0.0); // Default to 0.0 if not provided + llama_params["presence_penalty"] = json_value( + body, "presence_penalty", 0.0); // Default to 0.0 if not provided + llama_params["seed"] = json_value(body, "seed", 0); + llama_params["stream"] = + json_value(body, "stream", false); // Default to 0 if not provided + llama_params["mirostat"] = + json_value(body, "mirostat", false); // Default to false if not provided + llama_params["mirostat_tau"] = + json_value(body, "mirostat_tau", 0.0); // Default to 0.0 if not provided + llama_params["mirostat_eta"] = + json_value(body, "mirostat_eta", 0.0); // Default to 0.0 if not provided + llama_params["penalize_nl"] = json_value( + body, "penalize_nl", false); // Default to false if not provided + llama_params["typical_p"] = + json_value(body, "typical_p", 0.0); // Default to 0.0 if not provided + llama_params["repeat_last_n"] = + json_value(body, "repeat_last_n", 0); // Default to 0 if not provided + llama_params["ignore_eos"] = + json_value(body, "ignore_eos", false); // Default to false if not provided + llama_params["tfs_z"] = + json_value(body, "tfs_z", 0.0); // Default to 0.0 if not provided + if (llama_params.count("grammar") != 0) { + llama_params["grammar"] = json_value( + body, "grammar", + nlohmann::json::object()); // Default to empty object if not provided + } + + // Handle 'stop' field + if (body["stop"].is_null()) { + llama_params["stop"] = json::array({}); + } else if (body["stop"].is_string()) { + llama_params["stop"] = json::array({body["stop"].get()}); + } else { + llama_params["stop"] = json_value( + body, "stop", + json::array()); // Default to empty array if not provided + } + + llama_params["stop"].push_back("<|im_end|>"); + + return llama_params; +} + +static json format_final_response_oaicompat(json request, task_result response, + bool streaming = false) { + + json result = response.result_json; + + bool stopped_word = result.count("stopped_word") != 0; + bool stopped_eos = json_value(result, "stopped_eos", false); + int num_tokens_predicted = json_value(result, "tokens_predicted", 0); + int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); + std::string content = json_value(result, "content", std::string("")); + + std::string finish_reason = "length"; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } + + json choices = + streaming ? 
json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}) + : json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"message", json{{"content", content}, + {"role", "assistant"}}}}}); + + std::time_t t = std::time(0); + + json res = + json{{"choices", choices}, + {"created", t}, + {"model", + json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, + {"object", streaming ? "chat.completion.chunk" : "chat.completion"}, + {"usage", + json{{"completion_tokens", num_tokens_predicted}, + {"prompt_tokens", num_prompt_tokens}, + {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, + {"id", gen_chatcmplid()}}; + + if (server_verbose) { + res["__verbose"] = result; + } + + if (result.contains("completion_probabilities")) { + res["completion_probabilities"] = + json_value(result, "completion_probabilities", json::array()); + } + + return res; +} + +static std::vector format_partial_response_oaicompat(task_result response) { + json result = response.result_json; + + if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { + return std::vector({response.result_json}); + } + + bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; + std::string modelname = + json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + + bool stopped_word = json_value(result, "stopped_word", false); + bool stopped_eos = json_value(result, "stopped_eos", false); + bool stopped_limit = json_value(result, "stopped_limit", false); + std::string content = json_value(result, "content", std::string("")); + + std::string finish_reason = ""; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } + if (stopped_limit) { + finish_reason = "length"; + } + + std::time_t t = std::time(0); + + json choices; + + if (!finish_reason.empty()) { + choices = json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}); + } else { + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"role", "assistant"}}}}}); + } else { + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", + json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{{"choices", + json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", content}}}}})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + return std::vector({initial_ret, second_ret}); + } + } else { + // Some idosyncrasy in task processing logic makes several trailing calls + // with empty content, we ignore these at the calee site. 
+ if (content.empty()) { + return std::vector({json::object()}); + } + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); + } + } + + json ret = json{{"choices", choices}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + return std::vector({ret}); +} + static json format_partial_response( llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector &probs ) { @@ -2396,6 +2669,78 @@ int main(int argc, char **argv) } }); + + svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, + httplib::Response &res) { + json data = oaicompat_completion_params_parse(json::parse(req.body)); + + const int task_id = llama.request_completion(data, false, false); + if (!json_value(data, "stream", false)) { + std::string completion_text; + task_result result = llama.next_result(task_id); + + if (!result.error && result.stop) { + json oaicompat_result = format_final_response_oaicompat(data, result); + + res.set_content(oaicompat_result.dump(-1, ' ', false, + json::error_handler_t::replace), + "application/json"); + } else { + res.status = 500; + res.set_content(result.result_json["content"], "text/plain"); + return; + } + } else { + const auto chunked_content_provider = [task_id, &llama](size_t, + httplib::DataSink &sink) { + while (true) { + task_result llama_result = llama.next_result(task_id); + if (!llama_result.error) { + std::vector result_array = format_partial_response_oaicompat( llama_result); + + for (auto it = result_array.begin(); it != result_array.end(); ++it) + { + if (!it->empty()) { + const std::string str = + "data: " + + it->dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", {{"to_send", str}}); + if (!sink.write(str.c_str(), str.size())) { + return false; + } + } + } + if (llama_result.stop) { + break; + } + } else { + const std::string str = + "error: " + + llama_result.result_json.dump(-1, ' ', false, + json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", {{"to_send", str}}); + if (!sink.write(str.c_str(), str.size())) { + return false; + } + break; + } + } + sink.done(); + return true; + }; + + auto on_complete = [task_id, &llama](bool) { + // cancel + llama.request_cancel(task_id); + }; + + res.set_chunked_content_provider("text/event-stream", + chunked_content_provider, on_complete); + } + }); + svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res) { json data = json::parse(req.body); From 2f84f5dc84ba7e190309f87193592468eeb4cd4d Mon Sep 17 00:00:00 2001 From: kir-gadjello <111190790+kir-gadjello@users.noreply.github.com> Date: Wed, 22 Nov 2023 02:40:47 -0300 Subject: [PATCH 02/11] fix code style --- examples/server/server.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 25c23d30bd65a..98552a83139a4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -65,8 +65,8 @@ static bool server_verbose = false; // base64 utils (TODO: move to common in the future) // -nlohmann::json oaicompat_completion_params_parse( - const nlohmann::json &body); +json oaicompat_completion_params_parse( + const json &body); std::string format_chatml(std::vector messages); static const std::string base64_chars = @@ -2245,9 +2245,9 @@ std::string format_chatml(std::vector messages) { } /* llama.cpp completion 
api semantics */ -nlohmann::json oaicompat_completion_params_parse( - const nlohmann::json &body /* openai api json semantics */) { - nlohmann::json llama_params; +json oaicompat_completion_params_parse( + const json &body /* openai api json semantics */) { + json llama_params; llama_params["__oaicompat"] = true; @@ -2264,7 +2264,7 @@ nlohmann::json oaicompat_completion_params_parse( json_value(body, "max_tokens", -1); // Default to -1 if not provided llama_params["logit_bias"] = json_value( body, "logit_bias", - nlohmann::json::object()); // Default to empty object if not provided + json::object()); // Default to empty object if not provided llama_params["frequency_penalty"] = json_value( body, "frequency_penalty", 0.0); // Default to 0.0 if not provided llama_params["presence_penalty"] = json_value( @@ -2291,7 +2291,7 @@ nlohmann::json oaicompat_completion_params_parse( if (llama_params.count("grammar") != 0) { llama_params["grammar"] = json_value( body, "grammar", - nlohmann::json::object()); // Default to empty object if not provided + json::object()); // Default to empty object if not provided } // Handle 'stop' field From af4d68b22d28e9c3bb6fc8467a8872649840adac Mon Sep 17 00:00:00 2001 From: kir-gadjello <111190790+kir-gadjello@users.noreply.github.com> Date: Wed, 22 Nov 2023 03:55:23 -0300 Subject: [PATCH 03/11] Update server README.md --- examples/server/README.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/examples/server/README.md b/examples/server/README.md index a6eda3b32d576..be13529fc03bb 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -234,6 +234,39 @@ node index.js - **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots. +- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served. + + *Options:* + + See (OpenAI Chat Completions API documentation)[https://platform.openai.com/docs/api-reference/chat]. While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such are `mirostat` are supported. + + *Examples:* + + You can use either Python `openai` library with appropriate checkpoints, or raw HTTP requests: + + ```python + openai.api_base = "http://:port" + ``` + + ```shell + curl http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer no-key" \ + -d '{ + "model": "gpt-3.5-turbo", + "messages": [ + { + "role": "system", + "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests." 
+            },
+            {
+                "role": "user",
+                "content": "Write a limerick about python exceptions"
+            }
+        ]
+    }'
+    ```
+
 ## More examples
 
 ### Change system prompt on runtime
From 9ad4d273e11deb2ebed201d846bb97ae0129320e Mon Sep 17 00:00:00 2001
From: kir-gadjello <111190790+kir-gadjello@users.noreply.github.com>
Date: Wed, 22 Nov 2023 04:17:12 -0300
Subject: [PATCH 04/11] Improve server README.md

---
 examples/server/README.md | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index be13529fc03bb..cfc220f5810b3 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -234,19 +234,35 @@ node index.js
 
 - **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
 
-- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served.
+- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc., can be used with this endpoint. Compared to `api_like_OAI.py`, this API implementation does not require a wrapper to be served.
 
     *Options:*
 
-    See (OpenAI Chat Completions API documentation)[https://platform.openai.com/docs/api-reference/chat]. While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such are `mirostat` are supported.
+    See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
 
     *Examples:*
 
-    You can use either Python `openai` library with appropriate checkpoints, or raw HTTP requests:
+    You can use either the Python `openai` library with appropriate checkpoints:
 
     ```python
-    openai.api_base = "http://:port"
+    import openai
+
+    client = openai.OpenAI(
+        base_url="http://localhost:8080/v1", # "http://:port"
+        api_key = "sk-no-key-required"
+    )
+
+    completion = client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
+            {"role": "user", "content": "Write a limerick about python exceptions"}
+        ]
+    )
+
+    print(completion.choices[0].message)
     ```
+    ... 
or raw HTTP requests: ```shell curl http://localhost:8080/v1/chat/completions \ From e1516709f217ac8d342b28e9b9a2c0e74b57310b Mon Sep 17 00:00:00 2001 From: kir-gadjello <111190790+kir-gadjello@users.noreply.github.com> Date: Wed, 22 Nov 2023 22:35:57 -0300 Subject: [PATCH 05/11] Fix server.cpp code style according to review --- examples/server/server.cpp | 549 ++++++++++++++++++------------------- 1 file changed, 267 insertions(+), 282 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 98552a83139a4..54455ad9a7367 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -61,14 +61,14 @@ static bool server_verbose = false; #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) +json oaicompat_completion_params_parse(const json &body); +std::string format_chatml(std::vector messages); + + // // base64 utils (TODO: move to common in the future) // -json oaicompat_completion_params_parse( - const json &body); -std::string format_chatml(std::vector messages); - static const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" @@ -688,8 +688,7 @@ struct llama_server_context if (data.count("__oaicompat") != 0) { slot->oaicompat = true; - slot->oaicompat_model = - json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); } else { slot->oaicompat = false; slot->oaicompat_model = ""; @@ -2209,246 +2208,232 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } -static std::string random_string() { - std::string str( - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); +static std::string random_string() +{ + std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - std::random_device rd; - std::mt19937 generator(rd()); + std::random_device rd; + std::mt19937 generator(rd()); - std::shuffle(str.begin(), str.end(), generator); + std::shuffle(str.begin(), str.end(), generator); - return str.substr(0, 32); // assumes 32 < number of characters in str + return str.substr(0, 32); // assumes 32 < number of characters in str } -static std::string gen_chatcmplid() { - std::stringstream chatcmplid; - chatcmplid << "chatcmpl-" << random_string(); - return chatcmplid.str(); +static std::string gen_chatcmplid() +{ + std::stringstream chatcmplid; + chatcmplid << "chatcmpl-" << random_string(); + return chatcmplid.str(); } -std::string format_chatml(std::vector messages) { - - std::ostringstream chatml_msgs; +std::string format_chatml(std::vector messages) +{ + std::ostringstream chatml_msgs; - // iterate the array - for (auto it = messages.begin(); it != messages.end(); ++it) { - chatml_msgs << "<|im_start|>" - << json_value(*it, "role", std::string("user")) << '\n'; - chatml_msgs << json_value(*it, "content", std::string("")) - << "<|im_end|>\n"; - } + for (auto it = messages.begin(); it != messages.end(); ++it) { + chatml_msgs << "<|im_start|>" + << json_value(*it, "role", std::string("user")) << '\n'; + chatml_msgs << json_value(*it, "content", std::string("")) + << "<|im_end|>\n"; + } - chatml_msgs << "<|im_start|>assistant" << '\n'; + chatml_msgs << "<|im_start|>assistant" << '\n'; - return chatml_msgs.str(); + return chatml_msgs.str(); } /* llama.cpp completion api semantics */ json oaicompat_completion_params_parse( - const json &body 
/* openai api json semantics */) { - json llama_params; - - llama_params["__oaicompat"] = true; - - // Map OpenAI parameters to llama.cpp parameters - llama_params["prompt"] = format_chatml( - body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' - llama_params["temperature"] = - json_value(body, "temperature", 0.8); // Default to 0.8 if not provided - llama_params["top_k"] = - json_value(body, "max_tokens", 40); // Default to 40 if not provided - llama_params["top_p"] = - json_value(body, "top_p", 0.95); // Default to 0.95 if not provided - llama_params["n_predict"] = - json_value(body, "max_tokens", -1); // Default to -1 if not provided - llama_params["logit_bias"] = json_value( - body, "logit_bias", - json::object()); // Default to empty object if not provided - llama_params["frequency_penalty"] = json_value( - body, "frequency_penalty", 0.0); // Default to 0.0 if not provided - llama_params["presence_penalty"] = json_value( - body, "presence_penalty", 0.0); // Default to 0.0 if not provided - llama_params["seed"] = json_value(body, "seed", 0); - llama_params["stream"] = - json_value(body, "stream", false); // Default to 0 if not provided - llama_params["mirostat"] = - json_value(body, "mirostat", false); // Default to false if not provided - llama_params["mirostat_tau"] = - json_value(body, "mirostat_tau", 0.0); // Default to 0.0 if not provided - llama_params["mirostat_eta"] = - json_value(body, "mirostat_eta", 0.0); // Default to 0.0 if not provided - llama_params["penalize_nl"] = json_value( - body, "penalize_nl", false); // Default to false if not provided - llama_params["typical_p"] = - json_value(body, "typical_p", 0.0); // Default to 0.0 if not provided - llama_params["repeat_last_n"] = - json_value(body, "repeat_last_n", 0); // Default to 0 if not provided - llama_params["ignore_eos"] = - json_value(body, "ignore_eos", false); // Default to false if not provided - llama_params["tfs_z"] = - json_value(body, "tfs_z", 0.0); // Default to 0.0 if not provided - if (llama_params.count("grammar") != 0) { - llama_params["grammar"] = json_value( - body, "grammar", - json::object()); // Default to empty object if not provided - } - - // Handle 'stop' field - if (body["stop"].is_null()) { - llama_params["stop"] = json::array({}); - } else if (body["stop"].is_string()) { - llama_params["stop"] = json::array({body["stop"].get()}); - } else { - llama_params["stop"] = json_value( - body, "stop", - json::array()); // Default to empty array if not provided - } - - llama_params["stop"].push_back("<|im_end|>"); - - return llama_params; + const json &body /* openai api json semantics */) +{ + json llama_params; + + llama_params["__oaicompat"] = true; + + // Map OpenAI parameters to llama.cpp parameters + llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' + llama_params["temperature"] = json_value(body, "temperature", 0.8); + llama_params["top_k"] = json_value(body, "max_tokens", 40); + llama_params["top_p"] = json_value(body, "top_p", 0.95); + llama_params["n_predict"] = json_value(body, "max_tokens", -1); + llama_params["logit_bias"] = json_value(body, "logit_bias",json::object()); + llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0); + llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); + llama_params["seed"] = json_value(body, "seed", 0); + llama_params["stream"] =json_value(body, "stream", false); + llama_params["mirostat"] = json_value(body, "mirostat", false); + 
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0); + llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0); + llama_params["penalize_nl"] = json_value(body, "penalize_nl", false); + llama_params["typical_p"] = json_value(body, "typical_p", 0.0); + llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0); + llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); + llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0); + + if (llama_params.count("grammar") != 0) { + llama_params["grammar"] = json_value( + body, "grammar", + json::object()); + } + + // Handle 'stop' field + if (body["stop"].is_null()) { + llama_params["stop"] = json::array({}); + } else if (body["stop"].is_string()) { + llama_params["stop"] = json::array({body["stop"].get()}); + } else { + llama_params["stop"] = json_value( + body, "stop", + json::array()); + } + + // Ensure there is ChatML-specific end sequence among stop words + llama_params["stop"].push_back("<|im_end|>"); + + return llama_params; } static json format_final_response_oaicompat(json request, task_result response, - bool streaming = false) { + bool streaming = false) +{ + json result = response.result_json; + + bool stopped_word = result.count("stopped_word") != 0; + bool stopped_eos = json_value(result, "stopped_eos", false); + int num_tokens_predicted = json_value(result, "tokens_predicted", 0); + int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); + std::string content = json_value(result, "content", std::string("")); - json result = response.result_json; + std::string finish_reason = "length"; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } - bool stopped_word = result.count("stopped_word") != 0; - bool stopped_eos = json_value(result, "stopped_eos", false); - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - std::string content = json_value(result, "content", std::string("")); + json choices = + streaming ? json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}) + : json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"message", json{{"content", content}, + {"role", "assistant"}}}}}); + + std::time_t t = std::time(0); + + json res = + json{{"choices", choices}, + {"created", t}, + {"model", + json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, + {"object", streaming ? "chat.completion.chunk" : "chat.completion"}, + {"usage", + json{{"completion_tokens", num_tokens_predicted}, + {"prompt_tokens", num_prompt_tokens}, + {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, + {"id", gen_chatcmplid()}}; + + if (server_verbose) { + res["__verbose"] = result; + } - std::string finish_reason = "length"; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } + if (result.contains("completion_probabilities")) { + res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); + } - json choices = - streaming ? json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}) - : json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"message", json{{"content", content}, - {"role", "assistant"}}}}}); - - std::time_t t = std::time(0); - - json res = - json{{"choices", choices}, - {"created", t}, - {"model", - json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, - {"object", streaming ? 
"chat.completion.chunk" : "chat.completion"}, - {"usage", - json{{"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, - {"id", gen_chatcmplid()}}; - - if (server_verbose) { - res["__verbose"] = result; - } - - if (result.contains("completion_probabilities")) { - res["completion_probabilities"] = - json_value(result, "completion_probabilities", json::array()); - } - - return res; + return res; } +// return value is vector as there is one case where we might need to generate two responses static std::vector format_partial_response_oaicompat(task_result response) { - json result = response.result_json; - - if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { - return std::vector({response.result_json}); - } - - bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; - std::string modelname = - json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - - bool stopped_word = json_value(result, "stopped_word", false); - bool stopped_eos = json_value(result, "stopped_eos", false); - bool stopped_limit = json_value(result, "stopped_limit", false); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason = ""; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - if (stopped_limit) { - finish_reason = "length"; - } - - std::time_t t = std::time(0); - - json choices; - - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, + json result = response.result_json; + + if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { + return std::vector({response.result_json}); + } + + bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; + std::string modelname = + json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + + bool stopped_word = json_value(result, "stopped_word", false); + bool stopped_eos = json_value(result, "stopped_eos", false); + bool stopped_limit = json_value(result, "stopped_limit", false); + std::string content = json_value(result, "content", std::string("")); + + std::string finish_reason = ""; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } + if (stopped_limit) { + finish_reason = "length"; + } + + std::time_t t = std::time(0); + + json choices; + + if (!finish_reason.empty()) { + choices = json::array({json{{"finish_reason", finish_reason}, {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = json{{"choices", - json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"role", "assistant"} - }}}})}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - json second_ret = json{{"choices", - json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"content", content}}}}})}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - return std::vector({initial_ret, second_ret}); - } + {"delta", json::object()}}}); } else { - // Some idosyncrasy in task processing logic makes several trailing calls - // with empty content, we ignore these at the calee site. 
- if (content.empty()) { - return std::vector({json::object()}); - } - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"role", "assistant"}}}}}); + } else { + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{ + {"choices", json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", content}}} + }})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + return std::vector({initial_ret, second_ret}); + } + } else { + // Some idiosyncrasy in task processing logic makes several trailing calls + // with empty content, we ignore these at the calee site. + if (content.empty()) { + return std::vector({json::object()}); + } + + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); + } } - } - json ret = json{{"choices", choices}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({ret}); + json ret = json{{"choices", choices}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + return std::vector({ret}); } static json format_partial_response( @@ -2670,76 +2655,76 @@ int main(int argc, char **argv) }); - svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, - httplib::Response &res) { - json data = oaicompat_completion_params_parse(json::parse(req.body)); + svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, + httplib::Response &res) + { + json data = oaicompat_completion_params_parse(json::parse(req.body)); - const int task_id = llama.request_completion(data, false, false); - if (!json_value(data, "stream", false)) { - std::string completion_text; - task_result result = llama.next_result(task_id); + const int task_id = llama.request_completion(data, false, false); + + if (!json_value(data, "stream", false)) { + std::string completion_text; + task_result result = llama.next_result(task_id); - if (!result.error && result.stop) { - json oaicompat_result = format_final_response_oaicompat(data, result); + if (!result.error && result.stop) { + json oaicompat_result = format_final_response_oaicompat(data, result); - res.set_content(oaicompat_result.dump(-1, ' ', false, - json::error_handler_t::replace), - "application/json"); - } else { - res.status = 500; - res.set_content(result.result_json["content"], "text/plain"); - return; - } - } else { - const auto chunked_content_provider = [task_id, &llama](size_t, - httplib::DataSink &sink) { - while (true) { - task_result llama_result = llama.next_result(task_id); - if (!llama_result.error) { - std::vector result_array = format_partial_response_oaicompat( llama_result); - - for (auto it = result_array.begin(); it != result_array.end(); ++it) - { - if (!it->empty()) { - const std::string str = - "data: " + - it->dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; - LOG_VERBOSE("data 
stream", {{"to_send", str}}); - if (!sink.write(str.c_str(), str.size())) { - return false; + res.set_content(oaicompat_result.dump(-1, ' ', false, + json::error_handler_t::replace), + "application/json"); + } else { + res.status = 500; + res.set_content(result.result_json["content"], "text/plain"); + return; } - } - } - if (llama_result.stop) { - break; - } - } else { - const std::string str = - "error: " + - llama_result.result_json.dump(-1, ' ', false, - json::error_handler_t::replace) + - "\n\n"; - LOG_VERBOSE("data stream", {{"to_send", str}}); - if (!sink.write(str.c_str(), str.size())) { - return false; - } - break; - } - } - sink.done(); - return true; - }; + } else { + const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) { + while (true) { + task_result llama_result = llama.next_result(task_id); + if (!llama_result.error) { + std::vector result_array = format_partial_response_oaicompat( llama_result); + + for (auto it = result_array.begin(); it != result_array.end(); ++it) + { + if (!it->empty()) { + const std::string str = + "data: " + + it->dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", {{"to_send", str}}); + if (!sink.write(str.c_str(), str.size())) { + return false; + } + } + } + if (llama_result.stop) { + break; + } + } else { + const std::string str = + "error: " + + llama_result.result_json.dump(-1, ' ', false, + json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", {{"to_send", str}}); + if (!sink.write(str.c_str(), str.size())) { + return false; + } + break; + } + } + sink.done(); + return true; + }; - auto on_complete = [task_id, &llama](bool) { - // cancel - llama.request_cancel(task_id); - }; + auto on_complete = [task_id, &llama](bool) { + // cancel request + llama.request_cancel(task_id); + }; - res.set_chunked_content_provider("text/event-stream", - chunked_content_provider, on_complete); - } - }); + res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); + } + }); svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res) { From f25308be5c0659508df712c37ff87725d1a208f7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Nov 2023 10:49:08 +0200 Subject: [PATCH 06/11] server : some style changes --- examples/server/server.cpp | 110 ++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 57 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 54455ad9a7367..9c28c6aef2ec5 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -383,9 +383,9 @@ struct llama_client_slot bool stopped_eos = false; bool stopped_word = false; bool stopped_limit = false; - + bool oaicompat = false; - std::string oaicompat_model = ""; + std::string oaicompat_model; std::string stopping_word; @@ -486,7 +486,7 @@ struct llama_client_slot }; } - void print_timings() { + void print_timings() const { LOG_TEE("\n"); LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed); @@ -685,7 +685,7 @@ struct llama_server_context bool launch_slot_with_data(llama_client_slot* &slot, json data) { slot_params default_params; llama_sampling_params default_sparams; - + if (data.count("__oaicompat") != 0) { slot->oaicompat = true; slot->oaicompat_model = 
json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); @@ -693,7 +693,7 @@ struct llama_server_context slot->oaicompat = false; slot->oaicompat_model = ""; } - + slot->params.stream = json_value(data, "stream", false); slot->params.cache_prompt = json_value(data, "cache_prompt", false); slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict); @@ -1284,7 +1284,7 @@ struct llama_server_context std::lock_guard lock(mutex_tasks); task_server task; task.id = id_gen++; - task.data = data; + task.data = std::move(data); task.infill_mode = infill; task.embedding_mode = embedding; task.type = COMPLETION_TASK; @@ -2252,29 +2252,27 @@ json oaicompat_completion_params_parse( llama_params["__oaicompat"] = true; // Map OpenAI parameters to llama.cpp parameters - llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' - llama_params["temperature"] = json_value(body, "temperature", 0.8); - llama_params["top_k"] = json_value(body, "max_tokens", 40); - llama_params["top_p"] = json_value(body, "top_p", 0.95); - llama_params["n_predict"] = json_value(body, "max_tokens", -1); - llama_params["logit_bias"] = json_value(body, "logit_bias",json::object()); + llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' + llama_params["temperature"] = json_value(body, "temperature", 0.8); + llama_params["top_k"] = json_value(body, "top_k", 40); + llama_params["top_p"] = json_value(body, "top_p", 0.95); + llama_params["n_predict"] = json_value(body, "max_tokens", -1); + llama_params["logit_bias"] = json_value(body, "logit_bias",json::object()); llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0); - llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); - llama_params["seed"] = json_value(body, "seed", 0); - llama_params["stream"] =json_value(body, "stream", false); - llama_params["mirostat"] = json_value(body, "mirostat", false); - llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0); - llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0); - llama_params["penalize_nl"] = json_value(body, "penalize_nl", false); - llama_params["typical_p"] = json_value(body, "typical_p", 0.0); - llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0); - llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); - llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0); - + llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); + llama_params["seed"] = json_value(body, "seed", 0); + llama_params["stream"] = json_value(body, "stream", false); + llama_params["mirostat"] = json_value(body, "mirostat", false); + llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0); + llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0); + llama_params["penalize_nl"] = json_value(body, "penalize_nl", false); + llama_params["typical_p"] = json_value(body, "typical_p", 0.0); + llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0); + llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); + llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0); + if (llama_params.count("grammar") != 0) { - llama_params["grammar"] = json_value( - body, "grammar", - json::object()); + llama_params["grammar"] = json_value(body, "grammar", json::object()); } // Handle 'stop' field @@ -2287,23 +2285,22 @@ json oaicompat_completion_params_parse( body, "stop", json::array()); } - + // 
Ensure there is ChatML-specific end sequence among stop words llama_params["stop"].push_back("<|im_end|>"); return llama_params; } -static json format_final_response_oaicompat(json request, task_result response, - bool streaming = false) +static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false) { json result = response.result_json; - bool stopped_word = result.count("stopped_word") != 0; - bool stopped_eos = json_value(result, "stopped_eos", false); + bool stopped_word = result.count("stopped_word") != 0; + bool stopped_eos = json_value(result, "stopped_eos", false); int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - std::string content = json_value(result, "content", std::string("")); + int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); + std::string content = json_value(result, "content", std::string("")); std::string finish_reason = "length"; if (stopped_word || stopped_eos) { @@ -2314,10 +2311,10 @@ static json format_final_response_oaicompat(json request, task_result response, streaming ? json::array({json{{"finish_reason", finish_reason}, {"index", 0}, {"delta", json::object()}}}) - : json::array({json{{"finish_reason", finish_reason}, + : json::array({json{{"finish_reason", finish_reason}, {"index", 0}, {"message", json{{"content", content}, - {"role", "assistant"}}}}}); + {"role", "assistant"}}}}}); std::time_t t = std::time(0); @@ -2345,7 +2342,7 @@ static json format_final_response_oaicompat(json request, task_result response, } // return value is vector as there is one case where we might need to generate two responses -static std::vector format_partial_response_oaicompat(task_result response) { +static std::vector format_partial_response_oaicompat(const task_result &response) { json result = response.result_json; if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { @@ -2353,15 +2350,14 @@ static std::vector format_partial_response_oaicompat(task_result response) } bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; - std::string modelname = - json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - bool stopped_word = json_value(result, "stopped_word", false); - bool stopped_eos = json_value(result, "stopped_eos", false); - bool stopped_limit = json_value(result, "stopped_limit", false); + bool stopped_word = json_value(result, "stopped_word", false); + bool stopped_eos = json_value(result, "stopped_eos", false); + bool stopped_limit = json_value(result, "stopped_limit", false); std::string content = json_value(result, "content", std::string("")); - std::string finish_reason = ""; + std::string finish_reason; if (stopped_word || stopped_eos) { finish_reason = "stop"; } @@ -2383,7 +2379,7 @@ static std::vector format_partial_response_oaicompat(task_result response) choices = json::array({json{{"finish_reason", nullptr}, {"index", 0}, {"delta", json{{"role", "assistant"}}}}}); - } else { + } else { // We have to send this as two updates to conform to openai behavior json initial_ret = json{{"choices", json::array({json{ {"finish_reason", nullptr}, @@ -2400,13 +2396,13 @@ static std::vector format_partial_response_oaicompat(task_result response) {"choices", json::array({json{{"finish_reason", nullptr}, {"index", 0}, {"delta", json{ - {"content", content}}} + {"content", 
content}}} }})}, {"created", t}, {"id", gen_chatcmplid()}, {"model", modelname}, {"object", "chat.completion.chunk"}}; - + return std::vector({initial_ret, second_ret}); } } else { @@ -2612,9 +2608,9 @@ int main(int argc, char **argv) task_result result = llama.next_result(task_id); if (!result.error) { const std::string str = - "data: " + - result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; + "data: " + + result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; LOG_VERBOSE("data stream", { { "to_send", str } }); @@ -2627,9 +2623,9 @@ int main(int argc, char **argv) } } else { const std::string str = - "error: " + - result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; + "error: " + + result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; LOG_VERBOSE("data stream", { { "to_send", str } }); @@ -2655,13 +2651,13 @@ int main(int argc, char **argv) }); - svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, - httplib::Response &res) + // TODO: add mount point without "/v1" prefix -- how? + svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res) { json data = oaicompat_completion_params_parse(json::parse(req.body)); const int task_id = llama.request_completion(data, false, false); - + if (!json_value(data, "stream", false)) { std::string completion_text; task_result result = llama.next_result(task_id); @@ -2683,7 +2679,7 @@ int main(int argc, char **argv) task_result llama_result = llama.next_result(task_id); if (!llama_result.error) { std::vector result_array = format_partial_response_oaicompat( llama_result); - + for (auto it = result_array.begin(); it != result_array.end(); ++it) { if (!it->empty()) { @@ -2725,7 +2721,7 @@ int main(int argc, char **argv) res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); } }); - + svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res) { json data = json::parse(req.body); From b94b10914cb950d2be00dd0de4003eb980722b30 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Nov 2023 11:00:15 +0200 Subject: [PATCH 07/11] server : indentation --- examples/server/server.cpp | 54 +++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index db53b3b84e6b9..2fe2678fd3bf9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2678,38 +2678,38 @@ int main(int argc, char **argv) } else { const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) { while (true) { - task_result llama_result = llama.next_result(task_id); - if (!llama_result.error) { - std::vector result_array = format_partial_response_oaicompat( llama_result); + task_result llama_result = llama.next_result(task_id); + if (!llama_result.error) { + std::vector result_array = format_partial_response_oaicompat( llama_result); - for (auto it = result_array.begin(); it != result_array.end(); ++it) - { - if (!it->empty()) { - const std::string str = - "data: " + - it->dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; - LOG_VERBOSE("data stream", {{"to_send", str}}); - if (!sink.write(str.c_str(), str.size())) { - return false; + for (auto it = result_array.begin(); it != result_array.end(); ++it) + { + if (!it->empty()) { + const std::string str = + "data: " + + it->dump(-1, ' ', false, 
json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", {{"to_send", str}}); + if (!sink.write(str.c_str(), str.size())) { + return false; + } } } - } - if (llama_result.stop) { + if (llama_result.stop) { + break; + } + } else { + const std::string str = + "error: " + + llama_result.result_json.dump(-1, ' ', false, + json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", {{"to_send", str}}); + if (!sink.write(str.c_str(), str.size())) { + return false; + } break; } - } else { - const std::string str = - "error: " + - llama_result.result_json.dump(-1, ' ', false, - json::error_handler_t::replace) + - "\n\n"; - LOG_VERBOSE("data stream", {{"to_send", str}}); - if (!sink.write(str.c_str(), str.size())) { - return false; - } - break; - } } sink.done(); return true; From c544faed749240fe5eac2bc042087c71f79a0728 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Nov 2023 11:10:23 +0200 Subject: [PATCH 08/11] server : enable special tokens during tokenization by default --- examples/server/server.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2fe2678fd3bf9..7e4ad53b2dbf9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -618,6 +618,11 @@ struct llama_server_context std::vector tokenize(const json & json_prompt, bool add_bos) const { + // TODO: currently, we tokenize using special tokens by default + // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216) + // but it's better compared to completely ignoring ChatML and other chat templates + const bool TMP_FORCE_SPECIAL = true; + // If `add_bos` is true, we only add BOS, when json_prompt is a string, // or the first element of the json_prompt array is a string. 
std::vector prompt_tokens; @@ -633,12 +638,12 @@ struct llama_server_context std::vector p; if (first) { - p = ::llama_tokenize(ctx, s, add_bos); + p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); first = false; } else { - p = ::llama_tokenize(ctx, s, false); + p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); } prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); } @@ -655,7 +660,7 @@ struct llama_server_context else { auto s = json_prompt.template get(); - prompt_tokens = ::llama_tokenize(ctx, s, add_bos); + prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); } return prompt_tokens; @@ -2235,7 +2240,7 @@ std::string format_chatml(std::vector messages) for (auto it = messages.begin(); it != messages.end(); ++it) { chatml_msgs << "<|im_start|>" - << json_value(*it, "role", std::string("user")) << '\n'; + << json_value(*it, "role", std::string("user")) << '\n'; chatml_msgs << json_value(*it, "content", std::string("")) << "<|im_end|>\n"; } From b3e88bf494c840770aef1870f329ab6d7fc92702 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Nov 2023 11:33:49 +0200 Subject: [PATCH 09/11] server : minor code style --- examples/server/server.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7e4ad53b2dbf9..cb5caf3ffbc2c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2288,9 +2288,7 @@ json oaicompat_completion_params_parse( } else if (body["stop"].is_string()) { llama_params["stop"] = json::array({body["stop"].get()}); } else { - llama_params["stop"] = json_value( - body, "stop", - json::array()); + llama_params["stop"] = json_value(body, "stop", json::array()); } // Ensure there is ChatML-specific end sequence among stop words From b61631426b6029bb9106695be6598c29e92d4c86 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Nov 2023 11:39:03 +0200 Subject: [PATCH 10/11] server : change random string generator --- examples/server/server.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index cb5caf3ffbc2c..b5f3f38d4af43 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2217,14 +2217,18 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, static std::string random_string() { - std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); std::random_device rd; std::mt19937 generator(rd()); - std::shuffle(str.begin(), str.end(), generator); + std::string result(32, ' '); - return str.substr(0, 32); // assumes 32 < number of characters in str + for (int i = 0; i < 32; ++i) { + result[i] = str[generator() % str.size()]; + } + + return result; } static std::string gen_chatcmplid() From 21b70babf759f30b52cef6cb4ef1a206e56ca70e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobi=20L=C3=BCtke?= Date: Fri, 24 Nov 2023 11:22:39 -0500 Subject: [PATCH 11/11] straightforward /v1/models endpoint --- examples/server/server.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b5f3f38d4af43..50f124b13e849 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2660,6 +2660,26 @@ int main(int argc, char **argv) }); + + svr.Get("/v1/models", [¶ms](const httplib::Request&, httplib::Response& 
res) + { + std::time_t t = std::time(0); + + json models = { + {"object", "list"}, + {"data", { + { + {"id", params.model_alias}, + {"object", "model"}, + {"created", t}, + {"owned_by", "llamacpp"} + }, + }} + }; + + res.set_content(models.dump(), "application/json"); + }); + // TODO: add mount point without "/v1" prefix -- how? svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res) {