
Refactor lora adapter support #8332

Merged
merged 42 commits into master from xsn/fix_lora on Jul 15, 2024
Changes from 20 commits

Commits (42):
67c5e14
lora: load to devide buft
ngxson Jul 6, 2024
e9d7b6c
add patch tensor function
ngxson Jul 6, 2024
4e28ad4
correct tensor patch
ngxson Jul 6, 2024
1b4ffba
llama_lora_adapter_apply
ngxson Jul 6, 2024
b88ce0f
correct ggml_backend_tensor_copy
ngxson Jul 6, 2024
f6d090d
add llm_build_mm
ngxson Jul 7, 2024
a1666aa
Merge branch 'master' into xsn/fix_lora
ngxson Jul 7, 2024
30faf1f
fix auto merge
ngxson Jul 7, 2024
79e2982
update based on review comments
ngxson Jul 8, 2024
847135a
add convert script
ngxson Jul 8, 2024
712fecb
no more transpose A
ngxson Jul 8, 2024
84288ff
add f16 convert
ngxson Jul 8, 2024
41ced24
Merge branch 'master' into xsn/fix_lora
ngxson Jul 8, 2024
0e16188
add metadata check
ngxson Jul 8, 2024
6c617e2
add sanity check
ngxson Jul 8, 2024
7a83f20
fix ftype
ngxson Jul 8, 2024
d52455f
add requirements
ngxson Jul 8, 2024
802565c
fix requirements
ngxson Jul 8, 2024
95b3eb0
fix outfile
ngxson Jul 8, 2024
03d24ca
Merge pull request #8 from ngxson/xsn/fix_lora_convert
ngxson Jul 8, 2024
ee2b35c
conversion: only allow selected models
ngxson Jul 9, 2024
713665d
fix types
ngxson Jul 9, 2024
f15167a
cuda : do not use dmmv if the tensor does not have enough cols
slaren Jul 10, 2024
9841fbd
llama : lora fixes
slaren Jul 10, 2024
4fe0861
Merge pull request #9 from ggerganov/sl/fix_fix_lora
ngxson Jul 10, 2024
1faf7e5
do not disable mmap with lora
ngxson Jul 10, 2024
e68344c
Merge branch 'master' into xsn/fix_lora
ngxson Jul 10, 2024
916e959
llm_build_lora_mm_id
ngxson Jul 10, 2024
9d96328
convert_lora : MoE LoRA conversion support
compilade Jul 9, 2024
8956543
convert_hf : simplify modify_tensors for InternLM2
compilade Jul 15, 2024
87301bd
llama : use llm_build_lora_mm in most model graphs
compilade Jul 15, 2024
703573f
Merge branch 'master' into xsn/fix_lora
ngxson Jul 15, 2024
42415a4
auto scale
ngxson Jul 15, 2024
5b18118
Revert "auto scale"
ngxson Jul 15, 2024
f68d092
remove redundant params
ngxson Jul 15, 2024
b704448
Merge branch 'master' into xsn/fix_lora
ngxson Jul 15, 2024
9175f4b
Apply suggestions from code review
ngxson Jul 15, 2024
0ba23ba
change kv metadata
ngxson Jul 15, 2024
b1c4069
move add_type to __init__
ngxson Jul 15, 2024
4d9ac0f
Merge branch 'master' into xsn/fix_lora
ngxson Jul 15, 2024
d09382f
convert_hf : move add_type to main()
compilade Jul 15, 2024
383b6bc
Merge branch 'master' into xsn/fix_lora
ngxson Jul 15, 2024
11 changes: 3 additions & 8 deletions common/common.cpp
@@ -2080,19 +2080,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
-        int err = llama_model_apply_lora_from_file(model,
-                                             lora_adapter.c_str(),
-                                             lora_scale,
-                                             ((i > 0) || params.lora_base.empty())
-                                                ? NULL
-                                                : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
+        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
+        if (adapter == nullptr) {
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
             llama_free(lctx);
             llama_free_model(model);
             return std::make_tuple(nullptr, nullptr);
         }
+        llama_lora_adapter_set(lctx, adapter, lora_scale);
     }

     if (params.ignore_eos) {
145 changes: 145 additions & 0 deletions convert_lora_to_gguf.py
@@ -0,0 +1,145 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from __future__ import annotations

import logging
import argparse
import os
import sys
import types
from pathlib import Path
from typing import TYPE_CHECKING, Iterable, Iterator

import torch

if TYPE_CHECKING:
    from torch import Tensor

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf

# reuse model definitions from convert_hf_to_gguf.py
from convert_hf_to_gguf import Model

logger = logging.getLogger("lora-to-gguf")


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16",
        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
        help="model is executed on big endian machine",
    )
    parser.add_argument(
        "--verbose", action="store_true",
        help="increase output verbosity",
    )
    parser.add_argument(
        "--base", type=Path, required=True,
        help="directory containing base model file",
    )
    parser.add_argument(
        "lora_path", type=Path,
        help="directory containing LoRA adapter file",
    )

    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    ftype_map: dict[str, gguf.LlamaFileType] = {
        "f32": gguf.LlamaFileType.ALL_F32,
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
    }
    ftype = ftype_map[args.outtype]

    dir_base_model = args.base
    dir_lora = args.lora_path
    input_json = os.path.join(dir_lora, "adapter_config.json")
    input_model = os.path.join(dir_lora, "adapter_model.bin")
    if args.outfile is not None:
        fname_out = args.outfile
    else:
        # output in the same directory as the model by default
        fname_out = dir_lora / 'ggml-lora-{ftype}.gguf'

    if os.path.exists(input_model):
        lora_model = torch.load(input_model, map_location="cpu")
    else:
        input_model = os.path.join(dir_lora, "adapter_model.safetensors")
        # lazy import load_file only if lora is in safetensors format.
        from safetensors.torch import load_file
        lora_model = load_file(input_model, device="cpu")

    # load base model
    logger.info(f"Loading base model: {dir_base_model.name}")
    hparams = Model.load_hparams(dir_base_model)
    with torch.inference_mode():
        try:
            model_class = Model.from_model_architecture(hparams["architectures"][0])
        except NotImplementedError:
            logger.error(f"Model {hparams['architectures'][0]} is not supported")
            sys.exit(1)

        model_instance = model_class(dir_base_model, ftype, fname_out, args.bigendian, False, False, None)
        logger.info("Set model parameters")
        model_instance.set_gguf_parameters()

        # adapter_config = json.load(input_json)
        model_instance.gguf_writer.add_string("training.type", "finetune_lora")

        map_tensors: dict[str, Tensor] = {}
        for tensor_name, tensor in lora_model.items():
            orig_name = tensor_name.replace("base_model.model.", "")
            orig_name = orig_name.replace(".lora_A.weight", ".weight")
            orig_name = orig_name.replace(".lora_B.weight", ".weight")
            is_lora_a = ".lora_A.weight" in tensor_name
            is_lora_b = ".lora_B.weight" in tensor_name
            if not is_lora_a and not is_lora_b:
                logger.error(f"Unexpected name '{tensor_name}': Not a lora_A or lora_B tensor")
                sys.exit(1)
            dest_name = model_instance.map_tensor_name(orig_name)
            dest_name = f"{dest_name}.lora_a" if is_lora_a else f"{dest_name}.lora_b"
            # logger.info(f"{orig_name} --> {dest_name}")
            map_tensors[dest_name] = tensor

        # overwrite method
        def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
            for name, tensor in map_tensors.items():
                yield (name, tensor)

        # overwrite method
        def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
            del bid  # unused
            # TODO: This will not take into account tensor transformations
            return [(name, data_torch)]
Member:

This is quite a big TODO. It was the reason the previous script was removed. I think this needs to be handled on a model-by-model basis, rejecting models that are not explicitly supported; otherwise it will lead to the creation of incorrect GGUF LoRA files.

compilade (Collaborator), Jul 9, 2024:

modify_tensors does different things in different cases. Sometimes it filters out tensors, sometimes it splits them, sometimes it stacks them, sometimes it duplicates them, sometimes it permutes them, and sometimes it changes the values (like for Gemma's norms).

If the lora A and B weights could be turned into a tensor with the same shape as the weight tensor they're affecting, transformed with the original modify_tensors of the model_instance, and then split back into A and B, that would allow all supported models to work with this.

But I'm not sure if this is doable or not, or if the modify_tensors functions should instead have a lora mode or something like that.

Alternatively, there might be a way to make this work with some kind of LazyLoraTensor (although it might not be possible to use gguf.LazyBase as a base class because it assumes a more direct mapping of operations) by giving a fake tensor with the weight shape to the original modify_tensors of the model_instance, then doing some magic interceptions to gracefully handle shape and value transformations, to then output back the lora weights along with their names.
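
A rough numpy sketch of the merge-then-split idea described above, assuming PEFT-style shapes (lora_B is out×r, lora_A is r×in) and using a truncated SVD to recover rank-r factors after the transformation; the permutation here is only a stand-in for whatever modify_tensors would do:

import numpy as np

rank, n_out, n_in = 8, 64, 32
A = np.random.randn(rank, n_in).astype(np.float32)   # lora_A.weight: (r, in)
B = np.random.randn(n_out, rank).astype(np.float32)  # lora_B.weight: (out, r)

# 1. merge into a delta with the same shape as the base weight
delta = B @ A                                         # (out, in)

# 2. run it through the model's transformation (stand-in: a row permutation)
perm = np.random.permutation(n_out)
delta_t = delta[perm]

# 3. split back into rank-r factors with a truncated SVD
U, S, Vh = np.linalg.svd(delta_t, full_matrices=False)
B_new = U[:, :rank] * S[:rank]                        # (out, r)
A_new = Vh[:rank]                                     # (r, in)

assert np.allclose(B_new @ A_new, delta_t, atol=1e-3)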

ngxson (Collaborator, Author), Jul 9, 2024:

> If the lora A and B weights could be turned into a tensor with the same shape as the weight tensor they're affecting, transformed with the original modify_tensors of the model_instance, and then split back into A and B, that would allow all supported models to work with this.

I thought about this, but it would still be very tricky because not all transformations are the same. For example, MoE tensors are stacked into a 3D tensor, which totally changes the shape of the output tensor.

On the other hand, a simple matrix add or scale can be applied to only the lora_B tensor (while keeping lora_A untouched). For row permutation, I tried a simple numpy script and observed that LlamaModel.permute(B)*A == LlamaModel.permute(B*A), but I'm not sure whether that's a known mathematical property or just a coincidence.

> I think this needs to be handled on a model-by-model basis, rejecting models that are not explicitly supported; otherwise it will lead to the creation of incorrect GGUF LoRA files.

Now I fully understand why the old python script was removed. You're right; for now we should explicitly check whether a given model supports LoRA --> GGUF conversion. Probably by adding Model.support_lora(), and for now enabling it only for llama (without MoE, so no Mixtral).

ngxson (Collaborator, Author):

So I modified the script to re-use modify_tensors() from the original model class. I also added support_lora(), so models that do not override this method will not be able to convert LoRA.

Commit: ee2b35c
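
As a sketch of the opt-in idea (the real change is in ee2b35c; the snippet below only illustrates the shape of the hook and is not the actual convert_hf_to_gguf code):

class Model:
    def support_lora(self) -> bool:
        # models must opt in to LoRA conversion explicitly
        return False

class LlamaModel(Model):
    def support_lora(self) -> bool:
        return True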

ngxson (Collaborator, Author):

> sometimes it stacks them, sometimes it duplicates them, sometimes it permutes them, and sometimes it changes the values (like for Gemma's norms).

Btw, I'm trying to list out some of the transformations in modify_tensors that are easy to support (see the quick numpy check after this list). I'll have a look in the future:

  • We don't care if a transformation is applied to non-lora tensors like norm.weight ==> Gemma should be OK to support
  • LlamaModel.permute(B)*A == LlamaModel.permute(B*A), at least as observed in my tests (or maybe it's just a coincidence)
  • Scaling: BA*scale == (B*scale)*A
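
A quick numpy check of the last two points, assuming LlamaModel.permute amounts to a row permutation of the weight (in which case permute(B)·A == permute(B·A) follows from associativity of matrix multiplication):

import numpy as np

rank, n_out, n_in = 4, 16, 8
A = np.random.randn(rank, n_in)   # lora_A: (r, in)
B = np.random.randn(n_out, rank)  # lora_B: (out, r)
perm = np.random.permutation(n_out)

def permute_rows(x):
    # stand-in for LlamaModel.permute, assumed to be a fixed row permutation
    return x[perm]

# permuting B first or permuting the product gives the same result
assert np.allclose(permute_rows(B) @ A, permute_rows(B @ A))

# scaling commutes the same way: (B @ A) * scale == (B * scale) @ A
scale = 2.0
assert np.allclose((B @ A) * scale, (B * scale) @ A)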

compilade (Collaborator):

@ngxson Thanks a lot for this! I've converted the LoRA with my convert script, and I'm getting the same text as the sample output, but only when I scale the LoRA by 2; not sure if that's expected:

$ ./build/bin/llama-cli --log-disable -m /srv/LLMstash/tmp/stories_MOE-f16.gguf --temp 0 -p "Look in thy glass" -n 32
 Look in thy glass was a little girl. She was only three years old and she was three years old. She was three years old. She was three years old and she was
$ ./build/bin/llama-cli --log-disable -m /srv/LLMstash/tmp/stories_MOE-f16.gguf --temp 0 -p "Look in thy glass" -n 32 --lora /srv/LLMstash/tmp/lora_shakespeare_stories_MOE-f16.gguf
 Look in thy glass is a precious thing. They are so precious and can be held. They are made of glass, and they are not toys. They are made
$ ./build/bin/llama-cli --log-disable -m /srv/LLMstash/tmp/stories_MOE-f16.gguf --temp 0 -p "Look in thy glass" -n 128 --lora-scaled /srv/LLMstash/tmp/lora_shakespeare_stories_MOE-f16.gguf 2
 Look in thy glass in love of the eye:
That's when when the eye see thy on the sun's shining bright.
That's why those flowers look so bright,
But then thy in our glass's line,
Whose eye is red, then feasting on the other half,
When in their crows are red,
Whose eye is red, then on the so eye doth a side,
O in touching red thy a red eye doth a different,
And for my self I'll be a flower's sun's sun,
And for a red side

ngxson (Collaborator, Author):

@compilade great, thanks! The result looks correct.

> I scale the LoRA by 2

I fine-tuned with rank 64 and alpha 128, so scale = 128/64 = 2 is correct.

In fact, I should have taken the rank and alpha from adapter_config.json and saved them to the GGUF, so that on load we can automatically set the scale. I did not do that because, in theory, we could have a different rank for each tensor. But on second thought, in practice most fine-tuning frameworks use the same rank for all tensors, so my initial idea should not cause any problems.
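
A small sketch of what reading those values could look like, using the standard PEFT keys in adapter_config.json (the adapter path is hypothetical):

import json
from pathlib import Path

def default_lora_scale(lora_dir: Path) -> float:
    # "r" and "lora_alpha" are the standard PEFT keys in adapter_config.json
    cfg = json.loads((lora_dir / "adapter_config.json").read_text())
    return cfg["lora_alpha"] / cfg["r"]

# e.g. rank 64, alpha 128 -> 128 / 64 = 2.0, matching the --lora-scaled 2 run above
# default_lora_scale(Path("/path/to/lora_adapter"))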

compilade (Collaborator):

@ngxson I've finally pushed to your branch the modifications making convert_lora_to_gguf.py more general.

> In fact, I should have taken the rank and alpha from adapter_config.json and saved them to the GGUF, so that on load we can automatically set the scale.

I've added the alpha as training.lora.alpha to the GGUFs made by convert_lora_to_gguf.py in 8956543. I did not store the rank (yet) because it can be taken from the tensor dimensions, but this can still be changed relatively easily.

I've also made pretty much every model graph use llm_build_lora_mm instead of ggml_mul_mat where it made sense, in 87301bd. Otherwise, LoRA adapters for architectures other than llama were not really applied.
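
For reference, the effect of llm_build_lora_mm on a single matmul, written out in numpy (a sketch of the math only, not the ggml implementation):

import numpy as np

n_out, n_in, rank = 16, 8, 4
W = np.random.randn(n_out, n_in)  # base weight
A = np.random.randn(rank, n_in)   # lora_a
B = np.random.randn(n_out, rank)  # lora_b
x = np.random.randn(n_in)
scale = 2.0                       # e.g. alpha / rank

# base matmul plus the low-rank adapter contribution, applied at run time
y = W @ x + scale * (B @ (A @ x))

# equivalent to merging the adapter into the weight ahead of time
assert np.allclose(y, (W + scale * (B @ A)) @ x)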

ngxson (Collaborator, Author), Jul 15, 2024:

Perfect, thanks @compilade!

> I did not store the rank (yet) because it can be taken from the tensor dimensions, but this can still be changed relatively easily.

After thinking more about it, I think we can add a KV training.lora.scale to store the default scale value. For now it would be calculated as alpha/rank, and in the future, if users use another training framework, they can add their own logic to calculate the default scale (so it should be future-proof).
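
A hedged sketch of that proposal in the convert script ("training.lora.scale" is only the key name suggested here, not an existing gguf-py constant; adapter_config would come from reading adapter_config.json as above):

# sketch: derive the default scale from the PEFT config and store it as a KV
alpha = adapter_config["lora_alpha"]   # e.g. 128
rank = adapter_config["r"]             # e.g. 64
model_instance.gguf_writer.add_float32("training.lora.scale", alpha / rank)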

ngxson (Collaborator, Author):

Oh sorry, I didn't see your modification inside llm_build_lora_mm. Yeah, calculating the scale differently for each tensor is better. I'm reverting my change in 42415a4.


        # overwrite method
        def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
            del name, new_name, bid, n_dims  # unused
            return ftype != gguf.LlamaFileType.ALL_F32

        model_instance.get_tensors = types.MethodType(get_tensors, model_instance)
        model_instance.modify_tensors = types.MethodType(modify_tensors, model_instance)
        model_instance.extra_f16_tensors = types.MethodType(extra_f16_tensors, model_instance)

        model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
        logger.info("Exporting model...")
        model_instance.write()
        logger.info(f"Model successfully exported to {fname_out}")
4 changes: 2 additions & 2 deletions ggml/src/ggml.c
@@ -19339,7 +19339,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph

     fprintf(fp, "digraph G {\n");
     fprintf(fp, "  newrank = true;\n");
-    fprintf(fp, "  rankdir = LR;\n");
+    fprintf(fp, "  rankdir = TB;\n");

     for (int i = 0; i < gb->n_nodes; i++) {
         struct ggml_tensor * node = gb->nodes[i];
@@ -19401,7 +19401,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
         }

         fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
-        if (ggml_nelements(node) < 5) {
+        if (ggml_nelements(node) < 5 && node->data != NULL) {
            fprintf(fp, " | (");
            for (int j = 0; j < ggml_nelements(node); j++) {
                if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
37 changes: 25 additions & 12 deletions include/llama.h
@@ -408,6 +408,9 @@ extern "C" {
         const char * content;
     } llama_chat_message;

+    // lora adapter
+    struct llama_lora_adapter;
+
     // Helpers for getting default parameters
     LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -507,18 +510,28 @@ extern "C" {
             const char * fname_out,
             const llama_model_quantize_params * params);

-    // Apply a LoRA adapter to a loaded model
-    // path_base_model is the path to a higher quality model to use as a base for
-    // the layers modified by the adapter. Can be NULL to use the current loaded model.
-    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
-    // will be applied on top of the previous one
-    // Returns 0 on success
-    LLAMA_API int32_t llama_model_apply_lora_from_file(
-            const struct llama_model * model,
-            const char * path_lora,
-            float scale,
-            const char * path_base_model,
-            int32_t n_threads);
+    // Load a LoRA adapter from file
+    // The loaded adapter will be associated to the given model, and will be free when the model is deleted
+    LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
+            struct llama_model * model,
+            const char * path_lora);
+
+    // Add a loaded LoRA adapter to given context
+    // This will not modify model's weight
+    LLAMA_API int32_t llama_lora_adapter_set(
+            struct llama_context * ctx,
+            struct llama_lora_adapter * adapter,
+            float scale);
+
+    // Remove a LoRA adapter from given context
+    // Return -1 if the adapter is not present in the context
+    LLAMA_API int32_t llama_lora_adapter_remove(
+            struct llama_context * ctx,
+            struct llama_lora_adapter * adapter);
+
+    // Manually free a LoRA adapter
+    // Note: loaded adapters will be free when the associated model is deleted
+    LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);

     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
     // the currently loaded vector.
1 change: 1 addition & 0 deletions requirements.txt
@@ -9,3 +9,4 @@
 -r ./requirements/requirements-convert_hf_to_gguf.txt
 -r ./requirements/requirements-convert_hf_to_gguf_update.txt
 -r ./requirements/requirements-convert_llama_ggml_to_gguf.txt
+-r ./requirements/requirements-convert_lora_to_gguf.txt
2 changes: 2 additions & 0 deletions requirements/requirements-convert_lora_to_gguf.txt
@@ -0,0 +1,2 @@
-r ./requirements-convert_hf_to_gguf.txt
--extra-index-url https://download.pytorch.org/whl/cpu