feat: sync whisper.cpp (#186)
jhen0409 authored Jan 24, 2024
1 parent a51363b commit 7109406
Showing 22 changed files with 7,291 additions and 3,593 deletions.
2 changes: 1 addition & 1 deletion cpp/coreml/whisper-encoder.mm
@@ -24,7 +24,7 @@

// select which device to run the Core ML model on
MLModelConfiguration *config = [[MLModelConfiguration alloc] init];
//config.computeUnits = MLComputeUnitsCPUAndGPU;
// config.computeUnits = MLComputeUnitsCPUAndGPU;
//config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;
config.computeUnits = MLComputeUnitsAll;

52 changes: 41 additions & 11 deletions cpp/ggml-alloc.c
@@ -72,7 +72,7 @@ static void remove_allocated_tensor(wsp_ggml_tallocr_t alloc, struct wsp_ggml_te

// check if a tensor is allocated by this buffer
static bool wsp_ggml_tallocr_is_own(wsp_ggml_tallocr_t alloc, const struct wsp_ggml_tensor * tensor) {
return tensor->buffer == alloc->buffer;
return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
}

static bool wsp_ggml_is_view(struct wsp_ggml_tensor * t) {
@@ -102,8 +102,6 @@ void wsp_ggml_tallocr_alloc(wsp_ggml_tallocr_t alloc, struct wsp_ggml_tensor * t
}
}

AT_PRINTF("block %d\n", best_fit_block);

if (best_fit_block == -1) {
// the last block is our last resort
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
@@ -117,6 +115,7 @@ void wsp_ggml_tallocr_alloc(wsp_ggml_tallocr_t alloc, struct wsp_ggml_tensor * t
return;
}
}

struct free_block * block = &alloc->free_blocks[best_fit_block];
void * addr = block->addr;
block->addr = (char*)block->addr + size;
@@ -129,6 +128,8 @@ void wsp_ggml_tallocr_alloc(wsp_ggml_tallocr_t alloc, struct wsp_ggml_tensor * t
}
}

AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);

tensor->data = addr;
tensor->buffer = alloc->buffer;
if (!alloc->measure) {
@@ -229,6 +230,7 @@ void wsp_ggml_tallocr_reset(wsp_ggml_tallocr_t alloc) {
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
} else {
alloc->free_blocks[0].size = wsp_ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
wsp_ggml_backend_buffer_reset(alloc->buffer);
}
}

@@ -263,9 +265,9 @@ wsp_ggml_tallocr_t wsp_ggml_tallocr_new_measure(size_t alignment) {
return alloc;
}

wsp_ggml_tallocr_t wsp_ggml_tallocr_new_measure_from_backend(struct wsp_ggml_backend * backend) {
wsp_ggml_tallocr_t wsp_ggml_tallocr_new_measure_from_buft(struct wsp_ggml_backend_buffer_type * buft) {
// create a backend buffer to get the correct tensor allocation sizes
wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_alloc_buffer(backend, 1);
wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_buft_alloc_buffer(buft, 1);

// TODO: move alloc initialization to a common wsp_ggml_tallocr_new_impl function
wsp_ggml_tallocr_t alloc = wsp_ggml_tallocr_new_from_buffer(buffer);
@@ -275,13 +277,22 @@ wsp_ggml_tallocr_t wsp_ggml_tallocr_new_measure_from_backend(struct wsp_ggml_bac
return alloc;
}

wsp_ggml_tallocr_t wsp_ggml_tallocr_new_from_backend(struct wsp_ggml_backend * backend, size_t size) {
wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_alloc_buffer(backend, size);
wsp_ggml_tallocr_t wsp_ggml_tallocr_new_measure_from_backend(struct wsp_ggml_backend * backend) {
return wsp_ggml_tallocr_new_measure_from_buft(wsp_ggml_backend_get_default_buffer_type(backend));
}

wsp_ggml_tallocr_t wsp_ggml_tallocr_new_from_buft(struct wsp_ggml_backend_buffer_type * buft, size_t size) {
// create a backend buffer to get the correct tensor allocation sizes
wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_buft_alloc_buffer(buft, size);
wsp_ggml_tallocr_t alloc = wsp_ggml_tallocr_new_from_buffer(buffer);
alloc->buffer_owned = true;
return alloc;
}

wsp_ggml_tallocr_t wsp_ggml_tallocr_new_from_backend(struct wsp_ggml_backend * backend, size_t size) {
return wsp_ggml_tallocr_new_from_buft(wsp_ggml_backend_get_default_buffer_type(backend), size);
}

wsp_ggml_tallocr_t wsp_ggml_tallocr_new_from_buffer(struct wsp_ggml_backend_buffer * buffer) {
wsp_ggml_tallocr_t alloc = (wsp_ggml_tallocr_t)malloc(sizeof(struct wsp_ggml_tallocr));

@@ -449,11 +460,10 @@ static void init_view(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * view,
if (update_backend) {
view->backend = view->view_src->backend;
}
view->buffer = view->view_src->buffer;
// views are initialized in the alloc buffer rather than the view_src buffer
view->buffer = alloc->buffer;
view->data = (char *)view->view_src->data + view->view_offs;

// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
// due to the wsp_ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
assert(wsp_ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);

if (!alloc->measure) {
@@ -736,6 +746,10 @@ void wsp_ggml_allocr_set_parse_seq(wsp_ggml_allocr_t alloc, const int * list, in
}

void wsp_ggml_allocr_free(wsp_ggml_allocr_t alloc) {
if (alloc == NULL) {
return;
}

wsp_ggml_gallocr_free(alloc->galloc);
wsp_ggml_tallocr_free(alloc->talloc);
free(alloc);
@@ -775,11 +789,22 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_ctx_tensors_from_buft(struct ws
}

if (nbytes == 0) {
fprintf(stderr, "%s: no tensors to allocate\n", __func__);
// all the tensors in the context are already allocated
#ifndef NDEBUG
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}

wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_buft_alloc_buffer(buft, nbytes);
if (buffer == NULL) {
// failed to allocate buffer
#ifndef NDEBUG
fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
#endif
return NULL;
}

wsp_ggml_tallocr_t tallocr = wsp_ggml_tallocr_new_from_buffer(buffer);

for (struct wsp_ggml_tensor * t = wsp_ggml_get_first_tensor(ctx); t != NULL; t = wsp_ggml_get_next_tensor(ctx, t)) {
Expand All @@ -789,6 +814,11 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_ctx_tensors_from_buft(struct ws
} else {
wsp_ggml_backend_view_init(buffer, t);
}
} else {
if (t->view_src != NULL) {
// view of a pre-allocated tensor
wsp_ggml_backend_view_init(buffer, t);
}
}
}

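For reference, the reworked wsp_ggml_backend_alloc_ctx_tensors_from_buft above now returns NULL both when every tensor in the context is already allocated and when the buffer allocation fails, and it also initializes views of pre-allocated tensors. Below is a minimal sketch of how it might be called; the header names and the wsp_ggml_init / wsp_ggml_new_tensor_1d / wsp_ggml_tensor_overhead / wsp_ggml_backend_cpu_init / WSP_GGML_TYPE_F32 identifiers are not part of this diff and are assumed to follow this repo's prefixed mirror of upstream ggml.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// sketch: allocate all tensors of a no_alloc context in a single backend buffer
static wsp_ggml_backend_buffer_t alloc_ctx_example(void) {
    struct wsp_ggml_init_params params = {
        /* .mem_size   = */ 8 * wsp_ggml_tensor_overhead(), // metadata only, no tensor data
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true,                           // data will live in the backend buffer
    };
    struct wsp_ggml_context * ctx = wsp_ggml_init(params);

    // a couple of tensors; their data pointers stay NULL until the buffer is allocated
    wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 1024);
    wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 2048);

    wsp_ggml_backend_t backend = wsp_ggml_backend_cpu_init();
    wsp_ggml_backend_buffer_t buf = wsp_ggml_backend_alloc_ctx_tensors_from_buft(
        ctx, wsp_ggml_backend_get_default_buffer_type(backend));

    if (buf == NULL) {
        // per the change above: either everything was already allocated,
        // or the backend failed to allocate the buffer
    }

    // cleanup of ctx/backend/buf is omitted for brevity
    return buf;
}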
4 changes: 3 additions & 1 deletion cpp/ggml-alloc.h
@@ -52,8 +52,10 @@ typedef struct wsp_ggml_tallocr * wsp_ggml_tallocr_t;

WSP_GGML_API wsp_ggml_tallocr_t wsp_ggml_tallocr_new(void * data, size_t size, size_t alignment);
WSP_GGML_API wsp_ggml_tallocr_t wsp_ggml_tallocr_new_measure(size_t alignment);
WSP_GGML_API wsp_ggml_tallocr_t wsp_ggml_tallocr_new_from_buffer(struct wsp_ggml_backend_buffer * buffer);
WSP_GGML_API wsp_ggml_tallocr_t wsp_ggml_tallocr_new_from_buft(struct wsp_ggml_backend_buffer_type * buft, size_t size);
WSP_GGML_API wsp_ggml_tallocr_t wsp_ggml_tallocr_new_from_backend(struct wsp_ggml_backend * backend, size_t size); // allocates an owned buffer
WSP_GGML_API wsp_ggml_tallocr_t wsp_ggml_tallocr_new_from_buffer(struct wsp_ggml_backend_buffer * buffer);
WSP_GGML_API wsp_ggml_tallocr_t wsp_ggml_tallocr_new_measure_from_buft(struct wsp_ggml_backend_buffer_type * buft);
WSP_GGML_API wsp_ggml_tallocr_t wsp_ggml_tallocr_new_measure_from_backend(struct wsp_ggml_backend * backend);

WSP_GGML_API struct wsp_ggml_backend_buffer * wsp_ggml_tallocr_get_buffer(wsp_ggml_tallocr_t talloc);
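The buffer-type variants added to this header make it possible to create a tensor allocator without instantiating a backend first. A rough usage sketch follows; tensor_a and tensor_b stand for tensors created in a no_alloc context (the allocator expects tensors without data), and wsp_ggml_tallocr_alloc / wsp_ggml_tallocr_free are assumed to keep their upstream ggml signatures since they are not shown in this hunk.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// sketch: place two tensors into an allocator that owns a buffer of the
// backend's default buffer type (buffer_owned = true in the .c change above)
static wsp_ggml_tallocr_t place_tensors_example(wsp_ggml_backend_t backend,
                                                struct wsp_ggml_tensor * tensor_a,
                                                struct wsp_ggml_tensor * tensor_b) {
    wsp_ggml_backend_buffer_type_t buft = wsp_ggml_backend_get_default_buffer_type(backend);

    // the sized constructor allocates and owns a 16 MiB buffer of this buffer type
    wsp_ggml_tallocr_t talloc = wsp_ggml_tallocr_new_from_buft(buft, 16*1024*1024);

    wsp_ggml_tallocr_alloc(talloc, tensor_a); // assigns tensor_a->data / ->buffer inside the owned buffer
    wsp_ggml_tallocr_alloc(talloc, tensor_b);

    // the caller keeps the allocator alive while the tensors are in use;
    // wsp_ggml_tallocr_free would release the owned buffer as well
    return talloc;
}

The measure variant wsp_ggml_tallocr_new_measure_from_buft follows the same pattern but only records sizes, and wsp_ggml_tallocr_new_measure_from_backend now simply delegates to it, as shown in the ggml-alloc.c hunk above.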
72 changes: 38 additions & 34 deletions cpp/ggml-backend-impl.h
@@ -16,10 +16,14 @@ extern "C" {
typedef void * wsp_ggml_backend_buffer_type_context_t;

struct wsp_ggml_backend_buffer_type_i {
wsp_ggml_backend_buffer_t (*alloc_buffer) (wsp_ggml_backend_buffer_type_t buft, size_t size);
size_t (*get_alignment) (wsp_ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*get_alloc_size) (wsp_ggml_backend_buffer_type_t buft, struct wsp_ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*supports_backend)(wsp_ggml_backend_buffer_type_t buft, wsp_ggml_backend_t backend); // check if the buffer type is usable by the backend
const char * (*WSP_GGML_CALL get_name) (wsp_ggml_backend_buffer_type_t buft);
wsp_ggml_backend_buffer_t (*WSP_GGML_CALL alloc_buffer) (wsp_ggml_backend_buffer_type_t buft, size_t size);
size_t (*WSP_GGML_CALL get_alignment) (wsp_ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*WSP_GGML_CALL get_alloc_size) (wsp_ggml_backend_buffer_type_t buft, const struct wsp_ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*WSP_GGML_CALL supports_backend)(wsp_ggml_backend_buffer_type_t buft, wsp_ggml_backend_t backend); // check if the buffer type is usable by the backend
// check if tensor data is in host memory
// should be equivalent to supports_backend(buft, wsp_ggml_backend_cpu_init())
bool (*WSP_GGML_CALL is_host) (wsp_ggml_backend_buffer_type_t buft);
};

struct wsp_ggml_backend_buffer_type {
@@ -31,30 +35,33 @@ extern "C" {
typedef void * wsp_ggml_backend_buffer_context_t;

struct wsp_ggml_backend_buffer_i {
void (*free_buffer)(wsp_ggml_backend_buffer_t buffer);
//void (*reset) (wsp_ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
void * (*get_base) (wsp_ggml_backend_buffer_t buffer);
void (*init_tensor)(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor);
void (*set_tensor) (wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor) (wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size);
// (optional) copy tensor between different buffer-type, allow for single-copy tranfers
void (*cpy_tensor_from)(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);
void (*cpy_tensor_to) (wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);
const char * (*WSP_GGML_CALL get_name) (wsp_ggml_backend_buffer_t buffer);
void (*WSP_GGML_CALL free_buffer)(wsp_ggml_backend_buffer_t buffer);
void * (*WSP_GGML_CALL get_base) (wsp_ggml_backend_buffer_t buffer);
void (*WSP_GGML_CALL init_tensor)(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor);
void (*WSP_GGML_CALL set_tensor) (wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*WSP_GGML_CALL get_tensor) (wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*WSP_GGML_CALL cpy_tensor) (wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
void (*WSP_GGML_CALL clear) (wsp_ggml_backend_buffer_t buffer, uint8_t value);
void (*WSP_GGML_CALL reset) (wsp_ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
};

struct wsp_ggml_backend_buffer {
struct wsp_ggml_backend_buffer_i iface;
wsp_ggml_backend_buffer_type_t buft;
wsp_ggml_backend_buffer_context_t context;
size_t size;
enum wsp_ggml_backend_buffer_usage usage;
};

wsp_ggml_backend_buffer_t wsp_ggml_backend_buffer_init(
WSP_GGML_CALL wsp_ggml_backend_buffer_t wsp_ggml_backend_buffer_init(
wsp_ggml_backend_buffer_type_t buft,
struct wsp_ggml_backend_buffer_i iface,
wsp_ggml_backend_buffer_context_t context,
size_t size);

// do not use directly, use wsp_ggml_backend_tensor_copy instead
bool wsp_ggml_backend_buffer_copy_tensor(const struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);

//
// Backend
@@ -63,33 +70,31 @@ extern "C" {
typedef void * wsp_ggml_backend_context_t;

struct wsp_ggml_backend_i {
const char * (*get_name)(wsp_ggml_backend_t backend);
const char * (*WSP_GGML_CALL get_name)(wsp_ggml_backend_t backend);

void (*free)(wsp_ggml_backend_t backend);
void (*WSP_GGML_CALL free)(wsp_ggml_backend_t backend);

// buffer allocation
wsp_ggml_backend_buffer_type_t (*get_default_buffer_type)(wsp_ggml_backend_t backend);
wsp_ggml_backend_buffer_type_t (*WSP_GGML_CALL get_default_buffer_type)(wsp_ggml_backend_t backend);

// (optional) asynchroneous tensor data access
void (*set_tensor_async)(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size);
// (optional) asynchronous tensor data access
void (*WSP_GGML_CALL set_tensor_async)(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*WSP_GGML_CALL get_tensor_async)(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*WSP_GGML_CALL cpy_tensor_async)(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);

// (optional) asynchroneous tensor copy
void (*cpy_tensor_from_async)(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);
void (*cpy_tensor_to_async) (wsp_ggml_backend_t backend, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);

void (*synchronize) (wsp_ggml_backend_t backend);
// (optional) complete all pending operations
void (*WSP_GGML_CALL synchronize)(wsp_ggml_backend_t backend);

// compute graph with a plan
wsp_ggml_backend_graph_plan_t (*graph_plan_create) (wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph);
void (*graph_plan_free) (wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan);
void (*graph_plan_compute)(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan);
wsp_ggml_backend_graph_plan_t (*WSP_GGML_CALL graph_plan_create) (wsp_ggml_backend_t backend, const struct wsp_ggml_cgraph * cgraph);
void (*WSP_GGML_CALL graph_plan_free) (wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan);
void (*WSP_GGML_CALL graph_plan_compute)(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan);

// compute graph without a plan
void (*graph_compute)(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph);
// compute graph without a plan (async)
bool (*WSP_GGML_CALL graph_compute)(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph);

// check if the backend supports an operation
bool (*supports_op)(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op);
bool (*WSP_GGML_CALL supports_op)(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op);
};

struct wsp_ggml_backend {
@@ -98,14 +103,13 @@ extern "C" {
wsp_ggml_backend_context_t context;
};


//
// Backend registry
//

typedef wsp_ggml_backend_t (*wsp_ggml_backend_init_fn)(const char * params, void * user_data);
typedef wsp_ggml_backend_t (*WSP_GGML_CALL wsp_ggml_backend_init_fn)(const char * params, void * user_data);

void wsp_ggml_backend_register(const char * name, wsp_ggml_backend_init_fn init_fn, wsp_ggml_backend_buffer_type_t default_buffer_type, void * user_data);
WSP_GGML_CALL void wsp_ggml_backend_register(const char * name, wsp_ggml_backend_init_fn init_fn, wsp_ggml_backend_buffer_type_t default_buffer_type, void * user_data);

#ifdef __cplusplus
}
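Since the buffer interface above gained get_name, a bool-returning cpy_tensor, clear and reset, and every callback is now annotated with WSP_GGML_CALL, a custom buffer has more to fill in. The following is a toy host-memory sketch of the updated struct wsp_ggml_backend_buffer_i, not code from this commit: wsp_ggml_nbytes and wsp_ggml_backend_buffer_is_host are assumed to exist as the prefixed counterparts of upstream ggml helpers, leaving init_tensor and reset as NULL assumes callers treat them as optional (as the upstream CPU buffer does), and the WSP_GGML_CALL placement follows the upstream convention.

#include <stdlib.h>
#include <string.h>

#include "ggml-backend-impl.h"

// toy buffer: the buffer context is simply a malloc'd block of host memory

WSP_GGML_CALL static const char * toy_buffer_get_name(wsp_ggml_backend_buffer_t buffer) {
    (void) buffer;
    return "TOY";
}

WSP_GGML_CALL static void toy_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
    free(buffer->context);
}

WSP_GGML_CALL static void * toy_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
    return buffer->context;
}

WSP_GGML_CALL static void toy_buffer_set_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor,
                                                const void * data, size_t offset, size_t size) {
    (void) buffer;
    memcpy((char *) tensor->data + offset, data, size);
}

WSP_GGML_CALL static void toy_buffer_get_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * tensor,
                                                void * data, size_t offset, size_t size) {
    (void) buffer;
    memcpy(data, (const char *) tensor->data + offset, size);
}

WSP_GGML_CALL static bool toy_buffer_cpy_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * src,
                                                struct wsp_ggml_tensor * dst) {
    (void) buffer;
    if (wsp_ggml_backend_buffer_is_host(src->buffer)) { // dst is in this buffer, src may be anywhere
        memcpy(dst->data, src->data, wsp_ggml_nbytes(src));
        return true;
    }
    return false; // not handled here; the caller falls back to a staged copy
}

WSP_GGML_CALL static void toy_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
    memset(buffer->context, value, buffer->size);
}

static struct wsp_ggml_backend_buffer_i toy_buffer_interface = {
    /* .get_name    = */ toy_buffer_get_name,
    /* .free_buffer = */ toy_buffer_free_buffer,
    /* .get_base    = */ toy_buffer_get_base,
    /* .init_tensor = */ NULL, // nothing to do for plain host memory
    /* .set_tensor  = */ toy_buffer_set_tensor,
    /* .get_tensor  = */ toy_buffer_get_tensor,
    /* .cpy_tensor  = */ toy_buffer_cpy_tensor,
    /* .clear       = */ toy_buffer_clear,
    /* .reset       = */ NULL, // no extra state to reset
};

A buffer type's alloc_buffer callback would then wrap a fresh allocation with wsp_ggml_backend_buffer_init(buft, toy_buffer_interface, data, size), the constructor declared in this header.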
(diff for the remaining 18 changed files not loaded)
