feat: sync whisper.cpp (#186)
jhen0409 authored Jan 24, 2024
1 parent a51363b commit 7109406
Showing 22 changed files with 7,291 additions and 3,593 deletions.
2 changes: 1 addition & 1 deletion cpp/coreml/whisper-encoder.mm
@@ -24,7 +24,7 @@

// select which device to run the Core ML model on
MLModelConfiguration *config = [[MLModelConfiguration alloc] init];
//config.computeUnits = MLComputeUnitsCPUAndGPU;
// config.computeUnits = MLComputeUnitsCPUAndGPU;
//config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;
config.computeUnits = MLComputeUnitsAll;

52 changes: 41 additions & 11 deletions cpp/ggml-alloc.c
@@ -72,7 +72,7 @@ static void remove_allocated_tensor(wsp_ggml_tallocr_t alloc, struct wsp_ggml_te

// check if a tensor is allocated by this buffer
static bool wsp_ggml_tallocr_is_own(wsp_ggml_tallocr_t alloc, const struct wsp_ggml_tensor * tensor) {
return tensor->buffer == alloc->buffer;
return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
}

static bool wsp_ggml_is_view(struct wsp_ggml_tensor * t) {
@@ -102,8 +102,6 @@ void wsp_ggml_tallocr_alloc(wsp_ggml_tallocr_t alloc, struct wsp_ggml_tensor * t
}
}

AT_PRINTF("block %d\n", best_fit_block);

if (best_fit_block == -1) {
// the last block is our last resort
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
@@ -117,6 +115,7 @@ void wsp_ggml_tallocr_alloc(wsp_ggml_tallocr_t alloc, struct wsp_ggml_tensor * t
return;
}
}

struct free_block * block = &alloc->free_blocks[best_fit_block];
void * addr = block->addr;
block->addr = (char*)block->addr + size;
@@ -129,6 +128,8 @@ void wsp_ggml_tallocr_alloc(wsp_ggml_tallocr_t alloc, struct wsp_ggml_tensor * t
}
}

AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);

tensor->data = addr;
tensor->buffer = alloc->buffer;
if (!alloc->measure) {
@@ -229,6 +230,7 @@ void wsp_ggml_tallocr_reset(wsp_ggml_tallocr_t alloc) {
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
} else {
alloc->free_blocks[0].size = wsp_ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
wsp_ggml_backend_buffer_reset(alloc->buffer);
}
}

@@ -263,9 +265,9 @@ wsp_ggml_tallocr_t wsp_ggml_tallocr_new_measure(size_t alignment) {
return alloc;
}

wsp_ggml_tallocr_t wsp_ggml_tallocr_new_measure_from_backend(struct wsp_ggml_backend * backend) {
wsp_ggml_tallocr_t wsp_ggml_tallocr_new_measure_from_buft(struct wsp_ggml_backend_buffer_type * buft) {
// create a backend buffer to get the correct tensor allocation sizes
wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_alloc_buffer(backend, 1);
wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_buft_alloc_buffer(buft, 1);

// TODO: move alloc initialization to a common wsp_ggml_tallocr_new_impl function
wsp_ggml_tallocr_t alloc = wsp_ggml_tallocr_new_from_buffer(buffer);
@@ -275,13 +277,22 @@ wsp_ggml_tallocr_t wsp_ggml_tallocr_new_measure_from_backend(struct wsp_ggml_bac
return alloc;
}

wsp_ggml_tallocr_t wsp_ggml_tallocr_new_from_backend(struct wsp_ggml_backend * backend, size_t size) {
wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_alloc_buffer(backend, size);
wsp_ggml_tallocr_t wsp_ggml_tallocr_new_measure_from_backend(struct wsp_ggml_backend * backend) {
return wsp_ggml_tallocr_new_measure_from_buft(wsp_ggml_backend_get_default_buffer_type(backend));
}

wsp_ggml_tallocr_t wsp_ggml_tallocr_new_from_buft(struct wsp_ggml_backend_buffer_type * buft, size_t size) {
// create a backend buffer to get the correct tensor allocation sizes
wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_buft_alloc_buffer(buft, size);
wsp_ggml_tallocr_t alloc = wsp_ggml_tallocr_new_from_buffer(buffer);
alloc->buffer_owned = true;
return alloc;
}

wsp_ggml_tallocr_t wsp_ggml_tallocr_new_from_backend(struct wsp_ggml_backend * backend, size_t size) {
return wsp_ggml_tallocr_new_from_buft(wsp_ggml_backend_get_default_buffer_type(backend), size);
}

wsp_ggml_tallocr_t wsp_ggml_tallocr_new_from_buffer(struct wsp_ggml_backend_buffer * buffer) {
wsp_ggml_tallocr_t alloc = (wsp_ggml_tallocr_t)malloc(sizeof(struct wsp_ggml_tallocr));

@@ -449,11 +460,10 @@ static void init_view(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * view,
if (update_backend) {
view->backend = view->view_src->backend;
}
view->buffer = view->view_src->buffer;
// views are initialized in the alloc buffer rather than the view_src buffer
view->buffer = alloc->buffer;
view->data = (char *)view->view_src->data + view->view_offs;

// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
// due to the wsp_ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
assert(wsp_ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);

if (!alloc->measure) {
@@ -736,6 +746,10 @@ void wsp_ggml_allocr_set_parse_seq(wsp_ggml_allocr_t alloc, const int * list, in
}

void wsp_ggml_allocr_free(wsp_ggml_allocr_t alloc) {
if (alloc == NULL) {
return;
}

wsp_ggml_gallocr_free(alloc->galloc);
wsp_ggml_tallocr_free(alloc->talloc);
free(alloc);
@@ -775,11 +789,22 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_ctx_tensors_from_buft(struct ws
}

if (nbytes == 0) {
fprintf(stderr, "%s: no tensors to allocate\n", __func__);
// all the tensors in the context are already allocated
#ifndef NDEBUG
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}

wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_buft_alloc_buffer(buft, nbytes);
if (buffer == NULL) {
// failed to allocate buffer
#ifndef NDEBUG
fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
#endif
return NULL;
}

wsp_ggml_tallocr_t tallocr = wsp_ggml_tallocr_new_from_buffer(buffer);

for (struct wsp_ggml_tensor * t = wsp_ggml_get_first_tensor(ctx); t != NULL; t = wsp_ggml_get_next_tensor(ctx, t)) {
Expand All @@ -789,6 +814,11 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_ctx_tensors_from_buft(struct ws
} else {
wsp_ggml_backend_view_init(buffer, t);
}
} else {
if (t->view_src != NULL) {
// view of a pre-allocated tensor
wsp_ggml_backend_view_init(buffer, t);
}
}
}

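For reference, the reworked wsp_ggml_backend_alloc_ctx_tensors_from_buft above now returns NULL both when every tensor in the context is already allocated and when the buffer allocation fails, and it also initializes views of pre-allocated tensors. Below is a minimal sketch of how it might be called; the header names and the wsp_ggml_init / wsp_ggml_new_tensor_1d / wsp_ggml_tensor_overhead / wsp_ggml_backend_cpu_init / WSP_GGML_TYPE_F32 identifiers are not part of this diff and are assumed to follow this repo's prefixed mirror of upstream ggml.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// sketch: allocate all tensors of a no_alloc context in a single backend buffer
static wsp_ggml_backend_buffer_t alloc_ctx_example(void) {
    struct wsp_ggml_init_params params = {
        /* .mem_size   = */ 8 * wsp_ggml_tensor_overhead(), // metadata only, no tensor data
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true,                           // data will live in the backend buffer
    };
    struct wsp_ggml_context * ctx = wsp_ggml_init(params);

    // a couple of tensors; their data pointers stay NULL until the buffer is allocated
    wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 1024);
    wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 2048);

    wsp_ggml_backend_t backend = wsp_ggml_backend_cpu_init();
    wsp_ggml_backend_buffer_t buf = wsp_ggml_backend_alloc_ctx_tensors_from_buft(
        ctx, wsp_ggml_backend_get_default_buffer_type(backend));

    if (buf == NULL) {
        // per the change above: either everything was already allocated,
        // or the backend failed to allocate the buffer
    }

    // cleanup of ctx/backend/buf is omitted for brevity
    return buf;
}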
4 changes: 3 additions & 1 deletion cpp/ggml-alloc.h
@@ -52,8 +52,10 @@ typedef struct wsp_ggml_tallocr * wsp_ggml_tallocr_t;

WSP_GGML_API wsp_ggml_tallocr_t wsp_ggml_tallocr_new(void * data, size_t size, size_t alignment);
WSP_GGML_API wsp_ggml_tallocr_t wsp_ggml_tallocr_new_measure(size_t alignment);
WSP_GGML_API wsp_ggml_tallocr_t wsp_ggml_tallocr_new_from_buffer(struct wsp_ggml_backend_buffer * buffer);
WSP_GGML_API wsp_ggml_tallocr_t wsp_ggml_tallocr_new_from_buft(struct wsp_ggml_backend_buffer_type * buft, size_t size);
WSP_GGML_API wsp_ggml_tallocr_t wsp_ggml_tallocr_new_from_backend(struct wsp_ggml_backend * backend, size_t size); // allocates an owned buffer
WSP_GGML_API wsp_ggml_tallocr_t wsp_ggml_tallocr_new_from_buffer(struct wsp_ggml_backend_buffer * buffer);
WSP_GGML_API wsp_ggml_tallocr_t wsp_ggml_tallocr_new_measure_from_buft(struct wsp_ggml_backend_buffer_type * buft);
WSP_GGML_API wsp_ggml_tallocr_t wsp_ggml_tallocr_new_measure_from_backend(struct wsp_ggml_backend * backend);

WSP_GGML_API struct wsp_ggml_backend_buffer * wsp_ggml_tallocr_get_buffer(wsp_ggml_tallocr_t talloc);
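The buffer-type variants added to this header make it possible to create a tensor allocator without instantiating a backend first. A rough usage sketch follows; tensor_a and tensor_b stand for tensors created in a no_alloc context (the allocator expects tensors without data), and wsp_ggml_tallocr_alloc / wsp_ggml_tallocr_free are assumed to keep their upstream ggml signatures since they are not shown in this hunk.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// sketch: place two tensors into an allocator that owns a buffer of the
// backend's default buffer type (buffer_owned = true in the .c change above)
static wsp_ggml_tallocr_t place_tensors_example(wsp_ggml_backend_t backend,
                                                struct wsp_ggml_tensor * tensor_a,
                                                struct wsp_ggml_tensor * tensor_b) {
    wsp_ggml_backend_buffer_type_t buft = wsp_ggml_backend_get_default_buffer_type(backend);

    // the sized constructor allocates and owns a 16 MiB buffer of this buffer type
    wsp_ggml_tallocr_t talloc = wsp_ggml_tallocr_new_from_buft(buft, 16*1024*1024);

    wsp_ggml_tallocr_alloc(talloc, tensor_a); // assigns tensor_a->data / ->buffer inside the owned buffer
    wsp_ggml_tallocr_alloc(talloc, tensor_b);

    // the caller keeps the allocator alive while the tensors are in use;
    // wsp_ggml_tallocr_free would release the owned buffer as well
    return talloc;
}

The measure variant wsp_ggml_tallocr_new_measure_from_buft follows the same pattern but only records sizes, and wsp_ggml_tallocr_new_measure_from_backend now simply delegates to it, as shown in the ggml-alloc.c hunk above.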
72 changes: 38 additions & 34 deletions cpp/ggml-backend-impl.h
@@ -16,10 +16,14 @@ extern "C" {
typedef void * wsp_ggml_backend_buffer_type_context_t;

struct wsp_ggml_backend_buffer_type_i {
wsp_ggml_backend_buffer_t (*alloc_buffer) (wsp_ggml_backend_buffer_type_t buft, size_t size);
size_t (*get_alignment) (wsp_ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*get_alloc_size) (wsp_ggml_backend_buffer_type_t buft, struct wsp_ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*supports_backend)(wsp_ggml_backend_buffer_type_t buft, wsp_ggml_backend_t backend); // check if the buffer type is usable by the backend
const char * (*WSP_GGML_CALL get_name) (wsp_ggml_backend_buffer_type_t buft);
wsp_ggml_backend_buffer_t (*WSP_GGML_CALL alloc_buffer) (wsp_ggml_backend_buffer_type_t buft, size_t size);
size_t (*WSP_GGML_CALL get_alignment) (wsp_ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*WSP_GGML_CALL get_alloc_size) (wsp_ggml_backend_buffer_type_t buft, const struct wsp_ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*WSP_GGML_CALL supports_backend)(wsp_ggml_backend_buffer_type_t buft, wsp_ggml_backend_t backend); // check if the buffer type is usable by the backend
// check if tensor data is in host memory
// should be equivalent to supports_backend(buft, wsp_ggml_backend_cpu_init())
bool (*WSP_GGML_CALL is_host) (wsp_ggml_backend_buffer_type_t buft);
};

struct wsp_ggml_backend_buffer_type {
@@ -31,30 +35,33 @@ extern "C" {
typedef void * wsp_ggml_backend_buffer_context_t;

struct wsp_ggml_backend_buffer_i {
void (*free_buffer)(wsp_ggml_backend_buffer_t buffer);
//void (*reset) (wsp_ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
void * (*get_base) (wsp_ggml_backend_buffer_t buffer);
void (*init_tensor)(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor);
void (*set_tensor) (wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor) (wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size);
// (optional) copy tensor between different buffer-type, allow for single-copy tranfers
void (*cpy_tensor_from)(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);
void (*cpy_tensor_to) (wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);
const char * (*WSP_GGML_CALL get_name) (wsp_ggml_backend_buffer_t buffer);
void (*WSP_GGML_CALL free_buffer)(wsp_ggml_backend_buffer_t buffer);
void * (*WSP_GGML_CALL get_base) (wsp_ggml_backend_buffer_t buffer);
void (*WSP_GGML_CALL init_tensor)(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor);
void (*WSP_GGML_CALL set_tensor) (wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*WSP_GGML_CALL get_tensor) (wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*WSP_GGML_CALL cpy_tensor) (wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
void (*WSP_GGML_CALL clear) (wsp_ggml_backend_buffer_t buffer, uint8_t value);
void (*WSP_GGML_CALL reset) (wsp_ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
};

struct wsp_ggml_backend_buffer {
struct wsp_ggml_backend_buffer_i iface;
wsp_ggml_backend_buffer_type_t buft;
wsp_ggml_backend_buffer_context_t context;
size_t size;
enum wsp_ggml_backend_buffer_usage usage;
};

wsp_ggml_backend_buffer_t wsp_ggml_backend_buffer_init(
WSP_GGML_CALL wsp_ggml_backend_buffer_t wsp_ggml_backend_buffer_init(
wsp_ggml_backend_buffer_type_t buft,
struct wsp_ggml_backend_buffer_i iface,
wsp_ggml_backend_buffer_context_t context,
size_t size);

// do not use directly, use wsp_ggml_backend_tensor_copy instead
bool wsp_ggml_backend_buffer_copy_tensor(const struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);

//
// Backend
@@ -63,33 +70,31 @@ extern "C" {
typedef void * wsp_ggml_backend_context_t;

struct wsp_ggml_backend_i {
const char * (*get_name)(wsp_ggml_backend_t backend);
const char * (*WSP_GGML_CALL get_name)(wsp_ggml_backend_t backend);

void (*free)(wsp_ggml_backend_t backend);
void (*WSP_GGML_CALL free)(wsp_ggml_backend_t backend);

// buffer allocation
wsp_ggml_backend_buffer_type_t (*get_default_buffer_type)(wsp_ggml_backend_t backend);
wsp_ggml_backend_buffer_type_t (*WSP_GGML_CALL get_default_buffer_type)(wsp_ggml_backend_t backend);

// (optional) asynchroneous tensor data access
void (*set_tensor_async)(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size);
// (optional) asynchronous tensor data access
void (*WSP_GGML_CALL set_tensor_async)(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*WSP_GGML_CALL get_tensor_async)(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*WSP_GGML_CALL cpy_tensor_async)(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);

// (optional) asynchroneous tensor copy
void (*cpy_tensor_from_async)(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);
void (*cpy_tensor_to_async) (wsp_ggml_backend_t backend, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);

void (*synchronize) (wsp_ggml_backend_t backend);
// (optional) complete all pending operations
void (*WSP_GGML_CALL synchronize)(wsp_ggml_backend_t backend);

// compute graph with a plan
wsp_ggml_backend_graph_plan_t (*graph_plan_create) (wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph);
void (*graph_plan_free) (wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan);
void (*graph_plan_compute)(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan);
wsp_ggml_backend_graph_plan_t (*WSP_GGML_CALL graph_plan_create) (wsp_ggml_backend_t backend, const struct wsp_ggml_cgraph * cgraph);
void (*WSP_GGML_CALL graph_plan_free) (wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan);
void (*WSP_GGML_CALL graph_plan_compute)(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan);

// compute graph without a plan
void (*graph_compute)(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph);
// compute graph without a plan (async)
bool (*WSP_GGML_CALL graph_compute)(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph);

// check if the backend supports an operation
bool (*supports_op)(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op);
bool (*WSP_GGML_CALL supports_op)(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op);
};

struct wsp_ggml_backend {
@@ -98,14 +103,13 @@ extern "C" {
wsp_ggml_backend_context_t context;
};


//
// Backend registry
//

typedef wsp_ggml_backend_t (*wsp_ggml_backend_init_fn)(const char * params, void * user_data);
typedef wsp_ggml_backend_t (*WSP_GGML_CALL wsp_ggml_backend_init_fn)(const char * params, void * user_data);

void wsp_ggml_backend_register(const char * name, wsp_ggml_backend_init_fn init_fn, wsp_ggml_backend_buffer_type_t default_buffer_type, void * user_data);
WSP_GGML_CALL void wsp_ggml_backend_register(const char * name, wsp_ggml_backend_init_fn init_fn, wsp_ggml_backend_buffer_type_t default_buffer_type, void * user_data);

#ifdef __cplusplus
}
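Since the buffer interface above gained get_name, a bool-returning cpy_tensor, clear and reset, and every callback is now annotated with WSP_GGML_CALL, a custom buffer has more to fill in. The following is a toy host-memory sketch of the updated struct wsp_ggml_backend_buffer_i, not code from this commit: wsp_ggml_nbytes and wsp_ggml_backend_buffer_is_host are assumed to exist as the prefixed counterparts of upstream ggml helpers, leaving init_tensor and reset as NULL assumes callers treat them as optional (as the upstream CPU buffer does), and the WSP_GGML_CALL placement follows the upstream convention.

#include <stdlib.h>
#include <string.h>

#include "ggml-backend-impl.h"

// toy buffer: the buffer context is simply a malloc'd block of host memory

WSP_GGML_CALL static const char * toy_buffer_get_name(wsp_ggml_backend_buffer_t buffer) {
    (void) buffer;
    return "TOY";
}

WSP_GGML_CALL static void toy_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
    free(buffer->context);
}

WSP_GGML_CALL static void * toy_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
    return buffer->context;
}

WSP_GGML_CALL static void toy_buffer_set_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor,
                                                const void * data, size_t offset, size_t size) {
    (void) buffer;
    memcpy((char *) tensor->data + offset, data, size);
}

WSP_GGML_CALL static void toy_buffer_get_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * tensor,
                                                void * data, size_t offset, size_t size) {
    (void) buffer;
    memcpy(data, (const char *) tensor->data + offset, size);
}

WSP_GGML_CALL static bool toy_buffer_cpy_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * src,
                                                struct wsp_ggml_tensor * dst) {
    (void) buffer;
    if (wsp_ggml_backend_buffer_is_host(src->buffer)) { // dst is in this buffer, src may be anywhere
        memcpy(dst->data, src->data, wsp_ggml_nbytes(src));
        return true;
    }
    return false; // not handled here; the caller falls back to a staged copy
}

WSP_GGML_CALL static void toy_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
    memset(buffer->context, value, buffer->size);
}

static struct wsp_ggml_backend_buffer_i toy_buffer_interface = {
    /* .get_name    = */ toy_buffer_get_name,
    /* .free_buffer = */ toy_buffer_free_buffer,
    /* .get_base    = */ toy_buffer_get_base,
    /* .init_tensor = */ NULL, // nothing to do for plain host memory
    /* .set_tensor  = */ toy_buffer_set_tensor,
    /* .get_tensor  = */ toy_buffer_get_tensor,
    /* .cpy_tensor  = */ toy_buffer_cpy_tensor,
    /* .clear       = */ toy_buffer_clear,
    /* .reset       = */ NULL, // no extra state to reset
};

A buffer type's alloc_buffer callback would then wrap a fresh allocation with wsp_ggml_backend_buffer_init(buft, toy_buffer_interface, data, size), the constructor declared in this header.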
(diff for the remaining 18 changed files not loaded)
