Integer quantisation support #540

Merged · 15 commits · Apr 30, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -23,6 +23,7 @@ build-sanitize-thread/
/talk
/talk-llama
/bench
/quantize

arm_neon.h
sync.sh
6 changes: 6 additions & 0 deletions CMakeLists.txt
@@ -303,6 +303,12 @@ if (BUILD_SHARED_LIBS)

target_compile_definitions(${TARGET} PUBLIC
WHISPER_SHARED
GGML_SHARED
)

target_compile_definitions(${TARGET} PRIVATE
WHISPER_BUILD
GGML_BUILD
)
endif()

9 changes: 6 additions & 3 deletions Makefile
@@ -1,4 +1,4 @@
default: main bench
default: main bench quantize

ifndef UNAME_S
UNAME_S := $(shell uname -s)
@@ -243,15 +243,15 @@ libwhisper.so: ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)

clean:
rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so
rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so

#
# Examples
#

CC_SDL=`sdl2-config --cflags --libs`

SRC_COMMON = examples/common.cpp
SRC_COMMON = examples/common.cpp examples/common-ggml.cpp
SRC_COMMON_SDL = examples/common-sdl.cpp

main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
Expand All @@ -261,6 +261,9 @@ main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)

quantize: examples/quantize/quantize.cpp ggml.o $(WHISPER_OBJ) $(SRC_COMMON)
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o quantize $(LDFLAGS)

stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)

17 changes: 17 additions & 0 deletions README.md
@@ -15,6 +15,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
- AVX intrinsics support for x86 architectures
- VSX intrinsics support for POWER architectures
- Mixed F16 / F32 precision
- [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
- Low memory usage (Flash Attention)
- Zero memory allocations at runtime
- Runs on the CPU
@@ -228,6 +229,22 @@ make large
| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |

## Quantization

`whisper.cpp` supports integer quantization of the Whisper `ggml` models.
Quantized models require less memory and disk space and, depending on the hardware, can be processed more efficiently.

Here are the steps for creating and using a quantized model:

```bash
# quantize a model with Q5_0 method
make quantize
./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0

# run the examples as usual, specifying the quantized model file
./main -m models/ggml-base.en-q5_0.bin ./samples/gb0.wav
```

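Not part of this diff, but as a minimal sketch of what the README section describes: a quantized `ggml` model produced by `./quantize` is a drop-in replacement for the original file, so it is loaded through the same C API as an F16/F32 model. The snippet below assumes the `whisper_init_from_file` / `whisper_free` entry points from `whisper.h` and a hypothetical file name; build flags are illustrative only.

```cpp
// minimal_q5_load.cpp -- hypothetical example, not part of this PR
// Build sketch (assumes libwhisper.a has already been built in the repo root):
//   g++ -I. minimal_q5_load.cpp libwhisper.a -o minimal_q5_load -pthread
#include <cstdio>

#include "whisper.h"

int main() {
    // The quantized file is loaded exactly like the original ggml model --
    // no extra flags or API calls are needed.
    struct whisper_context * ctx = whisper_init_from_file("models/ggml-base.en-q5_0.bin");
    if (ctx == nullptr) {
        fprintf(stderr, "failed to load quantized model\n");
        return 1;
    }

    // ... run whisper_full() on PCM audio exactly as with an F16 model ...

    whisper_free(ctx);
    return 0;
}
```
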
## Core ML support

On Apple Silicon devices, the Encoder inference can be executed on the Apple Neural Engine (ANE) via Core ML. This can result in significant
2 changes: 1 addition & 1 deletion bindings/javascript/whisper.js

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions examples/CMakeLists.txt
@@ -21,10 +21,14 @@ set(TARGET common)
add_library(${TARGET} STATIC
common.h
common.cpp
common-ggml.h
common-ggml.cpp
)

include(DefaultTargetOptions)

target_link_libraries(${TARGET} PRIVATE whisper)

set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)

if (WHISPER_SDL2)
@@ -62,6 +66,7 @@ else()
add_subdirectory(stream)
add_subdirectory(command)
add_subdirectory(bench)
add_subdirectory(quantize)
add_subdirectory(talk)
add_subdirectory(talk-llama)
endif()
9 changes: 5 additions & 4 deletions examples/addon.node/__test__/whisper.spec.js
@@ -14,9 +14,10 @@ const whisperParamsMock = {
};

describe("Run whisper.node", () => {
test("it should receive a non-empty value", async () => {
let result = await whisperAsync(whisperParamsMock);
test("it should receive a non-empty value", async () => {
let result = await whisperAsync(whisperParamsMock);

expect(result.length).toBeGreaterThan(0);
});
expect(result.length).toBeGreaterThan(0);
}, 10000);
});

6 changes: 3 additions & 3 deletions examples/bench.wasm/CMakeLists.txt
@@ -31,9 +31,9 @@ endif()
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \
-s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \
-s INITIAL_MEMORY=1024MB \
-s TOTAL_MEMORY=1024MB \
-s PTHREAD_POOL_SIZE_STRICT=0 \
-s INITIAL_MEMORY=2000MB \
-s TOTAL_MEMORY=2000MB \
-s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \
74 changes: 65 additions & 9 deletions examples/bench.wasm/index-tmpl.html
@@ -35,6 +35,15 @@

<br><br>

<b>More examples:</b>
<a href="https://whisper.ggerganov.com/">main</a> |
<a href="https://whisper.ggerganov.com/bench">bench</a> |
<a href="https://whisper.ggerganov.com/stream">stream</a> |
<a href="https://whisper.ggerganov.com/command">command</a> |
<a href="https://whisper.ggerganov.com/talk">talk</a> |

<br><br>

<hr>

Select the model you would like to use and click the "Bench" button.<br>
@@ -44,11 +53,18 @@

<div id="model-whisper">
Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<span id="fetch-whisper-progress"></span>

<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
<br><br>
Quantized models:<br><br>
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
<button id="fetch-whisper-small-en-q5_1" onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
<button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
<button id="fetch-whisper-large-q5_0" onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
<span id="fetch-whisper-progress"></span>
</div>

<br>
@@ -160,6 +176,14 @@

document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-small-en').style.display = 'none';

document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';

document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
}
@@ -168,19 +192,42 @@
let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',

'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
'large-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
};

let sizes = {
'tiny.en': 75,
'base.en': 142,
'small.en': 466,

'tiny-en-q5_1': 31,
'base-en-q5_1': 57,
'small-en-q5_1': 182,
'medium-en-q5_0': 515,
'large-q5_0': 1030,
};

let url = urls[model];
let dst = 'whisper.bin';
let size_mb = sizes[model];

document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-small-en').style.display = 'none';

document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';

document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';

cbProgress = function(p) {
Expand All @@ -190,9 +237,18 @@

cbCancel = function() {
var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';

el = document.getElementById('fetch-whisper-tiny-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-large-q5_0' ); if (el) el.style.display = 'inline-block';

el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
};

loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
27 changes: 27 additions & 0 deletions examples/command.wasm/index-tmpl.html
@@ -35,6 +35,15 @@

<br><br>

<b>More examples:</b>
<a href="https://whisper.ggerganov.com/">main</a> |
<a href="https://whisper.ggerganov.com/bench">bench</a> |
<a href="https://whisper.ggerganov.com/stream">stream</a> |
<a href="https://whisper.ggerganov.com/command">command</a> |
<a href="https://whisper.ggerganov.com/talk">talk</a> |

<br><br>

<hr>

Select the model you would like to use, click the "Start" button and follow the instructions.
@@ -45,6 +54,10 @@
Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<br><br>
Quantized models:<br><br>
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
<span id="fetch-whisper-progress"></span>

<!--
@@ -162,11 +175,17 @@
let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',

'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
};

let sizes = {
'tiny.en': 75,
'base.en': 142,

'tiny-en-q5_1': 31,
'base-en-q5_1': 57,
};

let url = urls[model];
@@ -177,6 +196,10 @@

document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';

document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';

document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';

cbProgress = function(p) {
Expand All @@ -188,6 +211,10 @@
var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';

el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';

el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
};
