Commit 03c956a

Add keyword spotting API for node-addon-api (#877)

1 parent 75630b9 · commit 03c956a
18 files changed, +492 -26 lines changed

.github/scripts/node-addon/run.sh (+1, -1)

@@ -18,7 +18,7 @@ fi
 SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
 echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION"
 
-# SHERPA_ONNX_VERSION=1.0.23
+# SHERPA_ONNX_VERSION=1.0.24
 
 if [ -z $owner ]; then
   owner=k2-fsa

.github/scripts/test-nodejs-addon-npm.sh (+9)

@@ -6,6 +6,15 @@ d=nodejs-addon-examples
 echo "dir: $d"
 cd $d
 
+echo "----------keyword spotting----------"
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/kws-models/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
+tar xvf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
+rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
+
+node ./test_keyword_spotter_transducer.js
+rm -rf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01
+
 echo "----------add punctuations----------"
 
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/punctuation-models/sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2

.github/workflows/npm-addon.yaml (+1, -1)

@@ -55,7 +55,7 @@ jobs:
 
 SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
 echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION"
-# SHERPA_ONNX_VERSION=1.0.23
+# SHERPA_ONNX_VERSION=1.0.24
 
 src_dir=.github/scripts/node-addon
 sed -i.bak s/SHERPA_ONNX_VERSION/$SHERPA_ONNX_VERSION/g $src_dir/package.json

nodejs-addon-examples/README.md (+21)

@@ -62,6 +62,13 @@ The following tables list the examples in this folder.
 |[./test_audio_tagging_zipformer.js](./test_audio_tagging_zipformer.js)| Audio tagging with a Zipformer model|
 |[./test_audio_tagging_ced.js](./test_audio_tagging_ced.js)| Audio tagging with a [CED](https://github.com/RicherMans/CED) model|
 
+## Keyword spotting
+
+|File| Description|
+|---|---|
+|[./test_keyword_spotter_transducer.js](./test_keyword_spotter_transducer.js)| Keyword spotting from a file using a Zipformer model|
+|[./test_keyword_spotter_transducer_microphone.js](./test_keyword_spotter_transducer_microphone.js)| Keyword spotting from a microphone using a Zipformer model|
+
 ## Streaming speech-to-text from files
 
 |File| Description|
@@ -325,3 +332,17 @@ rm sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2
 
 node ./test_punctuation.js
 ```
+
+## Keyword spotting
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/kws-models/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
+tar xvf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
+rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
+
+node ./test_keyword_spotter_transducer.js
+
+# To run keyword spotting using a microphone
+npm install naudiodon2
+node ./test_keyword_spotter_transducer_microphone.js
+```

nodejs-addon-examples/test_asr_streaming_ctc_hlg_microphone.js (-6)

@@ -79,11 +79,5 @@ ai.on('data', data => {
   }
 });
 
-ai.on('close', () => {
-  console.log('Free resources');
-  stream.free();
-  recognizer.free();
-});
-
 ai.start();
 console.log('Started! Please speak')

nodejs-addon-examples/test_asr_streaming_ctc_microphone.js (-5)

@@ -78,11 +78,6 @@ ai.on('data', data => {
   }
 });
 
-ai.on('close', () => {
-  console.log('Free resources');
-  stream.free();
-  recognizer.free();
-});
 
 ai.start();
 console.log('Started! Please speak')

nodejs-addon-examples/test_asr_streaming_paraformer_microphone.js (-6)

@@ -94,11 +94,5 @@ ai.on('data', data => {
   }
 });
 
-ai.on('close', () => {
-  console.log('Free resources');
-  stream.free();
-  recognizer.free();
-});
-
 ai.start();
 console.log('Started! Please speak')

nodejs-addon-examples/test_asr_streaming_transducer_microphone.js (-6)

@@ -82,11 +82,5 @@ ai.on('data', data => {
   }
 });
 
-ai.on('close', () => {
-  console.log('Free resources');
-  stream.free();
-  recognizer.free();
-});
-
 ai.start();
 console.log('Started! Please speak')

nodejs-addon-examples/test_keyword_spotter_transducer.js (+66)

@@ -0,0 +1,66 @@
+// Copyright (c) 2024 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+const performance = require('perf_hooks').performance;
+
+
+// Please download test files from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models
+const config = {
+  'featConfig': {
+    'sampleRate': 16000,
+    'featureDim': 80,
+  },
+  'modelConfig': {
+    'transducer': {
+      'encoder':
+          './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.onnx',
+      'decoder':
+          './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk-16-left-64.onnx',
+      'joiner':
+          './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.onnx',
+    },
+    'tokens':
+        './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt',
+    'numThreads': 1,
+    'provider': 'cpu',
+    'debug': 1,
+  },
+  'keywordsFile':
+      './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/test_keywords.txt',
+};
+
+const waveFilename =
+    './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav';
+
+const kws = new sherpa_onnx.KeywordSpotter(config);
+console.log('Started')
+let start = performance.now();
+const stream = kws.createStream();
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
+
+const tailPadding = new Float32Array(wave.sampleRate * 0.4);
+stream.acceptWaveform({samples: tailPadding, sampleRate: wave.sampleRate});
+
+const detectedKeywords = [];
+while (kws.isReady(stream)) {
+  const keyword = kws.getResult(stream).keyword;
+  if (keyword != '') {
+    detectedKeywords.push(keyword);
+  }
+  kws.decode(stream);
+}
+let stop = performance.now();
+
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+console.log(waveFilename)
+console.log('result\n', detectedKeywords)

nodejs-addon-examples/test_keyword_spotter_transducer_microphone.js (+74)

@@ -0,0 +1,74 @@
+// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const portAudio = require('naudiodon2');
+// console.log(portAudio.getDevices());
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+function createKeywordSpotter() {
+  const config = {
+    'featConfig': {
+      'sampleRate': 16000,
+      'featureDim': 80,
+    },
+    'modelConfig': {
+      'transducer': {
+        'encoder':
+            './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.onnx',
+        'decoder':
+            './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk-16-left-64.onnx',
+        'joiner':
+            './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.onnx',
+      },
+      'tokens':
+          './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt',
+      'numThreads': 2,
+      'provider': 'cpu',
+      'debug': 1,
+    },
+    'keywordsFile':
+        './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/keywords.txt',
+  };
+
+  return new sherpa_onnx.KeywordSpotter(config);
+}
+
+const kws = createKeywordSpotter();
+const stream = kws.createStream();
+
+let lastText = '';
+let segmentIndex = 0;
+
+const ai = new portAudio.AudioIO({
+  inOptions: {
+    channelCount: 1,
+    closeOnError: true,  // Close the stream if an audio error is detected, if
+                         // set false then just log the error
+    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
+    sampleFormat: portAudio.SampleFormatFloat32,
+    sampleRate: kws.config.featConfig.sampleRate
+  }
+});
+
+const display = new sherpa_onnx.Display(50);
+
+ai.on('data', data => {
+  const samples = new Float32Array(data.buffer);
+
+  stream.acceptWaveform(
+      {sampleRate: kws.config.featConfig.sampleRate, samples: samples});
+
+  while (kws.isReady(stream)) {
+    kws.decode(stream);
+  }
+
+  const keyword = kws.getResult(stream).keyword
+  if (keyword != '') {
+    display.print(segmentIndex, keyword);
+    segmentIndex += 1;
+  }
+});
+
+ai.start();
+console.log('Started! Please speak.')
+console.log(`Only words from ${kws.config.keywordsFile} can be recognized`)

scripts/node-addon-api/CMakeLists.txt (+1)

@@ -19,6 +19,7 @@ include_directories(${CMAKE_JS_INC})
 
 set(srcs
   src/audio-tagging.cc
+  src/keyword-spotting.cc
  src/non-streaming-asr.cc
  src/non-streaming-tts.cc
  src/punctuation.cc

scripts/node-addon-api/lib/keyword-spotter.js (+32)

@@ -0,0 +1,32 @@
+const addon = require('./addon.js');
+const streaming_asr = require('./streaming-asr.js');
+
+class KeywordSpotter {
+  constructor(config) {
+    this.handle = addon.createKeywordSpotter(config);
+    this.config = config
+  }
+
+  createStream() {
+    const handle = addon.createKeywordStream(this.handle);
+    return new streaming_asr.OnlineStream(handle);
+  }
+
+  isReady(stream) {
+    return addon.isKeywordStreamReady(this.handle, stream.handle);
+  }
+
+  decode(stream) {
+    addon.decodeKeywordStream(this.handle, stream.handle);
+  }
+
+  getResult(stream) {
+    const jsonStr = addon.getKeywordResultAsJson(this.handle, stream.handle);
+
+    return JSON.parse(jsonStr);
+  }
+}
+
+module.exports = {
+  KeywordSpotter,
+}
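
For orientation, here is a minimal sketch of how this wrapper is consumed from user code once it is re-exported through sherpa-onnx.js below. It mirrors test_keyword_spotter_transducer.js added in this commit; the model directory, model file names, and keyword-file path are taken from that example and are assumptions if you use a different pretrained model.

```js
// Minimal usage sketch of the KeywordSpotter API added in this commit.
const sherpa_onnx = require('sherpa-onnx-node');

// Path assumptions: the model directory used by the examples in this commit.
const modelDir = './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01';

const kws = new sherpa_onnx.KeywordSpotter({
  featConfig: {sampleRate: 16000, featureDim: 80},
  modelConfig: {
    transducer: {
      encoder: `${modelDir}/encoder-epoch-12-avg-2-chunk-16-left-64.onnx`,
      decoder: `${modelDir}/decoder-epoch-12-avg-2-chunk-16-left-64.onnx`,
      joiner: `${modelDir}/joiner-epoch-12-avg-2-chunk-16-left-64.onnx`,
    },
    tokens: `${modelDir}/tokens.txt`,
    numThreads: 1,
    provider: 'cpu',
    debug: 1,
  },
  keywordsFile: `${modelDir}/test_wavs/test_keywords.txt`,
});

// Feed a whole wave file, then drain the stream: call decode() while
// isReady() is true and read each result's .keyword field (an empty
// string means nothing was detected at that step).
const stream = kws.createStream();
const wave = sherpa_onnx.readWave(`${modelDir}/test_wavs/3.wav`);
stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});

while (kws.isReady(stream)) {
  const keyword = kws.getResult(stream).keyword;
  if (keyword != '') {
    console.log('Detected:', keyword);
  }
  kws.decode(stream);
}
```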

scripts/node-addon-api/lib/sherpa-onnx.js (+2)

@@ -7,6 +7,7 @@ const slid = require('./spoken-language-identification.js');
 const sid = require('./speaker-identification.js');
 const at = require('./audio-tagg.js');
 const punct = require('./punctuation.js');
+const kws = require('./keyword-spotter.js');
 
 module.exports = {
   OnlineRecognizer: streaming_asr.OnlineRecognizer,
@@ -22,4 +23,5 @@ module.exports = {
   SpeakerEmbeddingManager: sid.SpeakerEmbeddingManager,
   AudioTagging: at.AudioTagging,
   Punctuation: punct.Punctuation,
+  KeywordSpotter: kws.KeywordSpotter,
 }
