Commit 388e6a9

Add speaker identification APIs for node-addon-api (#874)

1 parent 0895b64 · commit 388e6a9
16 files changed: +1034 −3 lines

.github/scripts/node-addon/run.sh (+1 −1)

@@ -18,7 +18,7 @@ fi
 SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
 echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION"

-# SHERPA_ONNX_VERSION=1.0.20
+# SHERPA_ONNX_VERSION=1.0.21

 if [ -z $owner ]; then
   owner=k2-fsa

.github/scripts/test-nodejs-addon-npm.sh (+10)

@@ -6,6 +6,16 @@ d=nodejs-addon-examples
 echo "dir: $d"
 cd $d

+echo "----------speaker identification----------"
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+
+git clone https://github.com/csukuangfj/sr-data
+
+node ./test_speaker_identification.js
+
+rm *.onnx
+rm -rf sr-data
+
 echo "----------spoken language identification----------"

 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2

.github/workflows/npm-addon-linux-aarch64.yaml (+3)

@@ -1,6 +1,9 @@
 name: npm-addon-linux-aarch64

 on:
+  push:
+    branches:
+      - node-addon
   workflow_dispatch:

 concurrency:

.github/workflows/npm-addon-linux-x64.yaml (+3)

@@ -1,6 +1,9 @@
 name: npm-addon-linux-x64

 on:
+  push:
+    branches:
+      - node-addon
   workflow_dispatch:

 concurrency:

.github/workflows/npm-addon-macos.yaml (+3)

@@ -1,6 +1,9 @@
 name: npm-addon-macos

 on:
+  push:
+    branches:
+      - node-addon
   workflow_dispatch:

 concurrency:

.github/workflows/npm-addon-win-x64.yaml (+3)

@@ -1,6 +1,9 @@
 name: npm-addon-win-x64

 on:
+  push:
+    branches:
+      - node-addon
   workflow_dispatch:

 concurrency:

.github/workflows/npm-addon.yaml (+4 −1)

@@ -1,6 +1,9 @@
 name: npm-addon

 on:
+  push:
+    branches:
+      - node-addon
   workflow_dispatch:

 concurrency:
@@ -52,7 +55,7 @@ jobs:

         SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
         echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION"
-        # SHERPA_ONNX_VERSION=1.0.20
+        # SHERPA_ONNX_VERSION=1.0.21

         src_dir=.github/scripts/node-addon
         sed -i.bak s/SHERPA_ONNX_VERSION/$SHERPA_ONNX_VERSION/g $src_dir/package.json

nodejs-addon-examples/README.md (+13)

@@ -201,3 +201,16 @@ node ./test_spoken_language_identification.js
 npm install naudiodon2
 node ./test_vad_spoken_language_identification_microphone.js
 ```
+
+## Speaker identification
+
+You can find more models at
+<https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models>
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+
+git clone https://github.com/csukuangfj/sr-data
+
+node ./test_speaker_identification.js
+```
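
For orientation, here is a trimmed sketch of what the example script added by this commit does with those files. It exercises the new SpeakerEmbeddingExtractor and SpeakerEmbeddingManager APIs; the model and wave paths are the ones downloaded above, and the 0.6 threshold mirrors the full example further down.

```js
const sherpa_onnx = require('sherpa-onnx-node');

// Create the embedding extractor from the downloaded model.
const extractor = new sherpa_onnx.SpeakerEmbeddingExtractor({
  model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
  numThreads: 1,
});

// Compute a speaker embedding (Float32Array) for one wave file.
function computeEmbedding(filename) {
  const stream = extractor.createStream();
  const wave = sherpa_onnx.readWave(filename);
  stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
  return extractor.compute(stream);
}

// Enroll one speaker, then look up an unknown recording.
const manager = new sherpa_onnx.SpeakerEmbeddingManager(extractor.dim);
manager.add({
  name: 'fangjun',
  v: computeEmbedding('./sr-data/enroll/fangjun-sr-1.wav'),
});

const name = manager.search({
  v: computeEmbedding('./sr-data/test/fangjun-test-sr-1.wav'),
  threshold: 0.6,  // search() returns '' if no enrolled speaker matches
});
console.log(name === '' ? '<Unknown>' : name);
```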

nodejs-addon-examples/test_asr_non_streaming_nemo_ctc.js (−1)

@@ -2,7 +2,6 @@
 const sherpa_onnx = require('sherpa-onnx-node');
 const performance = require('perf_hooks').performance;

-
 // Please download test files from
 // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
 const config = {

nodejs-addon-examples/test_speaker_identification.js (+102, new file)

@@ -0,0 +1,102 @@
// Copyright (c) 2024 Xiaomi Corporation
const sherpa_onnx = require('sherpa-onnx-node');
const assert = require('node:assert');

// Please download model files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
function createSpeakerEmbeddingExtractor() {
  const config = {
    model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
    numThreads: 1,
    debug: true,
  };
  return new sherpa_onnx.SpeakerEmbeddingExtractor(config);
}

function computeEmbedding(extractor, filename) {
  const stream = extractor.createStream();
  const wave = sherpa_onnx.readWave(filename);
  stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
  return extractor.compute(stream);
}

const extractor = createSpeakerEmbeddingExtractor();
const manager = new sherpa_onnx.SpeakerEmbeddingManager(extractor.dim);

// Please download test files from
// https://github.com/csukuangfj/sr-data
const spk1Files = [
  './sr-data/enroll/fangjun-sr-1.wav',
  './sr-data/enroll/fangjun-sr-2.wav',
  './sr-data/enroll/fangjun-sr-3.wav',
];

let spk1Vec = [];
for (let f of spk1Files) {
  spk1Vec.push(computeEmbedding(extractor, f));
}

const spk2Files = [
  './sr-data/enroll/leijun-sr-1.wav',
  './sr-data/enroll/leijun-sr-2.wav',
];

let spk2Vec = [];
for (let f of spk2Files) {
  spk2Vec.push(computeEmbedding(extractor, f));
}

let ok = manager.addMulti({name: 'fangjun', v: spk1Vec});
assert.equal(ok, true);

ok = manager.addMulti({name: 'leijun', v: spk2Vec});
assert.equal(ok, true);

assert.equal(manager.getNumSpeakers(), 2);

assert.equal(manager.contains('fangjun'), true);
assert.equal(manager.contains('leijun'), true);

console.log('---All speakers---');
console.log(manager.getAllSpeakerNames());
console.log('------------');

const testFiles = [
  './sr-data/test/fangjun-test-sr-1.wav',
  './sr-data/test/leijun-test-sr-1.wav',
  './sr-data/test/liudehua-test-sr-1.wav',
];

const threshold = 0.6;

for (let f of testFiles) {
  const embedding = computeEmbedding(extractor, f);

  let name = manager.search({v: embedding, threshold: threshold});
  if (name == '') {
    name = '<Unknown>';
  }
  console.log(`${f}: ${name}`);
}

ok = manager.verify({
  name: 'fangjun',
  v: computeEmbedding(extractor, testFiles[0]),
  threshold: threshold
});
assert.equal(ok, true);

ok = manager.remove('fangjun');
assert.equal(ok, true);

ok = manager.verify({
  name: 'fangjun',
  v: computeEmbedding(extractor, testFiles[0]),
  threshold: threshold
});
assert.equal(ok, false);

assert.equal(manager.getNumSpeakers(), 1);

scripts/node-addon-api/CMakeLists.txt (+1)

@@ -21,6 +21,7 @@ set(srcs
   src/non-streaming-asr.cc
   src/non-streaming-tts.cc
   src/sherpa-onnx-node-addon-api.cc
+  src/speaker-identification.cc
   src/spoken-language-identification.cc
   src/streaming-asr.cc
   src/vad.cc

scripts/node-addon-api/lib/sherpa-onnx.js (+3)

@@ -4,6 +4,7 @@ const non_streaming_asr = require('./non-streaming-asr.js');
 const non_streaming_tts = require('./non-streaming-tts.js');
 const vad = require('./vad.js');
 const slid = require('./spoken-language-identification.js');
+const sid = require('./speaker-identification.js');

 module.exports = {
   OnlineRecognizer: streaming_asr.OnlineRecognizer,
@@ -15,4 +16,6 @@ module.exports = {
   Vad: vad.Vad,
   CircularBuffer: vad.CircularBuffer,
   SpokenLanguageIdentification: slid.SpokenLanguageIdentification,
+  SpeakerEmbeddingExtractor: sid.SpeakerEmbeddingExtractor,
+  SpeakerEmbeddingManager: sid.SpeakerEmbeddingManager,
 }
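
With these re-exports in place, the new classes are reachable directly from the package entry point. A minimal sketch (assuming the published package name sherpa-onnx-node used by the examples, and the model file from the README above):

```js
const {SpeakerEmbeddingExtractor, SpeakerEmbeddingManager} =
    require('sherpa-onnx-node');

const extractor = new SpeakerEmbeddingExtractor({
  model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
  numThreads: 1,
});

// The manager is keyed to the extractor's embedding dimension.
const manager = new SpeakerEmbeddingManager(extractor.dim);
console.log('embedding dim:', extractor.dim);
```
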
scripts/node-addon-api/lib/speaker-identification.js (+102, new file)

@@ -0,0 +1,102 @@
const addon = require('./addon.js');
const streaming_asr = require('./streaming-asr.js');

class SpeakerEmbeddingExtractor {
  constructor(config) {
    this.handle = addon.createSpeakerEmbeddingExtractor(config);
    this.config = config;
    this.dim = addon.speakerEmbeddingExtractorDim(this.handle);
  }

  createStream() {
    return new streaming_asr.OnlineStream(
        addon.speakerEmbeddingExtractorCreateStream(this.handle));
  }

  isReady(stream) {
    return addon.speakerEmbeddingExtractorIsReady(this.handle, stream.handle);
  }

  // Returns a Float32Array
  compute(stream) {
    return addon.speakerEmbeddingExtractorComputeEmbedding(
        this.handle, stream.handle);
  }
}

function flatten(arrayList) {
  let n = 0;
  for (let i = 0; i < arrayList.length; ++i) {
    n += arrayList[i].length;
  }
  let ans = new Float32Array(n);

  let offset = 0;
  for (let i = 0; i < arrayList.length; ++i) {
    ans.set(arrayList[i], offset);
    offset += arrayList[i].length;
  }
  return ans;
}

class SpeakerEmbeddingManager {
  constructor(dim) {
    this.handle = addon.createSpeakerEmbeddingManager(dim);
    this.dim = dim;
  }

  /*
   * obj = {name: "xxx", v: a-float32-array}
   */
  add(obj) {
    return addon.speakerEmbeddingManagerAdd(this.handle, obj);
  }

  /*
   * obj =
   *   {name: "xxx", v: [float32_array1, float32_array2, ..., float32_arrayn]}
   */
  addMulti(obj) {
    const c = {
      name: obj.name,
      vv: flatten(obj.v),
      n: obj.v.length,
    };
    return addon.speakerEmbeddingManagerAddListFlattened(this.handle, c);
  }

  remove(name) {
    return addon.speakerEmbeddingManagerRemove(this.handle, name);
  }

  /*
   * obj = {v: a-float32-array, threshold: a-float}
   */
  search(obj) {
    return addon.speakerEmbeddingManagerSearch(this.handle, obj);
  }

  /*
   * obj = {name: 'xxx', v: a-float32-array, threshold: a-float}
   */
  verify(obj) {
    return addon.speakerEmbeddingManagerVerify(this.handle, obj);
  }

  contains(name) {
    return addon.speakerEmbeddingManagerContains(this.handle, name);
  }

  getNumSpeakers() {
    return addon.speakerEmbeddingManagerNumSpeakers(this.handle);
  }

  getAllSpeakerNames() {
    return addon.speakerEmbeddingManagerGetAllSpeakers(this.handle);
  }
}

module.exports = {
  SpeakerEmbeddingExtractor,
  SpeakerEmbeddingManager,
}
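
One design note on addMulti: the JavaScript side flattens the list of Float32Array embeddings into a single contiguous buffer and passes the count alongside it, because the addon function speakerEmbeddingManagerAddListFlattened consumes a flattened array. The sketch below only illustrates the call shape; the speaker name is hypothetical and the 3-dimensional vectors stand in for real embeddings, which have extractor.dim elements and come from SpeakerEmbeddingExtractor.compute().

```js
const sherpa_onnx = require('sherpa-onnx-node');

// Hypothetical dimension and vectors, purely to show the addMulti() call shape.
const manager = new sherpa_onnx.SpeakerEmbeddingManager(3);

const ok = manager.addMulti({
  name: 'alice',  // hypothetical speaker name
  v: [
    Float32Array.from([0.1, 0.2, 0.3]),
    Float32Array.from([0.2, 0.1, 0.3]),
  ],
});
// Internally addMulti() sends {name, vv: Float32Array(6), n: 2} to the addon.
console.log('enrolled:', ok);
```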

scripts/node-addon-api/lib/streaming-asr.js (+1)

@@ -64,5 +64,6 @@ class OnlineRecognizer {

 module.exports = {
   OnlineRecognizer,
+  OnlineStream,
   Display
 }
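
This export exists because the new lib/speaker-identification.js wraps the handle returned by speakerEmbeddingExtractorCreateStream in an OnlineStream (see its createStream() above), so speaker-embedding streams expose the same acceptWaveform interface as streaming ASR streams.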

scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc (+3)

@@ -17,6 +17,8 @@ void InitWaveWriter(Napi::Env env, Napi::Object exports);

 void InitSpokenLanguageID(Napi::Env env, Napi::Object exports);

+void InitSpeakerID(Napi::Env env, Napi::Object exports);
+
 Napi::Object Init(Napi::Env env, Napi::Object exports) {
   InitStreamingAsr(env, exports);
   InitNonStreamingAsr(env, exports);
@@ -25,6 +27,7 @@ Napi::Object Init(Napi::Env env, Napi::Object exports) {
   InitWaveReader(env, exports);
   InitWaveWriter(env, exports);
   InitSpokenLanguageID(env, exports);
+  InitSpeakerID(env, exports);

   return exports;
 }
