Skip to content

Commit b012b78

Browse files
authored
Encode hotwords in C++ side (#828)
* Encode hotwords in C++ side
1 parent 8af2af8 commit b012b78

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

43 files changed

+713
-101
lines changed

.github/scripts/test-offline-ctc.sh

+2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ log() {
88
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
99
}
1010

11+
export GIT_CLONE_PROTECTION_ACTIVE=false
12+
1113
echo "EXE is $EXE"
1214
echo "PATH: $PATH"
1315

.github/scripts/test-offline-transducer.sh

+2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ log() {
88
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
99
}
1010

11+
export GIT_CLONE_PROTECTION_ACTIVE=false
12+
1113
echo "EXE is $EXE"
1214
echo "PATH: $PATH"
1315

.github/scripts/test-offline-tts.sh

+2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ log() {
88
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
99
}
1010

11+
export GIT_CLONE_PROTECTION_ACTIVE=false
12+
1113
echo "EXE is $EXE"
1214
echo "PATH: $PATH"
1315

.github/scripts/test-offline-whisper.sh

+2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ log() {
88
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
99
}
1010

11+
export GIT_CLONE_PROTECTION_ACTIVE=false
12+
1113
echo "EXE is $EXE"
1214
echo "PATH: $PATH"
1315

.github/scripts/test-online-ctc.sh

+2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ log() {
88
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
99
}
1010

11+
export GIT_CLONE_PROTECTION_ACTIVE=false
12+
1113
echo "EXE is $EXE"
1214
echo "PATH: $PATH"
1315

.github/scripts/test-online-paraformer.sh

+2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ log() {
88
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
99
}
1010

11+
export GIT_CLONE_PROTECTION_ACTIVE=false
12+
1113
echo "EXE is $EXE"
1214
echo "PATH: $PATH"
1315

.github/scripts/test-online-transducer.sh

+2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ log() {
88
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
99
}
1010

11+
export GIT_CLONE_PROTECTION_ACTIVE=false
12+
1113
echo "EXE is $EXE"
1214
echo "PATH: $PATH"
1315

.github/scripts/test-python.sh

+2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ log() {
88
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
99
}
1010

11+
export GIT_CLONE_PROTECTION_ACTIVE=false
12+
1113
log "test online NeMo CTC"
1214

1315
url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-streaming-fast-conformer-ctc-en-80ms.tar.bz2

.github/scripts/test-spoken-language-identification.sh

+2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ log() {
88
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
99
}
1010

11+
export GIT_CLONE_PROTECTION_ACTIVE=false
12+
1113
echo "EXE is $EXE"
1214
echo "PATH: $PATH"
1315

CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ endif()
234234
include(kaldi-native-fbank)
235235
include(kaldi-decoder)
236236
include(onnxruntime)
237+
include(simple-sentencepiece)
237238
set(ONNXRUNTIME_DIR ${onnxruntime_SOURCE_DIR})
238239
message(STATUS "ONNXRUNTIME_DIR: ${ONNXRUNTIME_DIR}")
239240

build-ios-no-tts.sh

+5-3
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ echo "Generate xcframework"
126126

127127
mkdir -p "build/simulator/lib"
128128
for f in libkaldi-native-fbank-core.a libsherpa-onnx-c-api.a libsherpa-onnx-core.a \
129-
libsherpa-onnx-fst.a libsherpa-onnx-kaldifst-core.a libkaldi-decoder-core.a; do
129+
libsherpa-onnx-fst.a libsherpa-onnx-kaldifst-core.a libkaldi-decoder-core.a libssentencepiece_core.a; do
130130
lipo -create build/simulator_arm64/lib/${f} \
131131
build/simulator_x86_64/lib/${f} \
132132
-output build/simulator/lib/${f}
@@ -140,15 +140,17 @@ libtool -static -o build/simulator/sherpa-onnx.a \
140140
build/simulator/lib/libsherpa-onnx-core.a \
141141
build/simulator/lib/libsherpa-onnx-fst.a \
142142
build/simulator/lib/libsherpa-onnx-kaldifst-core.a \
143-
build/simulator/lib/libkaldi-decoder-core.a
143+
build/simulator/lib/libkaldi-decoder-core.a \
144+
build/simulator/lib/libssentencepiece_core.a
144145

145146
libtool -static -o build/os64/sherpa-onnx.a \
146147
build/os64/lib/libkaldi-native-fbank-core.a \
147148
build/os64/lib/libsherpa-onnx-c-api.a \
148149
build/os64/lib/libsherpa-onnx-core.a \
149150
build/os64/lib/libsherpa-onnx-fst.a \
150151
build/os64/lib/libsherpa-onnx-kaldifst-core.a \
151-
build/os64/lib/libkaldi-decoder-core.a
152+
build/os64/lib/libkaldi-decoder-core.a \
153+
build/os64/lib/libssentencepiece_core.a
152154

153155
rm -rf sherpa-onnx.xcframework
154156

build-ios.sh

+3-1
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ echo "Generate xcframework"
129129

130130
mkdir -p "build/simulator/lib"
131131
for f in libkaldi-native-fbank-core.a libsherpa-onnx-c-api.a libsherpa-onnx-core.a \
132-
libsherpa-onnx-fstfar.a \
132+
libsherpa-onnx-fstfar.a libssentencepiece_core.a \
133133
libsherpa-onnx-fst.a libsherpa-onnx-kaldifst-core.a libkaldi-decoder-core.a \
134134
libucd.a libpiper_phonemize.a libespeak-ng.a; do
135135
lipo -create build/simulator_arm64/lib/${f} \
@@ -150,6 +150,7 @@ libtool -static -o build/simulator/sherpa-onnx.a \
150150
build/simulator/lib/libucd.a \
151151
build/simulator/lib/libpiper_phonemize.a \
152152
build/simulator/lib/libespeak-ng.a \
153+
build/simulator/lib/libssentencepiece_core.a
153154

154155
libtool -static -o build/os64/sherpa-onnx.a \
155156
build/os64/lib/libkaldi-native-fbank-core.a \
@@ -162,6 +163,7 @@ libtool -static -o build/os64/sherpa-onnx.a \
162163
build/os64/lib/libucd.a \
163164
build/os64/lib/libpiper_phonemize.a \
164165
build/os64/lib/libespeak-ng.a \
166+
build/os64/lib/libssentencepiece_core.a
165167

166168

167169
rm -rf sherpa-onnx.xcframework

build-swift-macos.sh

+2-1
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,5 @@ libtool -static -o ./install/lib/libsherpa-onnx.a \
3333
./install/lib/libkaldi-decoder-core.a \
3434
./install/lib/libucd.a \
3535
./install/lib/libpiper_phonemize.a \
36-
./install/lib/libespeak-ng.a
36+
./install/lib/libespeak-ng.a \
37+
./install/lib/libssentencepiece_core.a

cmake/simple-sentencepiece.cmake

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
function(download_simple_sentencepiece)
2+
include(FetchContent)
3+
4+
set(simple-sentencepiece_URL "https://github.com/pkufool/simple-sentencepiece/archive/refs/tags/v0.7.tar.gz")
5+
set(simple-sentencepiece_URL2 "https://hub.nauu.cf/pkufool/simple-sentencepiece/archive/refs/tags/v0.7.tar.gz")
6+
set(simple-sentencepiece_HASH "SHA256=1748a822060a35baa9f6609f84efc8eb54dc0e74b9ece3d82367b7119fdc75af")
7+
8+
# If you don't have access to the Internet,
9+
# please pre-download simple-sentencepiece
10+
set(possible_file_locations
11+
$ENV{HOME}/Downloads/simple-sentencepiece-0.7.tar.gz
12+
${CMAKE_SOURCE_DIR}/simple-sentencepiece-0.7.tar.gz
13+
${CMAKE_BINARY_DIR}/simple-sentencepiece-0.7.tar.gz
14+
/tmp/simple-sentencepiece-0.7.tar.gz
15+
/star-fj/fangjun/download/github/simple-sentencepiece-0.7.tar.gz
16+
)
17+
18+
foreach(f IN LISTS possible_file_locations)
19+
if(EXISTS ${f})
20+
set(simple-sentencepiece_URL "${f}")
21+
file(TO_CMAKE_PATH "${simple-sentencepiece_URL}" simple-sentencepiece_URL)
22+
message(STATUS "Found local downloaded simple-sentencepiece: ${simple-sentencepiece_URL}")
23+
set(simple-sentencepiece_URL2)
24+
break()
25+
endif()
26+
endforeach()
27+
28+
set(SBPE_ENABLE_TESTS OFF CACHE BOOL "" FORCE)
29+
set(SBPE_BUILD_PYTHON OFF CACHE BOOL "" FORCE)
30+
31+
FetchContent_Declare(simple-sentencepiece
32+
URL
33+
${simple-sentencepiece_URL}
34+
${simple-sentencepiece_URL2}
35+
URL_HASH
36+
${simple-sentencepiece_HASH}
37+
)
38+
39+
FetchContent_GetProperties(simple-sentencepiece)
40+
if(NOT simple-sentencepiece_POPULATED)
41+
message(STATUS "Downloading simple-sentencepiece ${simple-sentencepiece_URL}")
42+
FetchContent_Populate(simple-sentencepiece)
43+
endif()
44+
message(STATUS "simple-sentencepiece is downloaded to ${simple-sentencepiece_SOURCE_DIR}")
45+
add_subdirectory(${simple-sentencepiece_SOURCE_DIR} ${simple-sentencepiece_BINARY_DIR} EXCLUDE_FROM_ALL)
46+
47+
target_include_directories(ssentencepiece_core
48+
PUBLIC
49+
${simple-sentencepiece_SOURCE_DIR}/
50+
)
51+
52+
if(SHERPA_ONNX_ENABLE_PYTHON AND WIN32)
53+
install(TARGETS ssentencepiece_core DESTINATION ..)
54+
else()
55+
install(TARGETS ssentencepiece_core DESTINATION lib)
56+
endif()
57+
58+
if(WIN32 AND BUILD_SHARED_LIBS)
59+
install(TARGETS ssentencepiece_core DESTINATION bin)
60+
endif()
61+
endfunction()
62+
63+
download_simple_sentencepiece()

kotlin-api-examples/run.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ function testSpeakerEmbeddingExtractor() {
6060
function testOnlineAsr() {
6161
if [ ! -f ./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt ]; then
6262
git lfs install
63-
git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21
63+
GIT_CLONE_PROTECTION_ACTIVE=false git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21
6464
fi
6565

6666
if [ ! -f ./sherpa-onnx-nemo-streaming-fast-conformer-ctc-en-80ms/tokens.txt ]; then

mfc-examples/NonStreamingSpeechRecognition/sherpa-onnx-deps.props

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
piper_phonemize.lib;
1919
espeak-ng.lib;
2020
ucd.lib;
21+
ssentencepiece_core.lib;
2122
</SherpaOnnxLibraries>
2223
</PropertyGroup>
2324
<ItemDefinitionGroup>

mfc-examples/NonStreamingTextToSpeech/sherpa-onnx-deps.props

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
piper_phonemize.lib;
1919
espeak-ng.lib;
2020
ucd.lib;
21+
ssentencepiece_core.lib;
2122
</SherpaOnnxLibraries>
2223
</PropertyGroup>
2324
<ItemDefinitionGroup>

mfc-examples/StreamingSpeechRecognition/sherpa-onnx-deps.props

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
piper_phonemize.lib;
1919
espeak-ng.lib;
2020
ucd.lib;
21+
ssentencepiece_core.lib;
2122
</SherpaOnnxLibraries>
2223
</PropertyGroup>
2324
<ItemDefinitionGroup>

python-api-examples/offline-decode-files.py

+27-5
Original file line numberDiff line numberDiff line change
@@ -110,11 +110,9 @@ def get_args():
110110
type=str,
111111
default="",
112112
help="""
113-
The file containing hotwords, one words/phrases per line, and for each
114-
phrase the bpe/cjkchar are separated by a space. For example:
115-
116-
▁HE LL O ▁WORLD
117-
你 好 世 界
113+
The file containing hotwords, one word or phrase per line, for example:
114+
HELLO WORLD
115+
你好世界
118116
""",
119117
)
120118

@@ -128,6 +126,28 @@ def get_args():
128126
""",
129127
)
130128

129+
parser.add_argument(
130+
"--modeling-unit",
131+
type=str,
132+
default="",
133+
help="""
134+
The modeling unit of the model. Valid values are cjkchar, bpe, and cjkchar+bpe.
135+
Used only when hotwords-file is given.
136+
""",
137+
)
138+
139+
parser.add_argument(
140+
"--bpe-vocab",
141+
type=str,
142+
default="",
143+
help="""
144+
The path to the bpe vocabulary. The bpe vocabulary is generated by
145+
sentencepiece; you can also export the bpe vocabulary from a bpe model
146+
with `scripts/export_bpe_vocab.py`. Used only when hotwords-file is given
147+
and modeling-unit is bpe or cjkchar+bpe.
148+
""",
149+
)
150+
131151
parser.add_argument(
132152
"--encoder",
133153
default="",
@@ -347,6 +367,8 @@ def main():
347367
decoding_method=args.decoding_method,
348368
hotwords_file=args.hotwords_file,
349369
hotwords_score=args.hotwords_score,
370+
modeling_unit=args.modeling_unit,
371+
bpe_vocab=args.bpe_vocab,
350372
blank_penalty=args.blank_penalty,
351373
debug=args.debug,
352374
)

python-api-examples/online-decode-files.py

+27-5
Original file line numberDiff line numberDiff line change
@@ -198,11 +198,9 @@ def get_args():
198198
type=str,
199199
default="",
200200
help="""
201-
The file containing hotwords, one words/phrases per line, and for each
202-
phrase the bpe/cjkchar are separated by a space. For example:
203-
204-
▁HE LL O ▁WORLD
205-
你 好 世 界
201+
The file containing hotwords, one word or phrase per line, for example:
202+
HELLO WORLD
203+
你好世界
206204
""",
207205
)
208206

@@ -216,6 +214,28 @@ def get_args():
216214
""",
217215
)
218216

217+
parser.add_argument(
218+
"--modeling-unit",
219+
type=str,
220+
default="",
221+
help="""
222+
The modeling unit of the model. Valid values are cjkchar, bpe, and cjkchar+bpe.
223+
Used only when hotwords-file is given.
224+
""",
225+
)
226+
227+
parser.add_argument(
228+
"--bpe-vocab",
229+
type=str,
230+
default="",
231+
help="""
232+
The path to the bpe vocabulary. The bpe vocabulary is generated by
233+
sentencepiece; you can also export the bpe vocabulary from a bpe model
234+
with `scripts/export_bpe_vocab.py`. Used only when hotwords-file is given
235+
and modeling-unit is bpe or cjkchar+bpe.
236+
""",
237+
)
238+
219239
parser.add_argument(
220240
"--blank-penalty",
221241
type=float,
@@ -302,6 +322,8 @@ def main():
302322
lm_scale=args.lm_scale,
303323
hotwords_file=args.hotwords_file,
304324
hotwords_score=args.hotwords_score,
325+
modeling_unit=args.modeling_unit,
326+
bpe_vocab=args.bpe_vocab,
305327
blank_penalty=args.blank_penalty,
306328
)
307329
elif args.zipformer2_ctc:

0 commit comments

Comments
 (0)