1
+ // Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
2
+ const mic = require ( 'mic' ) ; // It uses `mic` for better compatibility, do check its [npm](https://www.npmjs.com/package/mic) before running it.
3
+ const sherpa_onnx = require ( 'sherpa-onnx' ) ;
4
+
5
/**
 * Build the streaming (online) recognizer used by this example.
 *
 * The three model files (`encoder.int8.onnx`, `decoder.int8.onnx`,
 * `tokens.txt`) are looked up inside `modelDir`.
 *
 * @param {string} [modelDir] - Directory holding the Paraformer model files.
 *   Defaults to the directory this example has always used, so existing
 *   zero-argument callers are unaffected.
 * @returns {object} The value returned by `sherpa_onnx.createOnlineRecognizer`.
 */
function createOnlineRecognizer(
  modelDir = './sherpa-onnx-streaming-paraformer-bilingual-zh-en',
) {
  const onlineParaformerModelConfig = {
    encoder: `${modelDir}/encoder.int8.onnx`,
    decoder: `${modelDir}/decoder.int8.onnx`,
  };

  const onlineModelConfig = {
    paraformer: onlineParaformerModelConfig,
    tokens: `${modelDir}/tokens.txt`,
  };

  // Endpoint-detection settings are passed through unchanged.
  // NOTE(review): the silence/length values are presumably in seconds —
  // confirm against the sherpa-onnx endpointing documentation.
  const recognizerConfig = {
    modelConfig: onlineModelConfig,
    enableEndpoint: 1,
    rule1MinTrailingSilence: 2.4,
    rule2MinTrailingSilence: 1.2,
    rule3MinUtteranceLength: 20,
  };

  return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
}
26
+
27
/**
 * SpeechSession keeps the state of one recognition session: the sentences
 * finalized so far, the text currently being recognized, and the time of the
 * last text update. `formatOutput` renders this state.
 *
 * Sample output:
 *
 *   === Automated Speech Recognition ===
 *   Current Session #1
 *   Time: 8:44:46 PM
 *   ------------------------
 *   Recognized Sentences:
 *   [8:44:43 PM] 1. it's so great three result is great great 她还支持中文
 *   [8:44:46 PM] 2. 很厉
 *   ------------------------
 *   Recognizing: 真的很厉害太厉害
 */
class SpeechSession {
  /**
   * @param {object} [options]
   * @param {number} [options.silenceTimeoutMs=10000] - Milliseconds without a
   *   text update after which `shouldStartNewSession()` reports true.
   */
  constructor({ silenceTimeoutMs = 10000 } = {}) {
    this.startTime = Date.now();
    this.sentences = [];
    this.currentText = '';
    this.lastUpdateTime = Date.now();
    this.silenceTimeoutMs = silenceTimeoutMs;
  }

  /**
   * Replace the in-progress text and record the time of the update.
   *
   * @param {string} text - Latest partial recognition result.
   */
  addOrUpdateText(text) {
    this.currentText = text;
    this.lastUpdateTime = Date.now();
  }

  /**
   * Move the in-progress text into `sentences` (trimmed, stamped with the
   * current local time) and clear it. Whitespace-only text is discarded.
   */
  finalizeSentence() {
    const trimmed = this.currentText.trim();
    if (trimmed) {
      this.sentences.push({
        text: trimmed,
        timestamp: new Date().toLocaleTimeString(),
      });
    }
    this.currentText = '';
  }

  /**
   * @returns {boolean} True when more than `silenceTimeoutMs` milliseconds
   *   have elapsed since the last text update.
   */
  shouldStartNewSession() {
    return Date.now() - this.lastUpdateTime > this.silenceTimeoutMs;
  }
}
68
+
69
/**
 * Redraw the console view for the current session.
 *
 * Reads the module-level `currentSession` and `sessionCount`, clears the
 * terminal through `clearConsole()`, then prints the header, the finalized
 * sentences (if any) and the text currently being recognized (if any).
 */
function formatOutput() {
  clearConsole();
  console.log('\n=== Automated Speech Recognition ===');
  console.log(`Current Session #${sessionCount}`);
  console.log('Time:', new Date().toLocaleTimeString());
  console.log('------------------------');

  // Show the sentences finalized so far, numbered from 1.
  if (currentSession.sentences.length > 0) {
    console.log('Recognized Sentences:');
    for (const [index, sentence] of currentSession.sentences.entries()) {
      // No padding inside the brackets or before the period, matching the
      // documented sample output: "[8:44:43 PM] 1. some text".
      console.log(`[${sentence.timestamp}] ${index + 1}. ${sentence.text}`);
    }
    console.log('------------------------');
  }

  // Show the text that is still being recognized.
  if (currentSession.currentText) {
    console.log('Recognizing:', currentSession.currentText);
  }
}
90
+
91
+
92
// Session bookkeeping read by formatOutput() and updated by the audio handler:
// a 1-based session counter and the session currently being filled.
let sessionCount = 1;
let currentSession = new SpeechSession();

// One recognizer and one stream created from it are shared by the whole script.
const recognizer = createOnlineRecognizer();
const stream = recognizer.createStream();
96
+
97
/**
 * Clear the terminal by writing ANSI escape sequences to stdout.
 */
function clearConsole() {
  // ESC[2J erases the display; ESC[0f moves the cursor to the home position.
  const eraseAndHome = '\x1B[2J\x1B[0f';
  process.stdout.write(eraseAndHome);
}
100
+
101
+
102
/**
 * Shared handler for process exit, termination signals and uncaught
 * exceptions. It is registered through `bind`, so `options` is fixed at
 * registration time and `exitCode` is whatever the event passes first: the
 * exit code for 'exit', the signal name for signals, or the thrown value for
 * 'uncaughtException'.
 *
 * @param {{cleanup?: boolean, exit?: boolean}} options
 *   `cleanup`: stop the microphone and free the stream and recognizer.
 *   `exit`: terminate the process after handling the event.
 * @param {number|string|Error} [exitCode] - First argument of the event.
 */
function exitHandler(options, exitCode) {
  if (options.cleanup) {
    console.log('\nCleaned up resources...');
    micInstance.stop();
    stream.free();
    recognizer.free();
  }

  // An uncaught exception arrives here as an Error. Report it as an error and
  // exit with a failure status; a bare process.exit() would exit with
  // process.exitCode (0 unless set), hiding the crash from the caller.
  if (exitCode instanceof Error) {
    console.error('Uncaught exception:', exitCode);
    if (options.exit) process.exit(1);
    return;
  }

  if (exitCode || exitCode === 0) console.log('Exit code:', exitCode);
  if (options.exit) process.exit();
}
112
+
113
// Capture settings handed to the `mic` package: mono, 16-bit signed raw PCM
// at the sample rate the recognizer's feature config reports.
const micOptions = {
  rate: recognizer.config.featConfig.sampleRate,
  channels: 1,
  debug: false, // turn off debug output
  device: 'default',
  bitwidth: 16,
  encoding: 'signed-integer',
  exitOnSilence: 6,
  fileType: 'raw',
};

const micInstance = mic(micOptions);

// Stream on which the captured audio and lifecycle events are delivered.
const micInputStream = micInstance.getAudioStream();
125
+
126
/**
 * Start the microphone and wait until it reports that capture has begun.
 *
 * @returns {Promise<void>} Resolves on the stream's 'startComplete' event;
 *   rejects with the error if the stream emits 'error' first.
 */
function startMic() {
  return new Promise((resolve, reject) => {
    // Each handler detaches its counterpart when it fires. Otherwise the
    // one-shot 'error' listener would outlive a successful start and report
    // the first later stream error as a "start error".
    const onStarted = () => {
      micInputStream.removeListener('error', onStartError);
      console.log('Mic phone started.');
      resolve();
    };

    const onStartError = (err) => {
      micInputStream.removeListener('startComplete', onStarted);
      console.error('Mic phone start error:', err);
      reject(err);
    };

    micInputStream.once('startComplete', onStarted);
    micInputStream.once('error', onStartError);

    micInstance.start();
  });
}
141
+
142
/**
 * Convert a chunk of raw 16-bit signed PCM bytes into float samples by
 * dividing each sample by 32768.
 *
 * The bytes are read through the Buffer itself rather than through
 * `new Int16Array(buffer.buffer)`: a Buffer can be a view into a larger
 * ArrayBuffer, so the raw ArrayBuffer may contain bytes that do not belong to
 * this chunk and may start at a different offset.
 *
 * NOTE(review): assumes little-endian samples (believed to be the `mic`
 * default) — confirm if an `endian` option is ever set. A trailing odd byte,
 * not expected for 16-bit PCM, is ignored.
 *
 * @param {Buffer} buffer - Raw PCM bytes for one chunk.
 * @returns {Float32Array} One float per 16-bit sample.
 */
function pcm16ToFloat32(buffer) {
  const sampleCount = Math.floor(buffer.length / 2);
  const samples = new Float32Array(sampleCount);

  for (let i = 0; i < sampleCount; i++) {
    samples[i] = buffer.readInt16LE(i * 2) / 32768.0;
  }

  return samples;
}

// Feed every captured chunk to the recognizer and refresh the console view.
micInputStream.on('data', (buffer) => {
  const samples = pcm16ToFloat32(buffer);

  stream.acceptWaveform(recognizer.config.featConfig.sampleRate, samples);

  // Decode for as long as the recognizer reports the stream as ready.
  while (recognizer.isReady(stream)) {
    recognizer.decode(stream);
  }

  const isEndpoint = recognizer.isEndpoint(stream);
  const text = recognizer.getResult(stream).text;

  if (text.length > 0) {
    // Check whether a new session should be started.
    if (currentSession.shouldStartNewSession()) {
      currentSession.finalizeSentence();
      sessionCount++;
      currentSession = new SpeechSession();
    }

    currentSession.addOrUpdateText(text);
    formatOutput();
  }

  if (isEndpoint) {
    if (text.length > 0) {
      currentSession.finalizeSentence();
      formatOutput();
    }
    // Reset the stream at every endpoint, whether or not text was produced.
    recognizer.reset(stream);
  }
});
179
+
180
// Report any error emitted by the audio stream.
micInputStream.on('error', (streamError) => {
  console.error('Audio stream error:', streamError);
});

// Log when the audio stream closes.
micInputStream.on('close', () => {
  console.log('Mic phone closed.');
});
187
+
188
+ process . on ( 'exit' , exitHandler . bind ( null , { cleanup : true } ) ) ;
189
+ process . on ( 'SIGINT' , exitHandler . bind ( null , { exit : true } ) ) ;
190
+ process . on ( 'SIGUSR1' , exitHandler . bind ( null , { exit : true } ) ) ;
191
+ process . on ( 'SIGUSR2' , exitHandler . bind ( null , { exit : true } ) ) ;
192
+ process . on ( 'uncaughtException' , exitHandler . bind ( null , { exit : true } ) ) ;
193
+
194
/**
 * Entry point: start the microphone, then draw the initial screen.
 * Any failure during start-up is logged and ends the process with status 1.
 */
async function main() {
  const bootstrap = async () => {
    console.log('Starting ...');
    await startMic();
    console.log('Initialized, waiting for speech ...');
    formatOutput();
  };

  await bootstrap().catch((failure) => {
    console.error('Failed to initialize:', failure);
    process.exit(1);
  });
}

main();
0 commit comments