@@ -30,6 +30,8 @@ def __init__(self, lexion_filename: str, tokens_filename: str):
30
30
tones = [int (t ) for t in tones ]
31
31
32
32
lexicon [word_or_phrase ] = (phones , tones )
33
+ lexicon ["呣" ] = lexicon ["母" ]
34
+ lexicon ["嗯" ] = lexicon ["恩" ]
33
35
self .lexicon = lexicon
34
36
35
37
punctuation = ["!" , "?" , "…" , "," , "." , "'" , "-" ]
@@ -98,20 +100,16 @@ def __init__(self, filename):
98
100
self .lang_id = int (meta ["lang_id" ])
99
101
self .sample_rate = int (meta ["sample_rate" ])
100
102
101
- def __call__ (self , x , tones , lang ):
103
+ def __call__ (self , x , tones ):
102
104
"""
103
105
Args:
104
106
x: 1-D int64 torch tensor
105
107
tones: 1-D int64 torch tensor
106
- lang: 1-D int64 torch tensor
107
108
"""
108
109
x = x .unsqueeze (0 )
109
110
tones = tones .unsqueeze (0 )
110
- lang = lang .unsqueeze (0 )
111
111
112
- print (x .shape , tones .shape , lang .shape )
113
- bert = torch .zeros (1 , self .bert_dim , x .shape [- 1 ])
114
- ja_bert = torch .zeros (1 , self .ja_bert_dim , x .shape [- 1 ])
112
+ print (x .shape , tones .shape )
115
113
sid = torch .tensor ([self .speaker_id ], dtype = torch .int64 )
116
114
noise_scale = torch .tensor ([0.6 ], dtype = torch .float32 )
117
115
length_scale = torch .tensor ([1.0 ], dtype = torch .float32 )
@@ -125,9 +123,6 @@ def __call__(self, x, tones, lang):
125
123
"x" : x .numpy (),
126
124
"x_lengths" : x_lengths .numpy (),
127
125
"tones" : tones .numpy (),
128
- "lang_id" : lang .numpy (),
129
- "bert" : bert .numpy (),
130
- "ja_bert" : ja_bert .numpy (),
131
126
"sid" : sid .numpy (),
132
127
"noise_scale" : noise_scale .numpy (),
133
128
"noise_scale_w" : noise_scale_w .numpy (),
@@ -140,34 +135,46 @@ def __call__(self, x, tones, lang):
140
135
def main():
    """Synthesize a mixed Chinese/English demo utterance into ./test.wav.

    Steps:
      1. Load the lexicon from ./lexicon.txt and ./tokens.txt.
      2. Convert three pieces of text to phone/tone id sequences:
         a Chinese sentence, an English phrase, and a second Chinese
         sentence containing polyphonic characters.
      3. Concatenate them, with a single 0 between the first Chinese
         sentence and the English phrase (no separator is inserted
         before the second Chinese sentence).
      4. If the model requests it, interleave 0 around every id.
      5. Run the ONNX model and write the result with the model's
         sample rate.
    """
    lexicon = Lexicon(lexion_filename="./lexicon.txt", tokens_filename="./tokens.txt")

    # First Chinese sentence; jieba segments it into words before lookup.
    text = "永远相信,美好的事情即将发生。"
    phones_zh, tones_zh = lexicon.convert(jieba.cut(text, HMM=True))

    # English words are already whitespace separated, so no segmenter is used.
    en_text = "how are you ?".split()
    phones_en, tones_en = lexicon.convert(en_text)

    # Second Chinese sentence (polyphonic-character test).
    text = "多音字测试, 银行,行不行?长沙长大"
    phones_zh2, tones_zh2 = lexicon.convert(jieba.cut(text, HMM=True))

    # Build fresh lists rather than extending the lists returned by
    # lexicon.convert(), so nothing owned by the lexicon can be mutated.
    # NOTE(review): id 0 is presumably the blank/pad token -- confirm
    # against tokens.txt.
    phones = [*phones_zh, 0, *phones_en, *phones_zh2]
    tones = [*tones_zh, 0, *tones_en, *tones_zh2]

    model = OnnxModel("./model.onnx")

    if model.add_blank:
        # Place the real ids at the odd positions; every even position
        # (including both ends) stays 0.
        blanked_phones = [0] * (2 * len(phones) + 1)
        blanked_tones = [0] * (2 * len(tones) + 1)

        blanked_phones[1::2] = phones
        blanked_tones[1::2] = tones

        phones = blanked_phones
        tones = blanked_tones

    phones = torch.tensor(phones, dtype=torch.int64)
    tones = torch.tensor(tones, dtype=torch.int64)

    print(phones.shape, tones.shape)

    y = model(x=phones, tones=tones)
    sf.write("./test.wav", y, model.sample_rate)
172
179
173
180
0 commit comments