-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_CMU_LM.py
43 lines (31 loc) · 1.26 KB
/
train_CMU_LM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
'''
Created on Dec 6, 2011
@author: akirk
'''
import codecs
import os
import subprocess
def train_lm(filename):
    '''Train a language model from a text corpus using the CMU LM toolkit.

    $filename is the relative path to the input file. Every output is written
    next to the input and shares its stem, i.e. the input's base name cut at
    its first dot:

        <stem>.vocab    vocabulary (text2wfreq | wfreq2vocab)
        <stem>.idngram  id n-gram file (text2idngram)
        <stem>.arpa     ARPA format language model (idngram2lm)
        <stem>.lm.DMP   binary language model (sphinx_lm_convert)
        <stem>.lm.log   stderr of all the commands above

    The commands are run through the shell and their exit statuses do not
    raise; a non-zero status is recorded in the log file and the remaining
    steps are still attempted. Returns None.

    NOTE: the paths are interpolated into shell command lines without
    quoting, so $filename must not contain spaces or shell metacharacters.
    '''
    # Cut the extension from the base name only. Splitting the whole path on
    # "." breaks as soon as a directory component contains a dot ("./x.txt",
    # "../x.txt", "my.dir/x.txt").
    directory, base = os.path.split(filename)
    # A dot-file such as ".corpus" has an empty part before its first dot;
    # fall back to the full base name rather than producing an empty stem.
    name = os.path.join(directory, base.split(".")[0] or base)

    vocab = name + ".vocab"
    idngram = name + ".idngram"
    arpa = name + ".arpa"
    binary = name + ".lm.DMP"

    # Each step consumes the output of the previous one, so order matters.
    steps = (
        ("vocabulary",
         "text2wfreq < %s | wfreq2vocab > %s" % (filename, vocab)),
        ("idngram",
         "text2idngram -vocab %s -idngram %s < %s" % (vocab, idngram,
                                                      filename)),
        ("arpa",
         "idngram2lm -vocab_type 0 -idngram %s -vocab %s -arpa %s"
         % (idngram, vocab, arpa)),
        ("binary",
         "sphinx_lm_convert -i %s -o %s" % (arpa, binary)),
    )

    # The context manager guarantees the log is closed even if a step raises.
    with codecs.open(name + ".lm.log", "w", "iso8859_15") as logfile:
        for label, cmd in steps:
            status = subprocess.call(cmd, shell=True, stderr=logfile)
            if status != 0:
                # Keep the note ASCII-only (no paths) so it can always be
                # encoded, and flush it before the next child process
                # writes to the same file descriptor.
                logfile.write("train_lm: %s step exited with status %d\n"
                              % (label, status))
                logfile.flush()
if __name__ == "__main__":
    # Script entry point: train on the corpus file in the working directory.
    train_lm("parole.lm.input")