Skip to content

Commit 6c9431d

Browse files
committed
Strip the lines before saving
1 parent 9536307 commit 6c9431d

File tree

1 file changed

+25
-25
lines changed

1 file changed

+25
-25
lines changed

glove2word2vec.py

+25-25
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
1212
model = gensim.models.Word2Vec.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format
1313
14-
word2vec embeddings start with a line with the number of lines (tokens?) and the number of dimensions of the file. This allows gensim to allocate memory
15-
accordingly for querying the model. Larger dimensions mean larger memory is held captive. Accordingly, this line has to be inserted into the GloVe
14+
word2vec embeddings start with a line with the number of lines (tokens?) and the number of dimensions of the file. This allows gensim to allocate memory
15+
accordingly for querying the model. Larger dimensions mean larger memory is held captive. Accordingly, this line has to be inserted into the GloVe
1616
embeddings file.
1717
"""
1818

@@ -29,47 +29,47 @@
2929

3030
def glove2word2vec(glove_vector_file, output_model_file):
3131
"""Convert GloVe vectors into word2vec C format"""
32-
32+
3333
def get_info(glove_file_name):
3434
"""Return the number of vectors and dimensions in a file in GloVe format."""
35-
with smart_open(glove_file_name) as f:
35+
with smart_open.smart_open(glove_file_name) as f:
3636
num_lines = sum(1 for line in f)
37-
with smart_open(glove_file_name) as f:
37+
with smart_open.smart_open(glove_file_name) as f:
3838
num_dims = len(f.readline().split()) - 1
3939
return num_lines, num_dims
40-
40+
4141
def prepend_line(infile, outfile, line):
42-
"""
42+
"""
4343
Function to prepend lines using smart_open
4444
"""
45-
with smart_open.smart_open(infile, ' rb ') as old:
46-
with smart_open.smart_open(outfile, ' wb ') as new:
47-
new.write(str(line) + " \n ")
45+
with smart_open.smart_open(infile, 'rb') as old:
46+
with smart_open.smart_open(outfile, 'wb') as new:
47+
new.write(str(line.strip()) + "\n")
4848
for line in old:
4949
new.write(line)
5050
return outfile
51-
52-
num_lines, dims= get_info(glove_vector_file)
53-
54-
logger.info('%d lines with %s dimensions' %(num_lines, dims))
55-
51+
52+
num_lines, dims = get_info(glove_vector_file)
53+
54+
logger.info('%d lines with %s dimensions' % (num_lines, dims))
55+
5656
gensim_first_line = "{} {}".format(num_lines, dims)
57-
model_file=prepend_line(glove_vector_file, output_model_file, gensim_first_line)
58-
57+
model_file = prepend_line(glove_vector_file, output_model_file, gensim_first_line)
58+
5959
logger.info('Model %s successfully created !!'%output_model_file)
60-
61-
# Demo: Loads the newly created glove_model.txt into gensim API.
62-
model=gensim.models.Word2Vec.load_word2vec_format(model_file, binary=False) #GloVe Model
6360

64-
logger.info('Most similar to king are:%s '%model.most_similar(positive=['king'], topn=10))
65-
logger.info('Similarity score between woman and man is %s ' %model.similarity('woman', 'man'))
66-
61+
# Demo: Loads the newly created glove_model.txt into gensim API.
62+
model = gensim.models.Word2Vec.load_word2vec_format(model_file, binary=False) #GloVe Model
63+
64+
logger.info('Most similar to king are: %s' % model.most_similar(positive=['king'], topn=10))
65+
logger.info('Similarity score between woman and man is %s ' % model.similarity('woman', 'man'))
66+
6767
logger.info("Finished running %s", program)
68-
68+
6969
return model_file
7070

7171
if __name__ == "__main__":
72-
72+
7373
glove_vector_file=sys.argv[1]
7474
output_model_file=sys.argv[2]
7575

0 commit comments

Comments
 (0)