|
11 | 11 |
|
12 | 12 | model = gensim.models.Word2Vec.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format
|
13 | 13 |
|
14 |
| -word2vec embeddings start with a header line giving the number of vectors (one per token) and the number of dimensions in the file. This allows gensim to allocate memory |
15 |
| -accordingly for querying the model. Larger dimensions mean more memory is held. Accordingly, this header line has to be inserted into the GloVe |
| 14 | +word2vec embeddings start with a header line giving the number of vectors (one per token) and the number of dimensions in the file. This allows gensim to allocate memory |
| 15 | +accordingly for querying the model. Larger dimensions mean more memory is held. Accordingly, this header line has to be inserted into the GloVe |
16 | 16 | embeddings file.
|
17 | 17 | """
|
18 | 18 |
|
|
29 | 29 |
|
30 | 30 | def glove2word2vec(glove_vector_file, output_model_file):
|
31 | 31 | """Convert GloVe vectors into word2vec C format"""
|
32 |
| - |
| 32 | + |
33 | 33 | def get_info(glove_file_name):
|
34 | 34 | """Return the number of vectors and dimensions in a file in GloVe format."""
|
35 |
| - with smart_open(glove_file_name) as f: |
| 35 | + with smart_open.smart_open(glove_file_name) as f: |
36 | 36 | num_lines = sum(1 for line in f)
|
37 |
| - with smart_open(glove_file_name) as f: |
| 37 | + with smart_open.smart_open(glove_file_name) as f: |
38 | 38 | num_dims = len(f.readline().split()) - 1
|
39 | 39 | return num_lines, num_dims
|
40 |
| - |
| 40 | + |
41 | 41 | def prepend_line(infile, outfile, line):
|
42 |
| - """ |
| 42 | + """ |
43 | 43 | Function to prepend lines using smart_open
|
44 | 44 | """
|
45 |
| - with smart_open.smart_open(infile, ' rb ') as old: |
46 |
| - with smart_open.smart_open(outfile, ' wb ') as new: |
47 |
| - new.write(str(line) + " \n ") |
| 45 | + with smart_open.smart_open(infile, 'rb') as old: |
| 46 | + with smart_open.smart_open(outfile, 'wb') as new: |
| 47 | + new.write(str(line.strip()) + "\n") |
48 | 48 | for line in old:
|
49 | 49 | new.write(line)
|
50 | 50 | return outfile
|
51 |
| - |
52 |
| - num_lines, dims= get_info(glove_vector_file) |
53 |
| - |
54 |
| - logger.info('%d lines with %s dimensions' %(num_lines, dims)) |
55 |
| - |
| 51 | + |
| 52 | + num_lines, dims = get_info(glove_vector_file) |
| 53 | + |
| 54 | + logger.info('%d lines with %s dimensions' % (num_lines, dims)) |
| 55 | + |
56 | 56 | gensim_first_line = "{} {}".format(num_lines, dims)
|
57 |
| - model_file=prepend_line(glove_vector_file, output_model_file, gensim_first_line) |
58 |
| - |
| 57 | + model_file = prepend_line(glove_vector_file, output_model_file, gensim_first_line) |
| 58 | + |
59 | 59 | logger.info('Model %s successfully created !!'%output_model_file)
|
60 |
| - |
61 |
| - # Demo: Loads the newly created glove_model.txt into gensim API. |
62 |
| - model=gensim.models.Word2Vec.load_word2vec_format(model_file, binary=False) #GloVe Model |
63 | 60 |
|
64 |
| - logger.info('Most similar to king are:%s '%model.most_similar(positive=['king'], topn=10)) |
65 |
| - logger.info('Similarity score between woman and man is %s ' %model.similarity('woman', 'man')) |
66 |
| - |
| 61 | + # Demo: Loads the newly created glove_model.txt into gensim API. |
| 62 | + model = gensim.models.Word2Vec.load_word2vec_format(model_file, binary=False) #GloVe Model |
| 63 | + |
| 64 | + logger.info('Most similar to king are: %s' % model.most_similar(positive=['king'], topn=10)) |
| 65 | + logger.info('Similarity score between woman and man is %s ' % model.similarity('woman', 'man')) |
| 66 | + |
67 | 67 | logger.info("Finished running %s", program)
|
68 |
| - |
| 68 | + |
69 | 69 | return model_file
|
70 | 70 |
|
71 | 71 | if __name__ == "__main__":
|
72 |
| - |
| 72 | + |
73 | 73 | glove_vector_file=sys.argv[1]
|
74 | 74 | output_model_file=sys.argv[2]
|
75 | 75 |
|
|
0 commit comments