Strip the lines before saving

liudong · liudong · commit 6c9431d02dde · 2016-05-20T14:15:07.000+01:00
diff --git a/glove2word2vec.py b/glove2word2vec.py
@@ -11,8 +11,8 @@
 
 model = gensim.models.Word2Vec.load_word2vec_format('/tmp/vectors.txt', binary=False)  # C text format
 
-word2vec embeddings start with a line with the number of lines (tokens?) and the number of dimensions of the file. This allows gensim to allocate memory 
-accordingly for querying the model. Larger dimensions mean larger memory is held captive. Accordingly, this line has to be inserted into the GloVe 
+word2vec embeddings start with a line with the number of lines (tokens?) and the number of dimensions of the file. This allows gensim to allocate memory
+accordingly for querying the model. Larger dimensions mean larger memory is held captive. Accordingly, this line has to be inserted into the GloVe
 embeddings file.
 """
 
@@ -29,47 +29,47 @@
 
 def glove2word2vec(glove_vector_file, output_model_file):
     """Convert GloVe vectors into word2vec C format"""
-    
+
     def get_info(glove_file_name):
         """Return the number of vectors and dimensions in a file in GloVe format."""
-        with smart_open(glove_file_name) as f:
+        with smart_open.smart_open(glove_file_name) as f:
             num_lines = sum(1 for line in f)
-        with smart_open(glove_file_name) as f:
+        with smart_open.smart_open(glove_file_name) as f:
             num_dims = len(f.readline().split()) - 1
         return num_lines, num_dims
-    
+
     def prepend_line(infile, outfile, line):
-        """ 
+        """
         Function to prepend lines using smart_open
         """
-        with smart_open.smart_open(infile, ' rb ') as old:
-            with smart_open.smart_open(outfile, ' wb ') as new:
-                new.write(str(line) + " \n ")
+        with smart_open.smart_open(infile, 'rb') as old:
+            with smart_open.smart_open(outfile, 'wb') as new:
+                new.write(str(line.strip()) + "\n")
                 for line in old:
                     new.write(line)
         return outfile
-        
-    num_lines, dims= get_info(glove_vector_file)
-    
-    logger.info('%d lines with %s dimensions' %(num_lines, dims))
-    
+
+    num_lines, dims = get_info(glove_vector_file)
+
+    logger.info('%d lines with %s dimensions' % (num_lines, dims))
+
     gensim_first_line = "{} {}".format(num_lines, dims)
-    model_file=prepend_line(glove_vector_file, output_model_file, gensim_first_line)
-    
+    model_file = prepend_line(glove_vector_file, output_model_file, gensim_first_line)
+
     logger.info('Model %s successfully created !!'%output_model_file)
-    
-    # Demo: Loads the newly created glove_model.txt into gensim API.  
-    model=gensim.models.Word2Vec.load_word2vec_format(model_file, binary=False) #GloVe Model  
 
-    logger.info('Most similar to king are:%s '%model.most_similar(positive=['king'], topn=10))
-    logger.info('Similarity score between woman and man is %s ' %model.similarity('woman', 'man'))
-    
+    # Demo: Loads the newly created glove_model.txt into gensim API.
+    model = gensim.models.Word2Vec.load_word2vec_format(model_file, binary=False) #GloVe Model
+
+    logger.info('Most similar to king are: %s' % model.most_similar(positive=['king'], topn=10))
+    logger.info('Similarity score between woman and man is %s ' % model.similarity('woman', 'man'))
+
     logger.info("Finished running %s", program)
-    
+
     return model_file
 
 if __name__ == "__main__":
-    
+
     glove_vector_file=sys.argv[1]
     output_model_file=sys.argv[2]