@@ -37,15 +37,15 @@ struct vocab_word {
 char train_file[MAX_STRING], output_file[MAX_STRING];
 char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
 struct vocab_word *vocab;
-int binary = 0, cbow = 0, debug_mode = 2, window = 5, min_count = 5, num_threads = 1, min_reduce = 1;
+int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
 int *vocab_hash;
 long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
-long long train_words = 0, word_count_actual = 0, file_size = 0, classes = 0;
-real alpha = 0.025, starting_alpha, sample = 0;
+long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
+real alpha = 0.025, starting_alpha, sample = 1e-3;
 real *syn0, *syn1, *syn1neg, *expTable;
 clock_t start;
 
-int hs = 1, negative = 0;
+int hs = 0, negative = 5;
 const int table_size = 1e8;
 int *table;
 
@@ -337,29 +337,32 @@ void ReadVocab() {
 
 void InitNet() {
   long long a, b;
+  unsigned long long next_random = 1;
   a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
   if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
   if (hs) {
     a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
     if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
-    for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++)
+    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1[a * layer1_size + b] = 0;
   }
   if (negative > 0) {
     a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
     if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
-    for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++)
+    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1neg[a * layer1_size + b] = 0;
   }
-  for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++)
-   syn0[a * layer1_size + b] = (rand() / (real)RAND_MAX - 0.5) / layer1_size;
+  for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
+    next_random = next_random * (unsigned long long)25214903917 + 11;
+    syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
+  }
   CreateBinaryTree();
 }
 
 void *TrainModelThread(void *id) {
-  long long a, b, d, word, last_word, sentence_length = 0, sentence_position = 0;
+  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
   long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
-  long long l1, l2, c, target, label;
+  long long l1, l2, c, target, label, local_iter = iter;
   unsigned long long next_random = (long long)id;
   real f, g;
   clock_t now;
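
Note: the InitNet hunk above replaces the libc rand() call for seeding syn0 with the same linear congruential generator (multiplier 25214903917, increment 11) used elsewhere in word2vec.c, making the initialization deterministic and independent of the platform's rand(). Each weight ends up roughly uniform in [-0.5/layer1_size, 0.5/layer1_size). A minimal standalone sketch of that recurrence, not part of the patch (dim stands in for layer1_size):

#include <stdio.h>

/* Same multiplier/increment pair as word2vec.c; keeps the low 16 bits
 * of the state and maps them into roughly [-0.5/dim, 0.5/dim). */
static unsigned long long next_random = 1;

static float init_weight(int dim) {
  next_random = next_random * 25214903917ULL + 11;
  return (((next_random & 0xFFFF) / (float)65536) - 0.5f) / dim;
}

int main(void) {
  for (int i = 0; i < 5; i++) printf("%f\n", init_weight(100));
  return 0;
}
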
@@ -374,11 +377,11 @@ void *TrainModelThread(void *id) {
       if ((debug_mode > 1)) {
         now = clock();
         printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,
-         word_count_actual / (real)(train_words + 1) * 100,
+         word_count_actual / (real)(iter * train_words + 1) * 100,
          word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
         fflush(stdout);
       }
-      alpha = starting_alpha * (1 - word_count_actual / (real)(train_words + 1));
+      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
       if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
     }
     if (sentence_length == 0) {
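
Note: with the new -iter parameter, progress and the learning-rate schedule are measured against iter * train_words instead of a single pass, so alpha decays linearly over all epochs combined and is floored at 1e-4 of its starting value. An illustrative sketch of that schedule, using float in place of the file's real typedef:

/* Linear decay over iter * train_words processed words,
 * floored at 1e-4 of the starting rate, as in the hunk above. */
float learning_rate(float starting_alpha, long long words_done,
                    long long train_words, long long iter) {
  float a = starting_alpha * (1 - words_done / (float)(iter * train_words + 1));
  if (a < starting_alpha * 0.0001f) a = starting_alpha * 0.0001f;
  return a;
}
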
@@ -400,8 +403,16 @@ void *TrainModelThread(void *id) {
       }
       sentence_position = 0;
     }
-    if (feof(fi)) break;
-    if (word_count > train_words / num_threads) break;
+    if (feof(fi) || (word_count > train_words / num_threads)) {
+      word_count_actual += word_count - last_word_count;
+      local_iter--;
+      if (local_iter == 0) break;
+      word_count = 0;
+      last_word_count = 0;
+      sentence_length = 0;
+      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
+      continue;
+    }
     word = sen[sentence_position];
     if (word == -1) continue;
     for (c = 0; c < layer1_size; c++) neu1[c] = 0;
@@ -410,58 +421,63 @@ void *TrainModelThread(void *id) {
     b = next_random % window;
     if (cbow) {  //train the cbow architecture
       // in -> hidden
+      cw = 0;
       for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
         c = sentence_position - window + a;
         if (c < 0) continue;
         if (c >= sentence_length) continue;
         last_word = sen[c];
         if (last_word == -1) continue;
         for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
+        cw++;
       }
-      if (hs) for (d = 0; d < vocab[word].codelen; d++) {
-        f = 0;
-        l2 = vocab[word].point[d] * layer1_size;
-        // Propagate hidden -> output
-        for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
-        if (f <= -MAX_EXP) continue;
-        else if (f >= MAX_EXP) continue;
-        else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
-        // 'g' is the gradient multiplied by the learning rate
-        g = (1 - vocab[word].code[d] - f) * alpha;
-        // Propagate errors output -> hidden
-        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
-        // Learn weights hidden -> output
-        for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
-      }
-      // NEGATIVE SAMPLING
-      if (negative > 0) for (d = 0; d < negative + 1; d++) {
-        if (d == 0) {
-          target = word;
-          label = 1;
-        } else {
-          next_random = next_random * (unsigned long long)25214903917 + 11;
-          target = table[(next_random >> 16) % table_size];
-          if (target == 0) target = next_random % (vocab_size - 1) + 1;
-          if (target == word) continue;
-          label = 0;
+      if (cw) {
+        for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
+        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
+          f = 0;
+          l2 = vocab[word].point[d] * layer1_size;
+          // Propagate hidden -> output
+          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
+          if (f <= -MAX_EXP) continue;
+          else if (f >= MAX_EXP) continue;
+          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
+          // 'g' is the gradient multiplied by the learning rate
+          g = (1 - vocab[word].code[d] - f) * alpha;
+          // Propagate errors output -> hidden
+          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
+          // Learn weights hidden -> output
+          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
+        }
+        // NEGATIVE SAMPLING
+        if (negative > 0) for (d = 0; d < negative + 1; d++) {
+          if (d == 0) {
+            target = word;
+            label = 1;
+          } else {
+            next_random = next_random * (unsigned long long)25214903917 + 11;
+            target = table[(next_random >> 16) % table_size];
+            if (target == 0) target = next_random % (vocab_size - 1) + 1;
+            if (target == word) continue;
+            label = 0;
+          }
+          l2 = target * layer1_size;
+          f = 0;
+          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
+          if (f > MAX_EXP) g = (label - 1) * alpha;
+          else if (f < -MAX_EXP) g = (label - 0) * alpha;
+          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
+          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
+          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
+        }
+        // hidden -> in
+        for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
+          c = sentence_position - window + a;
+          if (c < 0) continue;
+          if (c >= sentence_length) continue;
+          last_word = sen[c];
+          if (last_word == -1) continue;
+          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
         }
-        l2 = target * layer1_size;
-        f = 0;
-        for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
-        if (f > MAX_EXP) g = (label - 1) * alpha;
-        else if (f < -MAX_EXP) g = (label - 0) * alpha;
-        else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
-        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
-        for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
-      }
-      // hidden -> in
-      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
-        c = sentence_position - window + a;
-        if (c < 0) continue;
-        if (c >= sentence_length) continue;
-        last_word = sen[c];
-        if (last_word == -1) continue;
-        for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
       }
     } else {  //train skip-gram
       for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
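
Note: besides the extra nesting, this hunk changes CBOW in two ways: the hidden layer neu1 is now the average of the context vectors (neu1[c] /= cw) instead of their sum, and the update is skipped entirely when no context word falls in the window (cw == 0). The negative-sampling math itself is unchanged: g = (label - sigmoid(f)) * alpha, with the sigmoid read from the precomputed expTable and clipped at +/-MAX_EXP. An illustrative sketch of one such update with an explicit sigmoid; h, v, herr and dim are hypothetical stand-ins for neu1, a row of syn1neg, neu1e and layer1_size:

#include <math.h>

/* One negative-sampling step for a single (context, target) pair:
 * label is 1 for the observed word, 0 for a sampled negative. */
void neg_update(const float *h, float *v, float *herr, int dim, int label, float alpha) {
  float f = 0;
  for (int c = 0; c < dim; c++) f += h[c] * v[c];        /* hidden -> output dot product    */
  float g = (label - 1.0f / (1.0f + expf(-f))) * alpha;  /* (label - sigmoid) * learn rate  */
  for (int c = 0; c < dim; c++) herr[c] += g * v[c];     /* error fed back toward the input */
  for (int c = 0; c < dim; c++) v[c] += g * h[c];        /* update the output vector        */
}
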
@@ -611,7 +627,7 @@ int ArgPos(char *str, int argc, char **argv) {
 int main(int argc, char **argv) {
   int i;
   if (argc == 1) {
-    printf("WORD VECTOR estimation toolkit v 0.1b\n\n");
+    printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
     printf("Options:\n");
     printf("Parameters for training:\n");
     printf("\t-train <file>\n");
@@ -623,18 +639,20 @@ int main(int argc, char **argv) {
     printf("\t-window <int>\n");
     printf("\t\tSet max skip length between words; default is 5\n");
     printf("\t-sample <float>\n");
-    printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency");
-    printf(" in the training data will be randomly down-sampled; default is 0 (off), useful value is 1e-5\n");
+    printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
+    printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
     printf("\t-hs <int>\n");
-    printf("\t\tUse Hierarchical Softmax; default is 1 (0 = not used)\n");
+    printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
     printf("\t-negative <int>\n");
-    printf("\t\tNumber of negative examples; default is 0, common values are 5 - 10 (0 = not used)\n");
+    printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
     printf("\t-threads <int>\n");
-    printf("\t\tUse <int> threads (default 1)\n");
+    printf("\t\tUse <int> threads (default 12)\n");
+    printf("\t-iter <int>\n");
+    printf("\t\tRun more training iterations (default 5)\n");
     printf("\t-min-count <int>\n");
     printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
     printf("\t-alpha <float>\n");
-    printf("\t\tSet the starting learning rate; default is 0.025\n");
+    printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
     printf("\t-classes <int>\n");
     printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
     printf("\t-debug <int>\n");
@@ -646,9 +664,9 @@ int main(int argc, char **argv) {
     printf("\t-read-vocab <file>\n");
     printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
     printf("\t-cbow <int>\n");
-    printf("\t\tUse the continuous bag of words model; default is 0 (skip-gram model)\n");
+    printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n");
     printf("\nExamples:\n");
-    printf("./word2vec -train data.txt -output vec.txt -debug 2 -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1\n\n");
+    printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n");
     return 0;
   }
   output_file[0] = 0;
@@ -661,13 +679,15 @@ int main(int argc, char **argv) {
   if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
   if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
   if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]);
+  if (cbow) alpha = 0.05;
   if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
   if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
   if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
   if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
   if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
   if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
   if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
   if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
   if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
   vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
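
Note: -cbow is parsed before -alpha, so the CBOW default of 0.05 is applied first and an explicit -alpha on the command line still takes effect, e.g. (hypothetical invocation; data.txt and vec.bin are placeholder file names):

./word2vec -train data.txt -output vec.bin -cbow 1 -alpha 0.025 -binary 1 -iter 5
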
@@ -679,4 +699,4 @@ int main(int argc, char **argv) {
   }
   TrainModel();
   return 0;
-}
+}