-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcompute_sense_embeddings.py
80 lines (57 loc) · 2.57 KB
/
compute_sense_embeddings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""Compute sense embeddings
Assumption is that:
a) configs/main.json contains main experiments settings
b) the experiments folder named in configs/main.json contains <exp>/settings.json with the configuration for the experiment
Usage:
compute_sense_embeddings.py --exp=<exp>
Example:
python compute_sense_embeddings.py --exp='synset---se13---semcor'
Options:
-h --help Show this screen.
--exp=<exp> the name of the experiment
"""
import json
import pandas
import os
from docopt import docopt
from collections import defaultdict
import tensorflow as tf
from wsd_class import WsdLstm
# Parse the command line against the usage text in the module docstring.
arguments = docopt(__doc__)

# Global settings shared by all experiments. The context manager closes the
# file handle deterministically instead of leaving it to garbage collection.
with open('configs/main.json') as infile:
    main_config = json.load(infile)

# Settings of the requested experiment, stored as
# experiments_folder/EXP/settings.json where EXP is the --exp argument.
path_exp_config = '%s/%s/settings.json' % (main_config['experiments_folder'],
                                           arguments['--exp'])
with open(path_exp_config) as infile:
    exp_config = json.load(infile)

# meaning -> list of (instance_id, target_index, target_embedding) tuples;
# filled further down while the LSTM input file is processed.
meaning2context_embds = defaultdict(list)
# Run every annotated instance through the LSTM and group the resulting
# target embeddings by their annotation (the meaning).
with tf.Session() as sess:
    # The same session is handed to the wrapper's constructor and to the
    # method that processes the input file.
    wsd_lstm_obj = WsdLstm(
        model_path=main_config['model_path'],
        vocab_path=main_config['vocab_path'],
        sess=sess,
    )
    embedded_instances = wsd_lstm_obj.apply_on_lstm_input_file(
        sess=sess,
        lstm_input_path=exp_config['lstm_input'],
        batch_size=exp_config['batch_size'],
    )

    # Stays 0 when nothing is yielded; otherwise enumerate leaves the
    # running count of processed target embeddings in this name.
    num_target_embeddings = 0
    for num_target_embeddings, record in enumerate(embedded_instances, start=1):
        inst_id, tgt_index, meaning_label, tgt_embedding = record
        meaning2context_embds[meaning_label].append(
            (inst_id, tgt_index, tgt_embedding)
        )
# Load the statistics object pickled at exp_config['annotated_data_stats'];
# it is indexed below as stats[meaning]['total'].
stats = pandas.read_pickle(exp_config['annotated_data_stats'])

# Average all context embeddings of a meaning into one embedding per meaning.
meaning2avg_embedding = dict()
for meaning, embeddings in meaning2context_embds.items():
    expected = stats[meaning]['total']
    found = len(embeddings)
    # Explicit raise instead of `assert`: this validates the collected data
    # against an external file and must not be skipped under `python -O`.
    if expected != found:
        raise ValueError('%s (%s vs %s)' % (meaning, expected, found))

    # Each item is (instance_id, target_index, target_embedding); only the
    # embedding at index 2 takes part in the average.
    summed = sum(embedding[2] for embedding in embeddings)
    average = summed / found

    if found == 1:
        # Internal sanity check: the average of a single embedding must be
        # that embedding itself.
        assert all(average == embeddings[0][2])

    meaning2avg_embedding[meaning] = average
# Pickle the averaged embedding per meaning first, then the per-meaning lists
# of (instance_id, target_index, target_embedding) tuples.
for payload, path_key in ((meaning2avg_embedding, 'meanings_path'),
                          (meaning2context_embds, 'meaning_instances_path')):
    pandas.to_pickle(payload, exp_config[path_key])

# Write a small plain-text summary next to the other experiment output.
sense_embedding_stats_path = os.path.join(
    exp_config['exp_output_folder'],
    'meaning_embeddings_stats.txt',
)
with open(sense_embedding_stats_path, 'w') as outfile:
    outfile.writelines([
        'number of target embeddings: %s\n' % num_target_embeddings,
        'number of meanings: %s\n' % len(meaning2avg_embedding),
    ])