This repository was archived by the owner on Oct 31, 2023. It is now read-only.

Commit e4d35c5: "add missing files"
Author: Sewon Min
Parent: 527c02c
17 files changed: +1367, -53 lines

.gitignore (+2, -3)

```diff
@@ -5,19 +5,18 @@
 *vscode*
 Makefile
 *tmp*
-*.txt
 *.html
 *.out
 *.err
 *.log
 *.json
 *.npy
+my*
 task_data
 core
 data
 save
 corpus
+train_corpus
 deleted_files
-preprocess
-
```

README.md (+19, -13)

````diff
@@ -16,6 +16,8 @@ This repo contains the original implementation of the paper "[Nonparametric Mask
 
 Models are available from Huggingface Hub:hugs:! Check out [**npm**](https://huggingface.co/facebook/npm) (for phrase retrieval) and [**npm-single**](https://huggingface.co/facebook/npm-single) (for token retrieval).
 
+**We are working on a simple demo where you can simply download all the resources and deploy on your machine. Stay tuned!**
+
 ### Updates
 * **01/02/2023**: The code for training is released. See [train.md](train.md) for instructions.
 * **12/22/2022**: The code for inference is released. Stay tuned for the code for training.
@@ -85,8 +87,8 @@ python -m scripts.prompt \
 
 ```bash
 # To run on AGN, Yahoo and RTE:
-bash scripts/save_embeddings.sh npm enwiki-0 false 384
-bash scripts/save_embeddings.sh npm cc_news false 384
+bash scripts/save_embeddings.sh npm enwiki-0 false 320
+bash scripts/save_embeddings.sh npm cc_news false 320
 python -m scripts.prompt \
 --corpus_data enwiki-0+cc_news \
 --checkpoint_path npm \
@@ -95,7 +97,7 @@ python -m scripts.prompt \
 --save_dir save/npm
 
 # To run on Subj:
-bash scripts/save_embeddings.sh npm subj false 384
+bash scripts/save_embeddings.sh npm subj false 320
 python -m scripts.prompt \
 --corpus_data subj \
 --checkpoint_path npm \
@@ -104,8 +106,8 @@ python -m scripts.prompt \
 --save_dir save/npm
 
 # To run on SST-2, MR, RT, CR and Amazon:
-bash scripts/save_embeddings.sh npm imdb false 384
-bash scripts/save_embeddings.sh npm amazon false 384
+bash scripts/save_embeddings.sh npm imdb false 320
+bash scripts/save_embeddings.sh npm amazon false 320
 python -m scripts.prompt \
 --corpus_data imdb+amazon \
 --checkpoint_path npm \
@@ -114,14 +116,19 @@ python -m scripts.prompt \
 --save_dir save/npm
 ```
 
-Note that `scripts/save_embeddings.sh` takes 'model name', 'corpus name', 'whether it is an open-set task' and `batch size` (`384` is good for a 32gb GPU) as arguments. Embeddings are saved under `save/{model_name}/dstore`.
+Note that `scripts/save_embeddings.sh` takes
+- model name (npm or npm-single)
+- corpus name
+- whether it is an open-set task (true or false)
+- batch size (`320` is good for a 32gb GPU; if `trainer.precision=16` is used, `400` is good for a 32gb GPU)
+as arguments. Embeddings are saved under `save/{model_name}/dstore`.
 
 #### NPM Single on closed-set tasks
 
 ```bash
 # To run on AGN, Yahoo and RTE:
-bash scripts/save_embeddings.sh npm-single enwiki-0 false 384
-bash scripts/save_embeddings.sh npm-single cc_news false 384
+bash scripts/save_embeddings.sh npm-single enwiki-0 false 320
+bash scripts/save_embeddings.sh npm-single cc_news false 320
 python -m scripts.prompt \
 --corpus_data enwiki-0+cc_news \
 --checkpoint_path npm-single \
@@ -130,9 +137,8 @@ python -m scripts.prompt \
 --single \
 --save_dir save/npm-single
 
-
 # To run on Subj:
-bash scripts/save_embeddings.sh npm-single subj false 384
+bash scripts/save_embeddings.sh npm-single subj false 320
 python -m scripts.prompt \
 --corpus_data subj \
 --checkpoint_path npm-single \
@@ -142,8 +148,8 @@ python -m scripts.prompt \
 --save_dir save/npm-single
 
 # To run on SST-2, MR, RT, CR and Amazon:
-bash scripts/save_embeddings.sh npm-single imdb false 384
-bash scripts/save_embeddings.sh npm-single amazon false 384
+bash scripts/save_embeddings.sh npm-single imdb false 320
+bash scripts/save_embeddings.sh npm-single amazon false 320
 python -m scripts.prompt \
 --corpus_data imdb+amazon \
 --checkpoint_path npm-single \
@@ -175,7 +181,7 @@ Please note that running open-set tasks requires around 70GB of RAM and 1.4TB of
 ```bash
 # Note that this can be executed in parallel with up to 20 GPUs. In total, it takes about 10 GPU hours and 1.4TB of disk memory.
 for i in {0..19} ; do
-bash scripts/save_embeddings.sh npm enwiki-${i} true 384
+bash scripts/save_embeddings.sh npm enwiki-${i} true 320
 done
 
 # Loading the model takes about 40min, and 70GB of RAM (specify `--keep_uint8` to reduce RAM usage to 40GB which increases the model loading time to 60-80min).
````
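For readers who want to embed several corpora in a row, the four positional arguments of `scripts/save_embeddings.sh` described in the README above can be driven from a small Python wrapper. This is an illustrative sketch, not part of the repo; the argument order and values are taken directly from the README.

```python
import subprocess

# Illustrative wrapper (not part of this repo): calls scripts/save_embeddings.sh
# with the four positional arguments described in the README:
#   model name, corpus name, open-set flag, batch size.
model = "npm"          # or "npm-single"
open_set = "false"     # closed-set tasks
batch_size = "320"     # per the README, 320 fits a 32gb GPU (400 with trainer.precision=16)

for corpus in ["enwiki-0", "cc_news", "subj", "imdb", "amazon"]:
    subprocess.run(
        ["bash", "scripts/save_embeddings.sh", model, corpus, open_set, batch_size],
        check=True,
    )
```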

config/roberta_stopwords.txt (+211, new file)

The new file lists 211 RoBERTa vocabulary IDs (one ID per line in the original file) that are treated as stopwords:

```
4 5 6 7 8 9 10 11 12 13 14 15 16 19 21
22 23 24 25 28 30 31 32 33 34 35 36 37 39 40
41 42 43 45 47 49 50 51 52 53 54 55 56 57 58
59 61 62 63 64 66 68 69 70 71 73 77 79 81 84
87 88 89 95 97 98 99 103 106 108 109 110 111 113 114
116 122 123 127 128 129 131 136 137 141 142 143 144 145 147
148 149 150 159 160 162 167 172 182 197 207 209 215 218 222
223 227 258 259 276 308 328 349 350 351 359 367 385 399 454
456 473 475 479 519 524 579 596 608 617 630 646 683 742 769
787 849 874 938 939 947 965 1003 1009 1021 1039 1065 1215 1235 1423
1495 1589 1629 1640 1705 1721 1979 2025 2055 2156 2185 2220 2282 2512 2661
2744 2864 3226 3486 3559 4288 4395 4832 4839 5030 5214 5457 5844 7606 8061
9131 10431 10975 12905 14314 14434 15157 15483 15698 17487 18134 18212 19385 20343 22209
23367 24303 25522 25606 27779 27785 28696 31954 34437 35227 35524 37249 37457 41552 44128
45152
```
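A quick way to see which surface tokens these IDs correspond to is to decode them with a RoBERTa tokenizer. The snippet below is illustrative only; it assumes the IDs index the standard RoBERTa vocabulary (loaded here via `roberta-large`), which is not stated in the commit itself.

```python
from transformers import AutoTokenizer

# Illustrative only: assumes the IDs in config/roberta_stopwords.txt index the
# standard RoBERTa vocabulary (here loaded via roberta-large).
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

with open("config/roberta_stopwords.txt") as f:
    stopword_ids = [int(line.strip()) for line in f if line.strip()]

# Print the first 20 stopword tokens to sanity-check the list.
print(tokenizer.convert_ids_to_tokens(stopword_ids[:20]))
```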

dpr_scale/task/mlm_task.py (+6, -5)

```diff
@@ -105,7 +105,6 @@ def setup(self, stage: str):
             state_dict = checkpoint_dict["state_dict"] if "state_dict" in checkpoint_dict else checkpoint_dict
             self.starting_global_step = checkpoint_dict["global_step"] if "global_step" in checkpoint_dict else 0
             self.load_state_dict(state_dict)
-
             print(f"Loaded state dict from {self.pretrained_checkpoint_path}")
         else:
             self.starting_global_step = 0
@@ -532,17 +531,20 @@ def _get_contrastive_loss(scores, labels, score_mask=score_mask, label_mask=labe
 
 class MaskedLanguageModelingEncodingTask(MaskedLanguageModelingTask):
 
-    def __init__(self, ctx_embeddings_dir, checkpoint_path=None, remove_stopwords=False, **kwargs):
+    def __init__(self, ctx_embeddings_dir, checkpoint_path=None, use_half_precision=True,
+                 remove_stopwords=False, stopwords_dir=None, **kwargs):
         super().__init__(**kwargs)
         self.ctx_embeddings_dir = ctx_embeddings_dir
         self.checkpoint_path = checkpoint_path
+        self.use_half_precision = use_half_precision
         pathlib.Path(ctx_embeddings_dir).mkdir(parents=True, exist_ok=True)
 
         self.remove_stopwords = remove_stopwords
 
         if self.remove_stopwords:
             stopwords = set()
-            stopwords_dir = "/".join(self.checkpoint_path.split("/")[:-3]) + "/config"
+            #assert stopwords_dir is not None
+            stopwords_dir = "/private/home/sewonmin/clean-token-retrieval/config"
             with open(os.path.join(stopwords_dir, "roberta_stopwords.txt")) as f:
                 for line in f:
                     stopwords.add(int(line.strip()))
@@ -572,7 +574,7 @@ def test_step(self, batch, batch_idx):
 
     def test_epoch_end(self, outputs):
         assert self.global_rank==0
-        use_half_precision = True
+        use_half_precision = self.use_half_precision
 
         if not self.ctx_embeddings_dir:
             self.ctx_embeddings_dir = self.trainer.weights_save_path
@@ -597,7 +599,6 @@ def _filter(curr_input_ids, curr_attention_mask, curr_is_valid, curr_hidden_stat
             for i, hidden_states in enumerate(curr_hidden_states):
                 if not curr_is_valid[i]:
                     continue
-                # if the current word is stopword, we don't have to save it
                 if self.remove_stopwords and curr_input_ids[i] in self.stopwords:
                     continue
                 vec.append(hidden_states)
```
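The new `use_half_precision` flag exposes the dtype used when `test_epoch_end` writes the contextual vectors. The sketch below only illustrates the disk-space effect of that choice; the array shape, variable names, and output are hypothetical and are not the repo's actual saving code.

```python
import numpy as np

# Hypothetical illustration of what use_half_precision buys: casting the saved
# contextual vectors to float16 halves the datastore's footprint on disk.
use_half_precision = True
vectors = np.random.randn(8, 1024).astype(np.float32)  # stand-in for hidden states

to_save = vectors.astype(np.float16) if use_half_precision else vectors
print(f"float32: {vectors.nbytes} bytes -> saved: {to_save.nbytes} bytes ({to_save.dtype})")
```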
