This repository was archived by the owner on Oct 31, 2023. It is now read-only.

Commit 4680235
Author: Sewon Min
Commit message: minor fixes
Parent: 1be566a

File tree: 9 files changed, +56 -101 lines

dpr_scale/models/hf_encoder.py (+2 -2)

@@ -10,7 +10,7 @@
 from dpr_scale.utils.utils import PathManager

 # @manual=//python/wheel/transformers3:transformers3
-from transformers import AutoModelForMaskedLM, AutoConfig
+from transformers import RobertaForMaskedLM, AutoModelForMaskedLM, AutoConfig

 class Encoder(nn.Module):
     def __init__(
@@ -40,7 +40,7 @@ def __init__(
             self.transformer = AutoModelForMaskedLM.from_pretrained(local_model_path, config=cfg)
             print ("Initializing from", local_model_path)
         else:
-            self.transformer = AutoModelForMaskedLM.from_pretrained(config=cfg)
+            self.transformer = RobertaForMaskedLM(config=cfg)

     def forward(self, tokens):
         return self.transformer(**tokens, return_dict=True)
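The fallback branch previously called `AutoModelForMaskedLM.from_pretrained(config=cfg)` without a model name or path, which is not a valid call; the fix builds a freshly initialized `RobertaForMaskedLM` from the config instead. A minimal sketch of the two initialization paths, assuming a standard `roberta-base` config (the model name here is illustrative, not part of the commit):

```python
from transformers import AutoConfig, AutoModelForMaskedLM, RobertaForMaskedLM

cfg = AutoConfig.from_pretrained("roberta-base")

# Load pretrained weights: from_pretrained needs a model name or a local path.
pretrained = AutoModelForMaskedLM.from_pretrained("roberta-base", config=cfg)

# Build the same architecture with randomly initialized weights (no checkpoint required).
scratch = RobertaForMaskedLM(cfg)
```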

npm/dstore.py (+16 -6)

@@ -51,6 +51,7 @@ def __init__(self,
                  probe=8,
                  num_keys_to_add_at_a_time=1000000,
                  remove_stopwords=False,
+                 remove_stopwords_except_k=None,
                  restricted=None,
                  consider_string_boundary=True,
                  cuda=True,
@@ -64,7 +65,7 @@ def __init__(self,
             if model_dir is not None:
                 model_dir = os.path.join(model_dir, setting)
         elif setting in ["enwiki", "enwiki-2022"]:
-            assert remove_stopwords, remove_stopwords
+            assert remove_stopwords or remove_stopwords_except_k
             data_path=[os.path.join(base_dir, setting, "{}.npy".format(idx)) for idx in range(20)]
             if model_dir is not None:
                 model_dir=[os.path.join(model_dir, "{}-{}".format(setting, idx)) for idx in range(20)]
@@ -76,13 +77,16 @@ def __init__(self,
         else:
             raise NotImplementedError(setting)

+        assert not (remove_stopwords and remove_stopwords_except_k)
+
         self.setting = setting
         self.dimension = dimension
         self.ncentroids = ncentroids
         self.code_size = code_size
         self.probe = probe
         self.num_keys_to_add_at_a_time = num_keys_to_add_at_a_time
         self.remove_stopwords = remove_stopwords
+        self.remove_stopwords_except_k = remove_stopwords_except_k
         self.restricted = restricted
         self.consider_string_boundary = consider_string_boundary
         self.cuda = cuda
@@ -127,7 +131,7 @@ def __init__(self,
             self.load_index(model_dir)

     def load_stopwords(self):
-        if self.remove_stopwords:
+        if self.remove_stopwords or self.remove_stopwords_except_k:
             stopwords = set()
             stopwords_dir = "/private/home/sewonmin/token-retrieval/task_data"
             with open(os.path.join(stopwords_dir, "roberta_stopwords.txt")) as f:
@@ -178,6 +182,9 @@ def load_data(self, data_path):
         true_dstore_size = 0
         offset_block = 0 if self.input_ids is None else len(self.input_ids)

+        remove_stopwords = self.remove_stopwords or (
+            self.remove_stopwords_except_k is not None and data_path_idx >= self.remove_stopwords_except_k)
+
         for block_idx, (valid_start, valid_end) in enumerate(tqdm(valid_candidates)):
             start = start_end_pairs[block_idx]
             end = start_end_pairs[block_idx+1] if block_idx<len(start_end_pairs)-1 else len(input_ids)
@@ -189,11 +196,11 @@ def load_data(self, data_path):
             curr_dstore_size = 0

             for i, curr_token in enumerate(curr_input_ids):
-                if self.remove_stopwords and curr_token in stopwords:
+                if remove_stopwords and curr_token in stopwords:
                     continue
                 if self.embs_consider_boundary and i not in valid_idxs:
                     continue
-                elif curr_token in [0, 2]:
+                if curr_token in [0, 2]:
                     continue
                 if is_valid:
                     self.token_idx_to_block_idx.append(len(self.input_ids))
@@ -234,16 +241,19 @@ def load_data(self, data_path):
         self.true_dstore_size = np.sum(true_dstore_size_list)

     def load_embeds(self, model_dir):
-        postfix = "_wo_stopwords" if self.remove_stopwords else ""
         if type(model_dir)==list:
             self.embs = []
-            for _model_dir, dstore_size in zip(model_dir, self.dstore_size_list):
+            for shard_idx, (_model_dir, dstore_size) in enumerate(zip(model_dir, self.dstore_size_list)):
+                remove_stopwords = self.remove_stopwords or (
+                    self.remove_stopwords_except_k is not None and shard_idx >= self.remove_stopwords_except_k)
+                postfix = "_wo_stopwords" if remove_stopwords else ""
                 embed_path = os.path.join(_model_dir,
                                           "embeddings{}.float16.npy".format(postfix))
                 print ("Start loading the embed from %s with (%d, %d)..." % (embed_path.split("/")[-2], dstore_size, self.dimension))
                 curr_emb = load_embs(embed_path, dstore_size, self.dimension)
                 self.embs.append(curr_emb)
         else:
+            postfix = "_wo_stopwords" if self.remove_stopwords else ""
             embed_path = os.path.join(model_dir, "embeddings{}.float16.npy".format(postfix))
             print ("Start loading the embed with (%d, %d)..." % (self.dstore_size, self.dimension))
             self.embs = load_embs(embed_path, self.dstore_size, self.dimension)
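The new `remove_stopwords_except_k` option makes stopword filtering per-shard: stopwords are kept in the first k shards and dropped from shard k onward, while the plain `remove_stopwords` flag still applies globally. A standalone sketch of the decision the diff repeats in `load_data` and `load_embeds` (the helper name and the k value are illustrative):

```python
def should_remove_stopwords(shard_idx, remove_stopwords=False, remove_stopwords_except_k=None):
    # Global flag: drop stopwords from every shard.
    if remove_stopwords:
        return True
    # Per-shard flag: keep stopwords in shards 0..k-1, drop them from shard k onward.
    if remove_stopwords_except_k is not None:
        return shard_idx >= remove_stopwords_except_k
    return False

# With remove_stopwords_except_k=4, shards 0-3 keep stopwords and later shards do not.
print([should_remove_stopwords(i, remove_stopwords_except_k=4) for i in range(6)])
# [False, False, False, False, True, True]
```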

npm/npm.py (+2 -5)

@@ -209,11 +209,8 @@ def get_scores(start_indices, end_indices):

         # now, assign scores to possible ngrams
         for (start, end) in all_start_and_end:
-            try:
-                assert start in idx2start_score
-                assert end in idx2end_score
-            except Exception:
-                from IPython import embed; embed(); exit()
+            assert start in idx2start_score
+            assert end in idx2end_score
             score = idx2start_score[start] + idx2end_score[end]

             pos2score[(start, end)] = score
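As the hunk above shows, a candidate ngram's score is simply the sum of its start-token score and its end-token score. A toy standalone illustration of that scoring loop (all score values are made up):

```python
# Hypothetical per-position scores for span starts and span ends.
idx2start_score = {0: 1.2, 1: 0.4}
idx2end_score = {1: 0.9, 2: 1.5}

pos2score = {}
for (start, end) in [(0, 1), (0, 2), (1, 2)]:
    # Every candidate span must have both a start score and an end score.
    assert start in idx2start_score and end in idx2end_score
    pos2score[(start, end)] = idx2start_score[start] + idx2end_score[end]

print(pos2score)  # {(0, 1): 2.1, (0, 2): 2.7, (1, 2): 1.9}
```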

npm/utils.py (-71)

This file was deleted.

scripts/demo.py (+17 -10)

@@ -18,17 +18,19 @@

 class NPMDemo(object):

-    def __init__(self, save_dir, checkpoint_path, k, temperature, remove_stopwords, single,
+    def __init__(self, save_dir, checkpoint_path, k, temperature,
+                 remove_stopwords, remove_stopwords_except_k, single, restricted,
                  embs_consider_boundary, keep_uint8):
         start_time = time.time()
         dstore = DataStore(setting="enwiki",
-                           model_dir=os.path.join(save_dir, "dstore"),
-                           do_load_index=False,
-                           remove_stopwords=remove_stopwords,
-                           restricted=True,
+                           model_dir=os.path.join(save_dir, "dstore"),
+                           do_load_index=False,
+                           remove_stopwords=remove_stopwords,
+                           remove_stopwords_except_k=remove_stopwords_except_k,
+                           restricted=restricted,
                            embs_consider_boundary=embs_consider_boundary,
                            keep_uint8=keep_uint8
-                           )
+                           )
         model_class = SingleModel if single else Model
         model = model_class(checkpoint_path=checkpoint_path)
         print ("Finish loading the model (%dsec)" % (time.time()-start_time))
@@ -49,7 +51,7 @@ def predict(self, text):
         predicted = self.npm.predict_span(text,
                                           ngram_max=10,
                                           valid_func=self.valid_func,
-                                          alphas=[0.0])[0.0]
+                                          alphas=[0.0])["a=0.0"]
         return self.npm.decode(predicted)

     def generate(self, text, num_tokens=20, num_masked_tokens=20):
@@ -59,7 +61,7 @@ def generate(self, text, num_tokens=20, num_masked_tokens=20):
         predicted = self.npm.predict_span(input_text,
                                           ngram_max=10,
                                           valid_func=self.valid_func,
-                                          alphas=[0.0])[0.0]
+                                          alphas=[0.0])["a=0.0"]
         predicted = self.npm.decode(predicted)
         text += predicted
         return text
@@ -71,7 +73,9 @@ def main():
     parser.add_argument('--k', type=int, default=4096)
     parser.add_argument('--temperature', type=float, default=1.0)
     parser.add_argument("--remove_stopwords", action="store_true")
+    parser.add_argument("--remove_stopwords_except_k", type=int, default=None)
     parser.add_argument("--single", action="store_true")
+    parser.add_argument("--restricted", action="store_true")

     parser.add_argument("--embs_consider_boundary", action="store_true", default=True)
     parser.add_argument("--keep_uint8", action="store_true")
@@ -82,7 +86,9 @@ def main():
                   k=args.k,
                   temperature=args.temperature,
                   remove_stopwords=args.remove_stopwords,
+                  remove_stopwords_except_k=args.remove_stopwords_except_k,
                   single=args.single,
+                  restricted=args.restricted,
                   embs_consider_boundary=args.embs_consider_boundary,
                   keep_uint8=args.keep_uint8)

@@ -92,9 +98,10 @@ def predict(text):
         print ("(Took %.2fs)" % (time.time()-start_time))
         return predicted

-    input_text = "Hagios Demetrios is located in <mask>."
+    input_text = "Hagios Demetrios is located in"
     print (predict(input_text))
-    from IPython import embed; embed()
+
+    print (npm.generate(input_text))


 if __name__=='__main__':
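For reference, a hedged sketch of how the updated constructor and the two entry points might be used from Python. The import path, the save_dir/checkpoint paths, and the `remove_stopwords_except_k` value are illustrative assumptions, not part of the commit; the keyword names mirror the new CLI flags:

```python
from scripts.demo import NPMDemo  # assumes the repository root is on PYTHONPATH

# Placeholder paths and flag values, mirroring the arguments added in main().
demo = NPMDemo(save_dir="save/npm",
               checkpoint_path="save/npm/model.ckpt",
               k=4096,
               temperature=1.0,
               remove_stopwords=False,
               remove_stopwords_except_k=4,
               single=False,
               restricted=True,
               embs_consider_boundary=True,
               keep_uint8=False)

print(demo.predict("Hagios Demetrios is located in"))   # fill-in prediction for the masked span
print(demo.generate("Hagios Demetrios is located in"))  # iterative continuation of the prompt
```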

scripts/prompt.py (+2)

@@ -26,6 +26,7 @@ def main():
     parser.add_argument('--temperature', type=float, default=1.0)
     parser.add_argument('--n_samples', type=int, default=3000)
     parser.add_argument("--remove_stopwords", action="store_true")
+    parser.add_argument("--remove_stopwords_except_k", type=int, default=None)

     parser.add_argument("--single", action="store_true")
     parser.add_argument("--open", action="store_true")
@@ -56,6 +57,7 @@ def main():
                        model_dir=os.path.join(args.save_dir, "dstore"),
                        do_load_index=not args.restricted,
                        remove_stopwords=args.remove_stopwords,
+                       remove_stopwords_except_k=args.remove_stopwords_except_k,
                        restricted=(True if args.load_all_embs else tasks) if args.restricted else None,
                        embs_consider_boundary=args.embs_consider_boundary,
                        keep_uint8=args.keep_uint8

scripts/save_embeddings.sh (+3 -2)

@@ -10,8 +10,9 @@ corpus=$2
 open=$3
 bs=$4

-checkpoint_path=$(pwd)/save/${model_name}/model.ckpt
-ctx_embeddings_dir=$(pwd)/save/${model_name}/dstore/${corpus}
+
+out=$(pwd)/save/${model_name}
+ctx_embeddings_dir=${out}/dstore/${corpus}

 if [[ $open == "true" ]] ; then
     if [[ $corpus == "enwiki-"* ]] ; then

scripts/train.sh (+1 -2)

@@ -71,8 +71,7 @@ HYDRA_FULL_ERROR=1 PYTHONPATH=. python dpr_scale/main.py -m \
     trainer.gradient_clip_val=${clip} \
     trainer=slurm \
     hydra.launcher.name=${SAVE_DIR} \
-    hydra.sweep.dir=${SAVE_DIR} \
-    hydra.launcher.partition=devlab
+    hydra.sweep.dir=${SAVE_DIR}


train.md (+13 -3)

@@ -9,8 +9,8 @@ This is a guideline for training the NPM model. The training code is largely bas
 * [Span Masking](#span-masking)
 * [Uniform Masking](#uniform-masking)
 2. [Training](#training)
-3. [Debugging locally](#debugging-locally): see this if you want to do a test run before running the entire pipeline.
-
+   * [Debugging locally](#debugging-locally): see this if you want to do a test run before running the entire pipeline.
+3. [Evaluation](#evaluation)

 ## Prepare Training Data

@@ -94,11 +94,21 @@ To train NPM-single with uniform masking, run
 bash scripts/train.sh {save_dir} false 3e-05 16 0.15 uniform
 ```

-## Debugging Locally
+### Debugging Locally
 If you want a training run on a subset of datas with one local GPU (instead of using slurm and hydra), simply run `scripts/train_debug.sh` instead of `scripts/train.sh` with the same arguments as in the [Training section](#training).

 This use RoBERTA-base instead of RoBERTa-large, and can work with >=9GB GPU memory.

 Note: This only uses the first shard of English Wikipedia (no CC-News), so if you have not started preprocessing and want to do a test run first, you can preprocess English Wikipedia only and keep CC-News later.

+## Evaluation
+Evaluation can be done by following the guidelines for inference in the main [README](README.md).
+
+* Checkpoints are saved every 10,000 training steps. You can find them under `{save_dir}/{hyperparam_settings}/0/lightning_logs/version_{slurm_id}/checkpoints`.
+* When saving embeddings, specify `+task.checkpoint_path=${checkpoint_path}`
+* When running `python -m scripts.prompt`, specify `--checkpoint_path ${checkpoint_path}`
+
+
+
+

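The new Evaluation section describes where Lightning writes checkpoints. A small hedged helper sketch for locating the newest checkpoint under that directory layout (the `save/npm` path and the helper itself are illustrative assumptions, not part of the repository):

```python
import glob
import os

def latest_checkpoint(save_dir):
    # Mirrors {save_dir}/{hyperparam_settings}/0/lightning_logs/version_{slurm_id}/checkpoints
    pattern = os.path.join(save_dir, "*", "0", "lightning_logs",
                           "version_*", "checkpoints", "*.ckpt")
    candidates = glob.glob(pattern)
    # Return the most recently modified checkpoint, or None if nothing has been saved yet.
    return max(candidates, key=os.path.getmtime) if candidates else None

checkpoint_path = latest_checkpoint("save/npm")
# Pass this value as +task.checkpoint_path=... when saving embeddings,
# or as --checkpoint_path ... when running python -m scripts.prompt.
```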
