
Commit 856211d

Author: ashok.b
Commit message: modified masking before pooling
Parent: e749023

File tree

7 files changed: +414, -330 lines


.gitignore (+4)

@@ -1 +1,5 @@
 .idea/
+/cache
+/evaluation/MTEB/mteb.egg-info
+/**/__pycache__
+/InstructorEmbedding.egg-info
InstructorEmbedding/instructor.py (+376, -275)

Large diffs are not rendered by default.
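Note: the commit message ("modified masking before pooling") refers to the pooling change made in this file, whose diff is not shown. As a rough, hedged illustration only, the sketch below shows masked mean pooling that excludes instruction tokens before averaging; the helper name and the exact semantics of `instruction_mask` (assumed here to be 1 for sentence tokens and 0 for instruction tokens) are assumptions, not the contents of this diff.

```python
import torch

def mean_pool(token_embeddings: torch.Tensor,
              attention_mask: torch.Tensor,
              instruction_mask: torch.Tensor = None) -> torch.Tensor:
    # Hypothetical helper: average token embeddings into one sentence embedding,
    # optionally dropping instruction tokens (instruction_mask: 1 = keep, 0 = drop).
    mask = attention_mask
    if instruction_mask is not None:
        mask = mask * instruction_mask                      # mask instructions before pooling
    mask = mask.unsqueeze(-1).type_as(token_embeddings)     # (batch, seq, 1)
    summed = (token_embeddings * mask).sum(dim=1)           # (batch, dim)
    counts = mask.sum(dim=1).clamp(min=1e-9)                # avoid division by zero
    return summed / counts
```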

evaluation/MTEB/examples/evaluate_model.py (+2, -2)

@@ -3,7 +3,7 @@
 import logging
 import argparse
 from mteb import MTEB
-from InstructorEmbedding import INSTRUCTOR
+from InstructorEmbedding import Instructor
 if __name__ == '__main__':
     logging.basicConfig(level=logging.INFO)
     parser = argparse.ArgumentParser()
@@ -24,7 +24,7 @@
     # from functools import partialmethod
     #
     # tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)
-    model = INSTRUCTOR(args.model_name,cache_folder=args.cache_dir)
+    model = Instructor(args.model_name,cache_folder=args.cache_dir)
     evaluation = MTEB(tasks=[args.task_name],task_langs=["en"])
     evaluation.run(model, output_folder=args.output_dir, eval_splits=[args.split],args=args,)
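Note: the only change here is the renamed entry point, `INSTRUCTOR` -> `Instructor`. A minimal usage sketch with the new name follows; the checkpoint name and cache path are placeholders standing in for the script's `--model_name` and `--cache_dir` arguments.

```python
from InstructorEmbedding import Instructor

# Placeholder checkpoint and cache directory (not taken from this commit).
model = Instructor('hkunlp/instructor-base', cache_folder='./cache')

# Inputs are [instruction, sentence] pairs; the trailing 0 flag used by the
# old interface is gone (see AbsTaskRetrieval.py below).
sentences = [
    ['Represent the question for retrieving supporting documents: ',
     'What does masking before pooling change?'],
]
embeddings = model.encode(sentences, batch_size=8)
print(embeddings.shape)
```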

evaluation/MTEB/mteb/abstasks/AbsTaskRetrieval.py (+15, -18)

@@ -597,7 +597,7 @@ def evaluate(
         model,
         split="test",
         batch_size=128,
-        corpus_chunk_size=None,
+        corpus_chunk_size=50000,
         target_devices=None,
         score_function="cos_sim",
         **kwargs
@@ -708,7 +708,7 @@ def encode_queries(self, queries: List[str], batch_size: int, **kwargs):
         instruction = DEFINITIONS[self.args.prompt][self.args.task_name]['query']
         if self.args.prompt:
             for s in queries:
-                new_sentences.append([instruction, s, 0])
+                new_sentences.append([instruction, s])
         else:
             new_sentences = queries

@@ -717,7 +717,6 @@ def encode_queries(self, queries: List[str], batch_size: int, **kwargs):

     def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int, **kwargs):
         self.count += 1
-        # print('count: ',self.count)
         if type(corpus) is dict:
             sentences = [
                 (corpus["title"][i] + ' ' + corpus["text"][i]).strip()
@@ -733,28 +732,26 @@ def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int, **kwargs)
         new_sentences = []
         instruction = DEFINITIONS[self.args.prompt][self.args.task_name]['corpus']
         for s in sentences:
-            new_sentences.append([instruction, s, 0])
-        # kwargs['show_progress_bar'] = False
-        return self.model.encode(sentences, batch_size=128, **kwargs)
+            new_sentences.append([instruction, s])
+        return self.model.encode(new_sentences, batch_size=128, **kwargs)

     def encode_corpus_parallel(
         self, corpus: List[Dict[str, str]], pool: Dict[str, object], batch_size: int, chunk_id: int, **kwargs
     ):
+        sentences = []
         instruction = DEFINITIONS[self.args.prompt][self.args.task_name]['corpus']
         if type(corpus) is dict:
-            sentences = [
-                [instruction, (corpus["title"][i] + self.sep + corpus["text"][i]).strip()]
-                (corpus["title"][i] + self.sep + corpus["text"][i]).strip()
-                if "title" in corpus
-                else corpus["text"][i].strip()
-                for i in range(len(corpus["text"]))
-            ]
+            for i in range(len(corpus["text"])):
+                sentence = corpus["text"][i].strip()
+                if "title" in corpus:
+                    sentence = corpus["title"][i].strip() + self.sep + sentence
+                sentences.append([instruction, sentence])
         else:
-            sentences = [
-                [instruction, (doc["title"] + self.sep + doc["text"]).strip()]
-                (doc["title"] + self.sep + doc["text"]).strip() if "title" in doc else doc["text"].strip()
-                for doc in corpus
-            ]
+            for doc in corpus:
+                sentence = doc["text"].strip()
+                if "title" in doc:
+                    sentence = doc["title"].strip() + self.sep + sentence
+                sentences.append([instruction, sentence])

         if chunk_id is not None and chunk_id >= len(pool["processes"]):
             output_queue = pool["output"]
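Note: these edits drop the trailing `0` from the instruction/sentence pairs and fix two bugs in the corpus path: `encode_corpus` now encodes `new_sentences` instead of the raw `sentences`, and `encode_corpus_parallel` replaces a malformed list comprehension with an explicit loop. A standalone sketch of the new pair-building logic, with a plain string standing in for `self.sep` and the `DEFINITIONS` lookup:

```python
from typing import Dict, List

def build_corpus_pairs(corpus: List[Dict[str, str]],
                       instruction: str,
                       sep: str = ' ') -> List[List[str]]:
    # Mirrors the rewritten encode_corpus_parallel loop: prepend the title
    # (when present) and pair each document with its instruction.
    pairs = []
    for doc in corpus:
        sentence = doc["text"].strip()
        if "title" in doc:
            sentence = doc["title"].strip() + sep + sentence
        pairs.append([instruction, sentence])
    return pairs

docs = [{"title": "Pooling", "text": "Mean pooling averages token embeddings."}]
print(build_corpus_pairs(docs, "Represent the document for retrieval: "))
```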

evaluation/MTEB/setup.py (+2)

@@ -84,6 +84,8 @@
         "torch",
         "tqdm",
         "rich",
+        "beir",
+        "evaluate==0.2.0"
     ],
     extras_require=extras,
     classifiers=[

requirements.txt (+1)

@@ -10,3 +10,4 @@ sentence_transformers>=2.2.0
 torch
 tqdm
 rich
+tensorboard

train.py (+14, -35)

@@ -13,7 +13,7 @@

 import transformers
 from filelock import FileLock
-from InstructorEmbedding import INSTRUCTOR
+from InstructorEmbedding import Instructor, InstructorTransformer
 from transformers import (
     AutoTokenizer,
     DataCollatorForSeq2Seq,
@@ -27,6 +27,9 @@
     set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint
+from transformers.trainer_callback import TrainerCallback, TrainerState, TrainerControl
+from transformers.training_args import TrainingArguments
+
 from transformers.utils import check_min_version, is_offline_mode
 from torch.utils.data import Dataset, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
@@ -100,7 +103,7 @@ def compute_loss(self, model, inputs, return_outputs=False):
             cur_inputs = {
                 'input_ids': inputs[f'{k}_input_ids'],
                 'attention_mask': inputs[f'{k}_attention_mask'],
-                'context_masks': inputs[f'{k}_context_masks'],
+                'instruction_mask': inputs[f'{k}_instruction_mask'],
             }
             cur_results[k] = model(cur_inputs)['sentence_embedding']
         embeddings_query = cur_results['query']
@@ -156,7 +159,6 @@ class ModelArguments:
     """
     Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
     """
-
     model_name_or_path: str = field(
         metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
     )
@@ -424,13 +426,8 @@ def main():
     )

     # Set seed before initializing model.
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
-        cache_dir=model_args.cache_dir,
-        use_fast=model_args.use_fast_tokenizer,
-        revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
-    )
+    instructor_tokenizer = InstructorTransformer(model_name_or_path=model_args.model_name_or_path, load_model=False)
+    tokenizer = instructor_tokenizer.tokenizer  # pre-trained tokenizer

     set_seed(training_args.seed)
     with open(os.path.join(model_args.cache_dir, 'medi-data.json')) as f:
@@ -443,7 +440,7 @@

     real_batch_size = max(training_args.per_device_train_batch_size,
                           training_args.per_device_train_batch_size * torch.cuda.device_count())
-    # print('real_batch_size: ', real_batch_size,training_args.per_device_train_batch_size,torch.cuda.device_count())
+
     def get_examples_raw(old_examples_raw, total_n, real_batch_size):
         examples_raw = []
         for idx in range(0, total_n, real_batch_size):
@@ -485,13 +482,11 @@ def get_dataset(examples_raw):
         for i in range(total_num):
             cur_e = examples_raw[i]
             for k in ['query','pos','neg']:
-                for s in cur_e[k][:-1]:
-                    assert not '!@#$%^&**!@#$%^&**' in s
                 cur_e[k][-1] = str(cur_e[k][-1])
                 if not data_args.add_prompt_to_document:
                     cur_e[k][0] = ''
                 assert cur_e[k][0].startswith('Represent ') or cur_e[k][0]==''
-                examples[k].append('!@#$%^&**!@#$%^&**'.join(cur_e[k]))
+                examples[k].append(cur_e[k])
             if not cur_e['task_id'] in task_name_map:
                 task_name_map[cur_e['task_id']] = task_count
                 task_count += 1
@@ -500,36 +495,20 @@ def get_dataset(examples_raw):

     train_raw_datasets = DatasetDict({'train':Dataset.from_dict(get_dataset(train_examples_raw))})

-    model = INSTRUCTOR(real_name_or_path, cache_folder=model_args.cache_dir)
+    model = Instructor(real_name_or_path, cache_folder=model_args.cache_dir)
     column_names = train_raw_datasets["train"].column_names

     def preprocess_function(examples):
         all_tokenized = None
         for key in ['query','pos','neg']:
-            num = len(examples[key])
-            contexts = []
-            concatenated_input_texts = []
-            for local_idx in range(num):
-                splits = examples[key][local_idx].split('!@#$%^&**!@#$%^&**')
-                assert len(splits) == 2
-                contexts.append(splits[0])
-                concatenated_input_texts.append(''.join(splits))
-                assert isinstance(contexts[-1], str)
-                assert isinstance(concatenated_input_texts[-1], str)
-            tokenized = tokenizer(concatenated_input_texts,padding='max_length', truncation='longest_first', return_tensors="pt", max_length=data_args.max_source_length)
-            context_tok = tokenizer(contexts,padding='max_length', truncation='longest_first', return_tensors="pt", max_length=data_args.max_source_length)
-            tokenized['context_masks'] = torch.sum(context_tok['attention_mask'], dim=1)
-            tokenized['context_masks'] = tokenized['context_masks'] - 1
-            for my_idx in range(len(tokenized['context_masks'])):
-                if tokenized['context_masks'][my_idx] <= 1:
-                    tokenized['context_masks'][my_idx] = 0
-            keys = tokenized.keys()
+            input_features = instructor_tokenizer.tokenize(examples[key])
+            keys = input_features.keys()
             if all_tokenized is None:
-                all_tokenized = tokenized.copy()
+                all_tokenized = input_features.copy()
                 for k in keys:
                     all_tokenized[k] = all_tokenized[k].tolist()
             for k in keys:
-                all_tokenized[f'{key}_{k}'] = tokenized[k].tolist()
+                all_tokenized[f'{key}_{k}'] = input_features[k].tolist()
         all_tokenized['task_id'] = examples['task_id']
         return all_tokenized
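Note: the training script now builds its tokenizer from `InstructorTransformer` and keeps each example as an `[instruction, text]` list instead of joining it with the `'!@#$%^&**!@#$%^&**'` separator; `instructor_tokenizer.tokenize` is expected to emit the `input_ids`, `attention_mask`, and `instruction_mask` features that `compute_loss` reads back per key. A small sketch of that per-key regrouping, assuming exactly those three feature names (the toy tensors are illustrative only):

```python
import torch

def per_key_inputs(batch: dict, key: str) -> dict:
    # Mirrors the cur_inputs dict built in compute_loss after this commit:
    # e.g. 'query_input_ids' -> 'input_ids', and likewise for each feature.
    return {
        'input_ids': batch[f'{key}_input_ids'],
        'attention_mask': batch[f'{key}_attention_mask'],
        'instruction_mask': batch[f'{key}_instruction_mask'],
    }

# Toy batch with made-up shapes (batch=2, seq_len=4) for query/pos/neg.
batch = {f'{k}_{name}': torch.zeros(2, 4, dtype=torch.long)
         for k in ['query', 'pos', 'neg']
         for name in ['input_ids', 'attention_mask', 'instruction_mask']}
print(sorted(per_key_inputs(batch, 'query').keys()))
```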
