
Commit 27142a4

Added sentence transformer for emb

Parent: dede55e

4 files changed: 46 additions, 10 deletions


Makefile (+4)

@@ -12,6 +12,9 @@ $(VENV)/bin/activate: requirements.txt
 	python3 -m venv $(VENV)
 	$(PIP) install -r requirements.txt
 
+emb: $(VENV)/bin/activate
+	$(PYTHON) emb.py
+
 crawl: $(VENV)/bin/activate
 	$(PYTHON) crawl_index.py
 
@@ -20,6 +23,7 @@ esgpt: $(VENV)/bin/activate
 
 test: $(VENV)/bin/activate
 	$(PYTEST) --verbose es_gpt_test.py -s -vv
+
 app: $(VENV)/bin/activate
 	$(UVICORN) app:app --reload --port 7002

emb.py (new file, +32)

@@ -0,0 +1,32 @@
+import os
+import openai
+from sentence_transformers import SentenceTransformer
+
+
+EMB_USE_OPENAI = os.getenv('EMB_USE_OPENAI', '0')
+
+
+def _get_openai_embedding(input):
+    openai.api_key = os.environ["OPENAI_API_KEY"]
+    return openai.Embedding.create(
+        input=input, engine='text-embedding-ada-002')['data'][0]['embedding']
+
+
+def _get_transformer_embedding(input):
+    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
+
+    # Sentences are encoded by calling model.encode()
+    embedding = model.encode(input)
+    return embedding
+
+
+def get_embedding(input):
+    if EMB_USE_OPENAI == '1':
+        return _get_openai_embedding(input)
+    else:
+        return _get_transformer_embedding(input)
+
+
+if __name__ == "__main__":
+    print("Transformer: ", _get_transformer_embedding('hello world')[0])
+    print("OpenAI: ", _get_openai_embedding('hello world'))

es_gpt.py (+8 −9)

@@ -9,6 +9,7 @@
 import tiktoken
 import openai
 from openai.embeddings_utils import distances_from_embeddings
+from emb import get_embedding
 
 
 ES_URL = os.environ["ES_URL"]
@@ -29,6 +30,7 @@ def __init__(self, index_name):
         self.api_key = os.environ["OPENAI_API_KEY"]
         openai.api_key = self.api_key
         self.max_tokens = 1000
+        self.split_max_tokens = 500
 
         # Load the cl100k_base tokenizer which is designed to work with the ada-002 model
         self.tokenizer = tiktoken.get_encoding("cl100k_base")
@@ -60,9 +62,9 @@ def _paper_results_to_text(self, results):
     # Function to split the text into chunks of a maximum number of tokens
     def _split_into_many(self, text):
         sentences = []
-        for sentence in text.split('.'):
+        for sentence in re.split(r'[{}]'.format(string.punctuation), text):
             sentence = sentence.strip()
-            if sentence and (any(char.isalpha() for char in sentence) or any(char.isdigit() for char in sentence)) and (not all(char in string.punctuation for char in sentence)):
+            if sentence and (any(char.isalpha() for char in sentence) or any(char.isdigit() for char in sentence)):
                 sentences.append(sentence)
 
         n_tokens = [len(self.tokenizer.encode(" " + sentence))
@@ -77,14 +79,14 @@ def _split_into_many(self, text):
             # If the number of tokens so far plus the number of tokens in the current sentence is greater
             # than the max number of tokens, then add the chunk to the list of chunks and reset
             # the chunk and tokens so far
-            if tokens_so_far + token > self.max_tokens:
+            if tokens_so_far + token > self.split_max_tokens and chunk:
                 chunks.append(". ".join(chunk) + ".")
                 chunk = []
                 tokens_so_far = 0
 
             # If the number of tokens in the current sentence is greater than the max number of
             # tokens, go to the next sentence
-            if token > self.max_tokens:
+            if token > self.split_max_tokens:
                 continue
 
             # Otherwise, add the sentence to the chunk and add the number of tokens to the total
@@ -97,9 +99,6 @@ def _split_into_many(self, text):
 
         return chunks
 
-    def _get_embedding(self, input):
-        return openai.Embedding.create(
-            input=input, engine='text-embedding-ada-002')['data'][0]['embedding']
 
     def _create_emb_dict_list(self, long_text):
         shortened = self._split_into_many(long_text)
@@ -108,7 +107,7 @@ def _create_emb_dict_list(self, long_text):
 
         for text in shortened:
             n_tokens = len(self.tokenizer.encode(text))
-            embeddings = self._get_embedding(input=text)
+            embeddings = get_embedding(input=text)
             embeddings_dict = {}
             embeddings_dict["text"] = text
             embeddings_dict["n_tokens"] = n_tokens
@@ -123,7 +122,7 @@ def _create_context(self, question, df):
         """
 
         # Get the embeddings for the question
-        q_embeddings = self._get_embedding(input=question)
+        q_embeddings = get_embedding(input=question)
 
         # Get the distances from the embeddings
         df['distances'] = distances_from_embeddings(
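The chunking change is easier to read outside the class. A simplified, self-contained sketch of the updated _split_into_many logic (split_into_chunks is a hypothetical standalone name; a whitespace token count stands in for the tiktoken encoder, and the final flush of a partial chunk is an assumption, since the tail of the method is not shown in the hunks):

import re
import string

def split_into_chunks(text, max_tokens=500):
    # Split on any punctuation character (the new behavior), not just '.'
    parts = re.split(r'[{}]'.format(string.punctuation), text)
    # Keep only fragments containing at least one letter or digit
    sentences = [s.strip() for s in parts if any(c.isalnum() for c in s)]

    chunks, chunk, tokens_so_far = [], [], 0
    for sentence in sentences:
        token = len((" " + sentence).split())  # stand-in for tiktoken
        # The added `and chunk` guard avoids emitting an empty chunk
        if tokens_so_far + token > max_tokens and chunk:
            chunks.append(". ".join(chunk) + ".")
            chunk, tokens_so_far = [], 0
        # A single sentence longer than the cap is skipped outright
        if token > max_tokens:
            continue
        chunk.append(sentence)
        tokens_so_far += token
    if chunk:  # flush the remainder (assumed; not shown in the hunks)
        chunks.append(". ".join(chunk) + ".")
    return chunks

print(split_into_chunks("Hello world! This is a test. Short, punchy, sentences."))

The new split_max_tokens (500) also decouples the chunk-size cap from self.max_tokens (1000), which the diff leaves in place.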

requirements.txt (+2 −1)

@@ -11,4 +11,5 @@ plotly
 pandas
 scipy
 scikit-learn
-pytest
+pytest
+sentence-transformers
