import requests
import re
import pandas as pd
-
+import string
from elasticsearch import Elasticsearch

import tiktoken

class ESGPT:
    def __init__(self, index_name):
-        self.es = Elasticsearch(ES_URL, http_auth=(ES_USER, ES_PASS),
+        self.es = Elasticsearch(ES_URL, basic_auth=(ES_USER, ES_PASS),
                                ca_certs=ES_CA_CERT, verify_certs=True)
        self.index_name = index_name
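The `http_auth` keyword was renamed to `basic_auth` in the 8.x `elasticsearch-py` client, which is what this change tracks. A minimal connection sketch against a TLS-enabled cluster, with hypothetical placeholder values standing in for `ES_URL`, `ES_USER`, `ES_PASS`, and `ES_CA_CERT`:

    from elasticsearch import Elasticsearch

    # Placeholder values; substitute your own cluster settings
    es = Elasticsearch(
        "https://localhost:9200",            # ES_URL
        basic_auth=("elastic", "changeme"),  # (ES_USER, ES_PASS)
        ca_certs="certs/http_ca.crt",        # ES_CA_CERT
        verify_certs=True,
    )
    print(es.info())  # smoke-test the connection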
@@ -59,11 +59,12 @@ def _paper_results_to_text(self, results):
    # Code from https://github.com/openai/openai-cookbook/blob/main/apps/web-crawl-q-and-a/web-qa.py
    # Function to split the text into chunks of a maximum number of tokens
    def _split_into_many(self, text):
+        sentences = []
+        for sentence in text.split('.'):
+            sentence = sentence.strip()
+            if sentence and (any(char.isalpha() for char in sentence) or any(char.isdigit() for char in sentence)) and (not all(char in string.punctuation for char in sentence)):
+                sentences.append(sentence)

-        # Split the text into sentences
-        sentences = text.split('. ')
-
-        # Get the number of tokens for each sentence
        n_tokens = [len(self.tokenizer.encode(" " + sentence))
                    for sentence in sentences]
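To see what the new splitter keeps, the same filter can be run standalone: a piece survives only if it contains at least one letter or digit, so empty fragments and punctuation-only runs (e.g. from ellipses) are dropped:

    import string

    text = "First sentence. 2nd one... !!! Last bit."
    sentences = []
    for sentence in text.split('.'):
        sentence = sentence.strip()
        if sentence and (any(ch.isalpha() for ch in sentence) or any(ch.isdigit() for ch in sentence)) \
                and not all(ch in string.punctuation for ch in sentence):
            sentences.append(sentence)
    print(sentences)  # ['First sentence', '2nd one', '!!! Last bit']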
@@ -96,16 +97,18 @@ def _split_into_many(self, text):

        return chunks

+    def _get_embedding(self, input):
+        return openai.Embedding.create(
+            input=input, engine='text-embedding-ada-002')['data'][0]['embedding']
+
    def _create_emb_dict_list(self, long_text):
        shortened = self._split_into_many(long_text)

        embeddings_dict_list = []

        for text in shortened:
            n_tokens = len(self.tokenizer.encode(text))
-            embeddings = openai.Embedding.create(
-                input=text,
-                engine='text-embedding-ada-002')['data'][0]['embedding']
+            embeddings = self._get_embedding(input=text)
            embeddings_dict = {}
            embeddings_dict["text"] = text
            embeddings_dict["n_tokens"] = n_tokens
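Each chunk thus becomes one dict pairing the text with its token count and embedding vector. A sketch of the resulting shape, assuming the pre-1.0 `openai` package this diff targets (where `openai.Embedding.create` and `engine=` are the current API); the `"embeddings"` key name is a guess, since that assignment falls outside the hunk:

    import openai  # assumes openai<1.0, matching the diff

    def get_embedding(text):
        # text-embedding-ada-002 returns a 1536-dimensional vector
        return openai.Embedding.create(
            input=text, engine='text-embedding-ada-002')['data'][0]['embedding']

    chunk = "transformers use self-attention"
    record = {
        "text": chunk,
        "n_tokens": 5,                       # illustrative; the real code uses tiktoken
        "embeddings": get_embedding(chunk),  # hypothetical key name
    }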
@@ -120,8 +123,7 @@ def _create_context(self, question, df):
        """

        # Get the embeddings for the question
-        q_embeddings = openai.Embedding.create(
-            input=question, engine='text-embedding-ada-002')['data'][0]['embedding']
+        q_embeddings = self._get_embedding(input=question)

        # Get the distances from the embeddings
        df['distances'] = distances_from_embeddings(
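`distances_from_embeddings` is presumably the helper from `openai.embeddings_utils` (pre-1.0), as in the cookbook this file credits; a sketch of the ranking step under that assumption, with toy vectors:

    from openai.embeddings_utils import distances_from_embeddings

    q_embeddings = [0.1, 0.2, 0.3]                      # toy question vector
    chunk_vectors = [[0.1, 0.2, 0.3], [0.9, 0.1, 0.0]]  # toy stored chunk vectors

    # One cosine distance per stored chunk; smaller means more relevant
    distances = distances_from_embeddings(
        q_embeddings, chunk_vectors, distance_metric='cosine')
    print(distances)  # first chunk matches the question exactly -> distance 0.0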
@@ -132,7 +134,6 @@ def _create_context(self, question, df):

        # Sort by distance and add the text to the context until the context is too long
        for i, row in df.sort_values('distances', ascending=True).iterrows():
-
            # Add the length of the text to the current length
            cur_len += row['n_tokens'] + 4
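The `+ 4` reserves room for the separator tokens inserted between chunks when the context string is finally joined. The surrounding loop follows the cookbook's greedy packing pattern; a self-contained sketch assuming a `max_len` token budget as in the cookbook:

    import pandas as pd

    # Toy frame standing in for the real per-chunk table
    df = pd.DataFrame({
        "text": ["chunk a", "chunk b", "chunk c"],
        "n_tokens": [900, 850, 700],
        "distances": [0.10, 0.25, 0.40],
    })

    max_len = 1800
    cur_len = 0
    returns = []

    # Greedy packing: take the closest chunks until the token budget is spent
    for i, row in df.sort_values('distances', ascending=True).iterrows():
        cur_len += row['n_tokens'] + 4   # roughly the "\n\n###\n\n" separator tokens
        if cur_len > max_len:
            break
        returns.append(row['text'])

    context = "\n\n###\n\n".join(returns)
    print(context)  # keeps "chunk a" and "chunk b"; "chunk c" would exceed the budget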