@@ -9,6 +9,7 @@
 import tiktoken
 import openai
 from openai.embeddings_utils import distances_from_embeddings
+from emb import get_embedding


 ES_URL = os.environ["ES_URL"]
@@ -29,6 +30,7 @@ def __init__(self, index_name):
         self.api_key = os.environ["OPENAI_API_KEY"]
         openai.api_key = self.api_key
         self.max_tokens = 1000
+        self.split_max_tokens = 500

         # Load the cl100k_base tokenizer which is designed to work with the ada-002 model
         self.tokenizer = tiktoken.get_encoding("cl100k_base")
@@ -60,9 +62,9 @@ def _paper_results_to_text(self, results):
     # Function to split the text into chunks of a maximum number of tokens
     def _split_into_many(self, text):
         sentences = []
-        for sentence in text.split('.'):
+        for sentence in re.split(r'[{}]'.format(string.punctuation), text):
             sentence = sentence.strip()
-            if sentence and (any(char.isalpha() for char in sentence) or any(char.isdigit() for char in sentence)) and (not all(char in string.punctuation for char in sentence)):
+            if sentence and (any(char.isalpha() for char in sentence) or any(char.isdigit() for char in sentence)):
                 sentences.append(sentence)

         n_tokens = [len(self.tokenizer.encode(" " + sentence))
@@ -77,14 +79,14 @@ def _split_into_many(self, text):
             # If the number of tokens so far plus the number of tokens in the current sentence is greater
             # than the max number of tokens, then add the chunk to the list of chunks and reset
             # the chunk and tokens so far
-            if tokens_so_far + token > self.max_tokens:
+            if tokens_so_far + token > self.split_max_tokens and chunk:
                 chunks.append(". ".join(chunk) + ".")
                 chunk = []
                 tokens_so_far = 0

             # If the number of tokens in the current sentence is greater than the max number of
             # tokens, go to the next sentence
-            if token > self.max_tokens:
+            if token > self.split_max_tokens:
                 continue

             # Otherwise, add the sentence to the chunk and add the number of tokens to the total
@@ -97,9 +99,6 @@ def _split_into_many(self, text):

         return chunks

-    def _get_embedding(self, input):
-        return openai.Embedding.create(
-            input=input, engine='text-embedding-ada-002')['data'][0]['embedding']

     def _create_emb_dict_list(self, long_text):
         shortened = self._split_into_many(long_text)
@@ -108,7 +107,7 @@ def _create_emb_dict_list(self, long_text):

         for text in shortened:
             n_tokens = len(self.tokenizer.encode(text))
-            embeddings = self._get_embedding(input=text)
+            embeddings = get_embedding(input=text)
             embeddings_dict = {}
             embeddings_dict["text"] = text
             embeddings_dict["n_tokens"] = n_tokens
@@ -123,7 +122,7 @@ def _create_context(self, question, df):
         """

         # Get the embeddings for the question
-        q_embeddings = self._get_embedding(input=question)
+        q_embeddings = get_embedding(input=question)

         # Get the distances from the embeddings
         df['distances'] = distances_from_embeddings(
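The new emb module imported at the top of this diff is not itself shown. Based on the _get_embedding method it replaces, a minimal sketch of what emb.get_embedding presumably contains (the input keyword and the 'text-embedding-ada-002' engine are carried over from the deleted code; the API-key setup is an assumption mirroring __init__ above):

# emb.py -- hypothetical sketch, not part of this diff
import os

import openai

openai.api_key = os.environ["OPENAI_API_KEY"]  # assumed; mirrors __init__ above


def get_embedding(input):
    # Same call the removed _get_embedding made, now shared as a free function
    return openai.Embedding.create(
        input=input, engine='text-embedding-ada-002')['data'][0]['embedding']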
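For reference, the reworked splitter now breaks text on every punctuation character rather than only on '.', and keeps only fragments containing at least one letter or digit. A standalone sketch of that behaviour (this assumes re and string are already imported near the top of the file, since the diff adds no import for re):

import re
import string

text = "Hello, world! GPT-4 costs $20/mo."
fragments = [f.strip()
             for f in re.split(r'[{}]'.format(string.punctuation), text)]
# Keep only fragments with at least one letter or digit,
# mirroring the simplified filter in _split_into_many
sentences = [f for f in fragments
             if any(c.isalpha() or c.isdigit() for c in f)]
print(sentences)  # ['Hello', 'world', 'GPT', '4 costs', '20', 'mo']

Building the character class directly from string.punctuation happens to compile as intended here; re.escape(string.punctuation) would be the more defensive construction.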