import requests
import re
import pandas as pd
-
+import string
from elasticsearch import Elasticsearch

import tiktoken

class ESGPT:
    def __init__(self, index_name):
-        self.es = Elasticsearch(ES_URL, http_auth=(ES_USER, ES_PASS),
+        self.es = Elasticsearch(ES_URL, basic_auth=(ES_USER, ES_PASS),
                                ca_certs=ES_CA_CERT, verify_certs=True)
        self.index_name = index_name
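The `http_auth` keyword was renamed to `basic_auth` in the 8.x `elasticsearch-py` client, which is what this change tracks. A minimal connection sketch against a TLS-enabled cluster, with hypothetical placeholder values standing in for `ES_URL`, `ES_USER`, `ES_PASS`, and `ES_CA_CERT`:

    from elasticsearch import Elasticsearch

    # Placeholder values; substitute your own cluster settings
    es = Elasticsearch(
        "https://localhost:9200",            # ES_URL
        basic_auth=("elastic", "changeme"),  # (ES_USER, ES_PASS)
        ca_certs="certs/http_ca.crt",        # ES_CA_CERT
        verify_certs=True,
    )
    print(es.info())  # smoke-test the connection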
@@ -59,11 +59,12 @@ def _paper_results_to_text(self, results):
    # Code from https://github.com/openai/openai-cookbook/blob/main/apps/web-crawl-q-and-a/web-qa.py
    # Function to split the text into chunks of a maximum number of tokens
    def _split_into_many(self, text):
+        sentences = []
+        for sentence in text.split('.'):
+            sentence = sentence.strip()
+            if sentence and (any(char.isalpha() for char in sentence) or any(char.isdigit() for char in sentence)) and (not all(char in string.punctuation for char in sentence)):
+                sentences.append(sentence)

-        # Split the text into sentences
-        sentences = text.split('. ')
-
-        # Get the number of tokens for each sentence
        n_tokens = [len(self.tokenizer.encode(" " + sentence))
                    for sentence in sentences]
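To see what the new splitter keeps, the same filter can be run standalone: a piece survives only if it contains at least one letter or digit, so empty fragments and punctuation-only runs (e.g. from ellipses) are dropped:

    import string

    text = "First sentence. 2nd one... !!! Last bit."
    sentences = []
    for sentence in text.split('.'):
        sentence = sentence.strip()
        if sentence and (any(ch.isalpha() for ch in sentence) or any(ch.isdigit() for ch in sentence)) \
                and not all(ch in string.punctuation for ch in sentence):
            sentences.append(sentence)
    print(sentences)  # ['First sentence', '2nd one', '!!! Last bit']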
@@ -96,16 +97,18 @@ def _split_into_many(self, text):

        return chunks

+    def _get_embedding(self, input):
+        return openai.Embedding.create(
+            input=input, engine='text-embedding-ada-002')['data'][0]['embedding']
+
    def _create_emb_dict_list(self, long_text):
        shortened = self._split_into_many(long_text)

        embeddings_dict_list = []

        for text in shortened:
            n_tokens = len(self.tokenizer.encode(text))
-            embeddings = openai.Embedding.create(
-                input=text,
-                engine='text-embedding-ada-002')['data'][0]['embedding']
+            embeddings = self._get_embedding(input=text)
            embeddings_dict = {}
            embeddings_dict["text"] = text
            embeddings_dict["n_tokens"] = n_tokens
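Each chunk thus becomes one dict pairing the text with its token count and embedding vector. A sketch of the resulting shape, assuming the pre-1.0 `openai` package this diff targets (where `openai.Embedding.create` and `engine=` are the current API); the `"embeddings"` key name is a guess, since that assignment falls outside the hunk:

    import openai  # assumes openai<1.0, matching the diff

    def get_embedding(text):
        # text-embedding-ada-002 returns a 1536-dimensional vector
        return openai.Embedding.create(
            input=text, engine='text-embedding-ada-002')['data'][0]['embedding']

    chunk = "transformers use self-attention"
    record = {
        "text": chunk,
        "n_tokens": 5,                       # illustrative; the real code uses tiktoken
        "embeddings": get_embedding(chunk),  # hypothetical key name
    }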
@@ -120,8 +123,7 @@ def _create_context(self, question, df):
        """

        # Get the embeddings for the question
-        q_embeddings = openai.Embedding.create(
-            input=question, engine='text-embedding-ada-002')['data'][0]['embedding']
+        q_embeddings = self._get_embedding(input=question)

        # Get the distances from the embeddings
        df['distances'] = distances_from_embeddings(
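`distances_from_embeddings` is presumably the helper from `openai.embeddings_utils` (pre-1.0), as in the cookbook this file credits; a sketch of the ranking step under that assumption, with toy vectors:

    from openai.embeddings_utils import distances_from_embeddings

    q_embeddings = [0.1, 0.2, 0.3]                      # toy question vector
    chunk_vectors = [[0.1, 0.2, 0.3], [0.9, 0.1, 0.0]]  # toy stored chunk vectors

    # One cosine distance per stored chunk; smaller means more relevant
    distances = distances_from_embeddings(
        q_embeddings, chunk_vectors, distance_metric='cosine')
    print(distances)  # first chunk matches the question exactly -> distance 0.0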
@@ -132,7 +134,6 @@ def _create_context(self, question, df):

        # Sort by distance and add the text to the context until the context is too long
        for i, row in df.sort_values('distances', ascending=True).iterrows():
-
            # Add the length of the text to the current length
            cur_len += row['n_tokens'] + 4
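The `+ 4` reserves room for the separator tokens inserted between chunks when the context string is finally joined. The surrounding loop follows the cookbook's greedy packing pattern; a self-contained sketch assuming a `max_len` token budget as in the cookbook:

    import pandas as pd

    # Toy frame standing in for the real per-chunk table
    df = pd.DataFrame({
        "text": ["chunk a", "chunk b", "chunk c"],
        "n_tokens": [900, 850, 700],
        "distances": [0.10, 0.25, 0.40],
    })

    max_len = 1800
    cur_len = 0
    returns = []

    # Greedy packing: take the closest chunks until the token budget is spent
    for i, row in df.sort_values('distances', ascending=True).iterrows():
        cur_len += row['n_tokens'] + 4   # roughly the "\n\n###\n\n" separator tokens
        if cur_len > max_len:
            break
        returns.append(row['text'])

    context = "\n\n###\n\n".join(returns)
    print(context)  # keeps "chunk a" and "chunk b"; "chunk c" would exceed the budget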