
Commit 1cf8095

committed
added evaluation scores
1 parent df979d7 commit 1cf8095

File tree

1 file changed: +138 -0 lines changed


scores/evaluate.py

@@ -0,0 +1,138 @@
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu
import numpy as np
import pandas as pd
import sklearn.metrics
import itertools
import pyter


# Load the COQA comparison file and build two parallel prompt strings per row:
# one ending in the gold answer and one ending in the ChatGPT response.
df = pd.read_csv('./QA-Comparision/COQA.csv')
df["answer_prompt"] = df["text"].astype(str) + " " + df["question"].astype(str) + " The answer is " + df["answer"].astype(str)
df["chatgpt_prompt"] = df["text"].astype(str) + " " + df["question"].astype(str) + " The answer is " + df["chatgpt_response"].astype(str)

# Collect the two prompt columns as plain Python lists.
answer_prompt = []
for i in df['answer_prompt']:
    answer_prompt.append(i)

chatgpt_prompt = []
for j in df['chatgpt_prompt']:
    chatgpt_prompt.append(j)

def bleu(ref, gen):
    '''
    Calculate pairwise BLEU score using the nltk corpus_bleu implementation.
    Args:
        ref: a list of reference sentences
        gen: a list of candidate (generated) sentences
    Returns:
        bleu score (float)
    '''
    ref_bleu = []
    gen_bleu = []
    for l in gen:
        gen_bleu.append(l.split())
    for i, l in enumerate(ref):
        ref_bleu.append([l.split()])
    cc = SmoothingFunction()
    score_bleu = corpus_bleu(ref_bleu, gen_bleu, weights=(0, 1, 0, 0), smoothing_function=cc.method4)
    return score_bleu
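
# Toy sanity check of bleu() on made-up sentences (illustrative strings only,
# not drawn from the COQA data): an exact match should score close to 1.0,
# while a paraphrase with little n-gram overlap scores much lower.
print("toy BLEU (identical):", bleu(["the cat sat on the mat"], ["the cat sat on the mat"]))
print("toy BLEU (paraphrase):", bleu(["the cat sat on the mat"], ["a cat was sitting on a mat"]))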


# supporting function
def _split_into_words(sentences):
    """Splits multiple sentences into words and flattens the result."""
    return list(itertools.chain(*[_.split(" ") for _ in sentences]))

# supporting function
def _get_word_ngrams(n, sentences):
    """Calculates word n-grams for multiple sentences."""
    assert len(sentences) > 0
    assert n > 0

    words = _split_into_words(sentences)
    return _get_ngrams(n, words)

# supporting function
def _get_ngrams(n, text):
    """Calculates n-grams.
    Args:
        n: which n-grams to calculate
        text: An array of tokens
    Returns:
        A set of n-grams
    """
    ngram_set = set()
    text_length = len(text)
    max_index_ngram_start = text_length - n
    for i in range(max_index_ngram_start + 1):
        ngram_set.add(tuple(text[i:i + n]))
    return ngram_set
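
# Small illustration of the helpers above on hypothetical tokens (not COQA data):
# _get_ngrams(2, ["the", "cat", "sat", "on", "mat"]) yields the bigram set
# {("the", "cat"), ("cat", "sat"), ("sat", "on"), ("on", "mat")}, which is what
# rouge_n() below counts overlaps over.
print("toy bigrams:", _get_word_ngrams(2, ["the cat sat on mat"]))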

def rouge_n(reference_sentences, evaluated_sentences, n=2):
    """
    Computes ROUGE-N of two text collections of sentences.
    Source: http://research.microsoft.com/en-us/um/people/cyl/download/
    papers/rouge-working-note-v1.3.1.pdf
    Args:
        reference_sentences: The sentences from the reference set
        evaluated_sentences: The sentences that have been picked by the summarizer
        n: Size of ngram. Defaults to 2.
    Returns:
        recall rouge score (float)
    Raises:
        ValueError: raises exception if a param has len <= 0
    """
    if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
        raise ValueError("Collections must contain at least 1 sentence.")

    evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences)
    reference_ngrams = _get_word_ngrams(n, reference_sentences)
    reference_count = len(reference_ngrams)
    evaluated_count = len(evaluated_ngrams)

    # Gets the overlapping ngrams between evaluated and reference
    overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)
    overlapping_count = len(overlapping_ngrams)

    # Handle edge case. This isn't mathematically correct, but it's good enough
    if evaluated_count == 0:
        precision = 0.0
    else:
        precision = overlapping_count / evaluated_count

    if reference_count == 0:
        recall = 0.0
    else:
        recall = overlapping_count / reference_count

    f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))

    # just returning recall in rouge, useful for our purpose
    return recall
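
# Toy check of rouge_n() on made-up sentences (not COQA data): identical
# sentences give bigram recall 1.0, sentences with no shared bigrams give 0.0.
print("toy ROUGE-2 (identical):", rouge_n(["the cat sat on the mat"], ["the cat sat on the mat"]))
print("toy ROUGE-2 (disjoint):", rouge_n(["the cat sat on the mat"], ["dogs bark loudly at night"]))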


def ter(ref, gen):
    '''
    Args:
        ref: reference sentences, in a list
        gen: generated sentences, in a list
    Returns:
        averaged TER score over all sentence pairs
    '''
    if len(ref) == 1:
        total_score = pyter.ter(gen[0].split(), ref[0].split())
        print('x')
    else:
        total_score = 0
        for i in range(len(gen)):
            print(i)
            total_score = total_score + pyter.ter(gen[i].split(), ref[i].split())
        total_score = total_score / len(gen)
    return total_score
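
# Toy check of ter() on made-up sentences (not COQA data): identical sentences
# need no edits (score 0.0), and a single word substitution gives a small
# non-zero score (roughly one edit over six reference words).
print("toy TER (identical):", ter(["the cat sat on the mat"], ["the cat sat on the mat"]))
print("toy TER (one substitution):", ter(["the cat sat on the mat"], ["the dog sat on the mat"]))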

# Corpus-level scores comparing the gold-answer prompts with the ChatGPT prompts.
# jaccard_score treats each whole prompt string as a single class label, so an
# explicit average mode is needed for the multiclass case.
print("Jaccard score: ", sklearn.metrics.jaccard_score(answer_prompt, chatgpt_prompt, average='micro'))
print("BLEU score: ", bleu(answer_prompt, chatgpt_prompt))
print("Rouge score: ", rouge_n(answer_prompt, chatgpt_prompt))
print("TER score: ", ter(answer_prompt, chatgpt_prompt))
