from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
import pandas as pd
import sklearn.metrics
from sklearn.feature_extraction.text import CountVectorizer
import itertools
import pyter


df = pd.read_csv('./QA-Comparision/COQA.csv')

# Build comparable strings: passage + question + "The answer is" followed by either
# the gold answer or the ChatGPT response, so the two lists line up pair-wise.
df["answer_prompt"] = df["text"].astype(str) + " " + df["question"].astype(str) + " The answer is " + df["answer"].astype(str)
df["chatgpt_prompt"] = df["text"].astype(str) + " " + df["question"].astype(str) + " The answer is " + df["chatgpt_response"].astype(str)

answer_prompt = df["answer_prompt"].tolist()
chatgpt_prompt = df["chatgpt_prompt"].tolist()


def bleu(ref, gen):
    '''
    Calculates a corpus-level BLEU score using the NLTK implementation.
    Args:
        ref: a list of reference sentences
        gen: a list of candidate (generated) sentences
    Returns:
        BLEU score (float)
    '''
    ref_bleu = []
    gen_bleu = []
    for l in gen:
        gen_bleu.append(l.split())
    for l in ref:
        ref_bleu.append([l.split()])
    cc = SmoothingFunction()
    # weights=(0, 1, 0, 0) scores bigram precision only
    score_bleu = corpus_bleu(ref_bleu, gen_bleu, weights=(0, 1, 0, 0), smoothing_function=cc.method4)
    return score_bleu
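
# Illustrative sanity check on made-up sentences (not from the dataset): with
# weights=(0, 1, 0, 0) only bigram precision is scored, so an identical
# reference/candidate pair comes out at 1.0, e.g.
#   bleu(["the cat sat on the mat"], ["the cat sat on the mat"])  # -> 1.0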


# supporting function
def _split_into_words(sentences):
    """Splits multiple sentences into words and flattens the result."""
    return list(itertools.chain(*[_.split(" ") for _ in sentences]))


# supporting function
def _get_word_ngrams(n, sentences):
    """Calculates word n-grams for multiple sentences."""
    assert len(sentences) > 0
    assert n > 0

    words = _split_into_words(sentences)
    return _get_ngrams(n, words)


# supporting function
def _get_ngrams(n, text):
    """Calculates n-grams.
    Args:
        n: which n-grams to calculate
        text: An array of tokens
    Returns:
        A set of n-grams
    """
    ngram_set = set()
    text_length = len(text)
    max_index_ngram_start = text_length - n
    for i in range(max_index_ngram_start + 1):
        ngram_set.add(tuple(text[i:i + n]))
    return ngram_set
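
# Illustrative example (hypothetical tokens): the bigrams of a three-word sentence.
#   _get_ngrams(2, ["the", "cat", "sat"])  # -> {("the", "cat"), ("cat", "sat")}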

def rouge_n(reference_sentences, evaluated_sentences, n=2):
    """
    Computes ROUGE-N of two text collections of sentences.
    Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/rouge-working-note-v1.3.1.pdf
    Args:
        reference_sentences: The sentences from the reference set
        evaluated_sentences: The sentences that have been picked by the summarizer
        n: Size of ngram. Defaults to 2.
    Returns:
        recall ROUGE score (float)
    Raises:
        ValueError: raises exception if a param has len <= 0
    """
    if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
        raise ValueError("Collections must contain at least 1 sentence.")

    evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences)
    reference_ngrams = _get_word_ngrams(n, reference_sentences)
    reference_count = len(reference_ngrams)
    evaluated_count = len(evaluated_ngrams)

    # Gets the overlapping ngrams between evaluated and reference
    overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)
    overlapping_count = len(overlapping_ngrams)

    # Handle edge case. This isn't mathematically correct, but it's good enough
    if evaluated_count == 0:
        precision = 0.0
    else:
        precision = overlapping_count / evaluated_count

    if reference_count == 0:
        recall = 0.0
    else:
        recall = overlapping_count / reference_count

    f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))

    # just returning recall in rouge, useful for our purpose
    return recall
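
# Illustrative example (made-up sentences, not from the dataset): the reference
# has 5 distinct bigrams and 2 of them appear in the evaluated sentence, so the
# returned recall is 2/5 = 0.4.
#   rouge_n(["the cat sat on the mat"], ["the cat sat"], n=2)  # -> 0.4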


def ter(ref, gen):
    '''
    Calculates the averaged Translation Edit Rate (TER) using pyter.
    Args:
        ref: reference sentences, in a list
        gen: generated sentences, in a list
    Returns:
        averaged TER score over all sentence pairs
    '''
    if len(ref) == 1:
        total_score = pyter.ter(gen[0].split(), ref[0].split())
    else:
        total_score = 0
        for i in range(len(gen)):
            total_score = total_score + pyter.ter(gen[i].split(), ref[i].split())
        total_score = total_score / len(gen)
    return total_score
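
# Illustrative example (made-up sentences): an identical hypothesis/reference
# pair needs zero edits, so the averaged TER is 0.0.
#   ter(["the cat sat"], ["the cat sat"])  # -> 0.0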

# sklearn's jaccard_score expects label arrays rather than raw strings, so the
# prompts are first encoded as binary bag-of-words vectors and the per-sample
# Jaccard similarity is averaged over the corpus.
vectorizer = CountVectorizer(binary=True).fit(answer_prompt + chatgpt_prompt)
answer_vectors = vectorizer.transform(answer_prompt)
chatgpt_vectors = vectorizer.transform(chatgpt_prompt)

print("Jaccard score: ", sklearn.metrics.jaccard_score(answer_vectors, chatgpt_vectors, average="samples"))
print("BLEU score: ", bleu(answer_prompt, chatgpt_prompt))
print("ROUGE score: ", rouge_n(answer_prompt, chatgpt_prompt))
print("TER score: ", ter(answer_prompt, chatgpt_prompt))