from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
import pandas as pd
import sklearn.metrics
from sklearn.feature_extraction.text import CountVectorizer
import itertools
import pyter


df = pd.read_csv('./QA-Comparision/COQA.csv')

# Build comparable strings: passage + question + "The answer is" followed by either
# the gold answer or the ChatGPT response, so the two lists line up pair-wise.
df["answer_prompt"] = df["text"].astype(str) + " " + df["question"].astype(str) + " The answer is " + df["answer"].astype(str)
df["chatgpt_prompt"] = df["text"].astype(str) + " " + df["question"].astype(str) + " The answer is " + df["chatgpt_response"].astype(str)

answer_prompt = df["answer_prompt"].tolist()
chatgpt_prompt = df["chatgpt_prompt"].tolist()


def bleu(ref, gen):
    '''
    Calculates a corpus-level BLEU score using the NLTK implementation.
    Args:
        ref: a list of reference sentences
        gen: a list of candidate (generated) sentences
    Returns:
        BLEU score (float)
    '''
    ref_bleu = []
    gen_bleu = []
    for l in gen:
        gen_bleu.append(l.split())
    for l in ref:
        ref_bleu.append([l.split()])
    cc = SmoothingFunction()
    # weights=(0, 1, 0, 0) scores bigram precision only
    score_bleu = corpus_bleu(ref_bleu, gen_bleu, weights=(0, 1, 0, 0), smoothing_function=cc.method4)
    return score_bleu
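
# Illustrative sanity check on made-up sentences (not from the dataset): with
# weights=(0, 1, 0, 0) only bigram precision is scored, so an identical
# reference/candidate pair comes out at 1.0, e.g.
#   bleu(["the cat sat on the mat"], ["the cat sat on the mat"])  # -> 1.0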


# supporting function
def _split_into_words(sentences):
    """Splits multiple sentences into words and flattens the result."""
    return list(itertools.chain(*[_.split(" ") for _ in sentences]))


# supporting function
def _get_word_ngrams(n, sentences):
    """Calculates word n-grams for multiple sentences."""
    assert len(sentences) > 0
    assert n > 0

    words = _split_into_words(sentences)
    return _get_ngrams(n, words)


# supporting function
def _get_ngrams(n, text):
    """Calculates n-grams.
    Args:
        n: which n-grams to calculate
        text: An array of tokens
    Returns:
        A set of n-grams
    """
    ngram_set = set()
    text_length = len(text)
    max_index_ngram_start = text_length - n
    for i in range(max_index_ngram_start + 1):
        ngram_set.add(tuple(text[i:i + n]))
    return ngram_set
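
# Illustrative example (hypothetical tokens): the bigrams of a three-word sentence.
#   _get_ngrams(2, ["the", "cat", "sat"])  # -> {("the", "cat"), ("cat", "sat")}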

def rouge_n(reference_sentences, evaluated_sentences, n=2):
    """
    Computes ROUGE-N of two text collections of sentences.
    Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/rouge-working-note-v1.3.1.pdf
    Args:
        reference_sentences: The sentences from the reference set
        evaluated_sentences: The sentences that have been picked by the summarizer
        n: Size of ngram. Defaults to 2.
    Returns:
        recall ROUGE score (float)
    Raises:
        ValueError: raises exception if a param has len <= 0
    """
    if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
        raise ValueError("Collections must contain at least 1 sentence.")

    evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences)
    reference_ngrams = _get_word_ngrams(n, reference_sentences)
    reference_count = len(reference_ngrams)
    evaluated_count = len(evaluated_ngrams)

    # Gets the overlapping ngrams between evaluated and reference
    overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)
    overlapping_count = len(overlapping_ngrams)

    # Handle edge case. This isn't mathematically correct, but it's good enough
    if evaluated_count == 0:
        precision = 0.0
    else:
        precision = overlapping_count / evaluated_count

    if reference_count == 0:
        recall = 0.0
    else:
        recall = overlapping_count / reference_count

    f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))

    # just returning recall in rouge, useful for our purpose
    return recall
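
# Illustrative example (made-up sentences, not from the dataset): the reference
# has 5 distinct bigrams and 2 of them appear in the evaluated sentence, so the
# returned recall is 2/5 = 0.4.
#   rouge_n(["the cat sat on the mat"], ["the cat sat"], n=2)  # -> 0.4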


def ter(ref, gen):
    '''
    Calculates the averaged Translation Edit Rate (TER) using pyter.
    Args:
        ref: reference sentences, in a list
        gen: generated sentences, in a list
    Returns:
        averaged TER score over all sentence pairs
    '''
    if len(ref) == 1:
        total_score = pyter.ter(gen[0].split(), ref[0].split())
    else:
        total_score = 0
        for i in range(len(gen)):
            total_score = total_score + pyter.ter(gen[i].split(), ref[i].split())
        total_score = total_score / len(gen)
    return total_score
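
# Illustrative example (made-up sentences): an identical hypothesis/reference
# pair needs zero edits, so the averaged TER is 0.0.
#   ter(["the cat sat"], ["the cat sat"])  # -> 0.0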

# sklearn's jaccard_score expects label arrays rather than raw strings, so the
# prompts are first encoded as binary bag-of-words vectors and the per-sample
# Jaccard similarity is averaged over the corpus.
vectorizer = CountVectorizer(binary=True).fit(answer_prompt + chatgpt_prompt)
answer_vectors = vectorizer.transform(answer_prompt)
chatgpt_vectors = vectorizer.transform(chatgpt_prompt)

print("Jaccard score: ", sklearn.metrics.jaccard_score(answer_vectors, chatgpt_vectors, average="samples"))
print("BLEU score: ", bleu(answer_prompt, chatgpt_prompt))
print("ROUGE score: ", rouge_n(answer_prompt, chatgpt_prompt))
print("TER score: ", ter(answer_prompt, chatgpt_prompt))