IIT-DM
diff --git a/‎BERT_similarity/SNLI_Corpus/snli_1.0_dev.csv
+10,001 b/‎BERT_similarity/SNLI_Corpus/snli_1.0_dev.csv
+10,001
diff --git a/‎BERT_similarity/SNLI_Corpus/snli_1.0_test.csv
+10,001 b/‎BERT_similarity/SNLI_Corpus/snli_1.0_test.csv
+10,001
diff --git a/‎BERT_similarity/SNLI_Corpus/snli_1.0_train.csv
+550,153 b/‎BERT_similarity/SNLI_Corpus/snli_1.0_train.csv
+550,153
diff --git a/‎BERT_similarity/pretrained_QA.py
+67 b/‎BERT_similarity/pretrained_QA.py
+67
diff --git a/‎BERT_similarity/test_BERT.py
+47 b/‎BERT_similarity/test_BERT.py
+47
diff --git a/‎BERT_similarity/train_BERT.py
+145 b/‎BERT_similarity/train_BERT.py
+145
diff --git a/‎fact_check.py
+96 b/‎fact_check.py
+96
diff --git a/‎post_process.py
+21 b/‎post_process.py
+21
@@ -0,0 +1,67 @@
+import pandas as pd
+import numpy as np
+
+import torch
+import torch.nn as nn
+from transformers import BertForQuestionAnswering
+from transformers import BertTokenizer
+
+
+def data_processing_coqa(dataset, output_path="CoQA_dataset.csv"):
+    """Flatten a CoQA-style JSON file into one (text, question, answer) row per question.
+
+    Every cell of the ``data`` column is expected to be a mapping with a
+    ``story`` string and index-aligned ``questions`` / ``answers`` lists whose
+    items carry an ``input_text`` field.
+
+    Args:
+        dataset: Path (or any other input ``pandas.read_json`` accepts) of the
+            JSON file to convert.
+        output_path: Where the CSV is written. Defaults to the historical
+            ``"CoQA_dataset.csv"`` in the current working directory.
+
+    Returns:
+        The result of ``DataFrame.to_csv`` (``None`` when a path is given).
+
+    Raises:
+        IndexError: If a story has more questions than answers.
+    """
+    coqa = pd.read_json(dataset)
+
+    rows = []
+    # Only the "data" column is consumed, so other columns (e.g. "version")
+    # do not have to be removed first.
+    for entry in coqa["data"]:
+        story = entry["story"]
+        answers = entry["answers"]
+        for position, question in enumerate(entry["questions"]):
+            rows.append([story, question["input_text"], answers[position]["input_text"]])
+
+    coqa_new = pd.DataFrame(rows, columns=["text", "question", "answer"])
+    return coqa_new.to_csv(output_path, index=False)
+
+
+
+def evaluating_QA_BERT(question, text):
+    """Extract the answer to ``question`` from ``text`` with a BERT QA model and print it.
+
+    The ``bert-large-uncased-whole-word-masking-finetuned-squad`` checkpoint and
+    its tokenizer are loaded on every call.
+
+    Args:
+        question: The question string.
+        text: The passage that should contain the answer.
+
+    Returns:
+        The answer string before capitalisation, or the fallback message when
+        no usable span is predicted. The capitalised answer is also printed.
+    """
+    model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
+    load_model = BertForQuestionAnswering.from_pretrained(model_name)
+    tokenizer = BertTokenizer.from_pretrained(model_name)
+
+    input_ids = tokenizer.encode(question, text)
+    tokens = tokenizer.convert_ids_to_tokens(input_ids)
+
+    # Segment A runs up to and including the first [SEP]; segment B is the rest.
+    num_seg_a = input_ids.index(tokenizer.sep_token_id) + 1
+    num_seg_b = len(input_ids) - num_seg_a
+    segment_ids = [0] * num_seg_a + [1] * num_seg_b
+    assert len(segment_ids) == len(input_ids)
+
+    # Inference only: no gradients are needed.
+    with torch.no_grad():
+        output = load_model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))
+
+    answer_start = int(torch.argmax(output.start_logits))
+    answer_end = int(torch.argmax(output.end_logits))
+
+    fallback = "Unable to find the answer to your question."
+    # Start from the fallback so an inverted span (end before start) no longer
+    # leaves ``answer`` unbound.
+    answer = fallback
+    if answer_end >= answer_start:
+        answer = tokens[answer_start]
+        for token in tokens[answer_start + 1:answer_end + 1]:
+            if token.startswith("##"):
+                # WordPiece continuation: glue onto the previous piece.
+                answer += token[2:]
+            else:
+                answer += " " + token
+
+    # A span starting at [CLS] is treated as "no answer found".
+    if answer.startswith("[CLS]"):
+        answer = fallback
+    print("\nAnswer:\n{}".format(answer.capitalize()))
+    return answer
+
+
+# Sample passage and question for a manual smoke test of evaluating_QA_BERT.
+text = """
+My name is Aman. I am 22 years old.
+I am an out-of-the-box thinker with excellent programming and analytical skills. 
+I create deployable, learning-enabled, and resource-constrained systems. 
+My research interests include computer vision, image processing, deep learning, and human-computer interaction. 
+I love traveling to new places, eating different food, and connecting with new people. Tech enthusiast, explorer, and pacifist.
+"""
+question = "What are the research interests of Aman?"
+
+# Uncomment to run the demo (evaluating_QA_BERT loads the model on every call).
+# evaluating_QA_BERT(question, text)
+
@@ -0,0 +1,47 @@
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+import transformers
+from train_BERT import DataLoader
+
+# Keras model loaded once at import time; the file name (including its
+# spelling) must match the saved model file on disk.
+load_model = tf.keras.models.load_model('./sematic-similarity.h5')
+# Class names, indexed by the argmax of the model output in check_similarity.
+labels = ["contradiction", "entailment", "neutral"]
+def check_similarity(sentence1, sentence2):
+    """Classify one sentence pair with the loaded model.
+
+    Both inputs are converted with ``str()``, wrapped in a single-item
+    ``DataLoader`` batch and scored by ``load_model``. The winning class name
+    is printed as a side effect.
+
+    Args:
+        sentence1: First sentence of the pair.
+        sentence2: Second sentence of the pair.
+
+    Returns:
+        Tuple ``(proba, pred, sentence1, sentence2)`` where ``proba`` is the
+        top class score formatted as ``" 0.00%"`` (raw score, two decimals,
+        with a ``%`` suffix) and ``pred`` is the matching entry of ``labels``.
+    """
+    pair = np.array([[str(sentence1), str(sentence2)]])
+    loader = DataLoader(pair, labels=None, batch_size=1, shuffle=False, include_labels=False)
+
+    # First (and only) batch, first (and only) row of scores.
+    scores = load_model.predict(loader[0])[0]
+    best = np.argmax(scores)
+    pred = labels[best]
+    print(pred)
+    # print(proba)
+    return f"{scores[best]: .2f}%", pred, sentence1, sentence2
+
+# Score every (reference response, ChatGPT response) pair and store the result.
+df = pd.read_csv('../DialFact_withResponse.csv')
+# df = df.iloc[10:]
+# df["answer_prompt"] = df["context"].astype(str) + " " + df["question"].astype(str) + " The answer is "+ df["answer"].astype(str)
+# df["chatgpt_prompt"] = df["text"].astype(str) + " " + df["question"].astype(str) + " The answer is "+ df["chatgpt_response"].astype(str)
+
+# Row-aligned inputs. The ChatGPT list previously appended the stale loop
+# variable of the first loop, so every entry was the last reference response.
+answer_prompt = list(df['response'])
+chatgpt_prompt = list(df['chatgpt_response'])
+
+similarity_p = []
+label_p = []
+for i, j in zip(answer_prompt, chatgpt_prompt):
+    # One model call per pair; check_similarity returns
+    # (proba, pred, sentence1, sentence2).
+    similarity, label = check_similarity(i, j)[:2]
+    print(label)
+    similarity_p.append(similarity)
+    label_p.append(label)
+
+
+df["BERT similarity"] = similarity_p
+df["NLI Label"] = label_p
+
+df.to_csv("DialFact_withGoldenR_label.csv", index=True)
@@ -0,0 +1,145 @@
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+import transformers
+
+
+# Class names; the position in this list is the integer class id used below.
+labels = ["contradiction", "entailment", "neutral"]
+
+# Only the first 100000 training rows are read.
+x_train = pd.read_csv("./SNLI_Corpus/snli_1.0_train.csv", nrows=100000)
+x_val = pd.read_csv("./SNLI_Corpus/snli_1.0_dev.csv")
+x_test = pd.read_csv("./SNLI_Corpus/snli_1.0_test.csv")
+x_train.dropna(axis=0, inplace=True)
+
+# Rows whose gold label is "-" are discarded; train and validation rows are
+# then shuffled reproducibly.
+x_train = (
+    x_train[x_train.similarity != "-"]
+    .sample(frac=1.0, random_state=42)
+    .reset_index(drop=True)
+)
+x_val = (
+    x_val[x_val.similarity != "-"]
+    .sample(frac=1.0, random_state=42)
+    .reset_index(drop=True)
+)
+# The test split gets the same "-" filter (row order kept); otherwise those
+# rows would be silently encoded as class 2 ("neutral") below.
+x_test = x_test[x_test.similarity != "-"].reset_index(drop=True)
+
+# print(f"Total train samples : {x_train.shape[0]}")
+# print(f"Total validation samples: {x_val.shape[0]}")
+# print(f"Total test samples: {x_test.shape[0]}")
+# print(x_train.similarity.value_counts())
+
+
+def _encode_label(name):
+    """Map a label string to its class id; anything unrecognised maps to 2."""
+    return 0 if name == "contradiction" else 1 if name == "entailment" else 2
+
+
+x_train["label"] = x_train["similarity"].apply(_encode_label)
+x_val["label"] = x_val["similarity"].apply(_encode_label)
+x_test["label"] = x_test["similarity"].apply(_encode_label)
+
+# One-hot targets matching the 3-unit softmax output.
+y_train = tf.keras.utils.to_categorical(x_train.label, num_classes=3)
+y_val = tf.keras.utils.to_categorical(x_val.label, num_classes=3)
+y_test = tf.keras.utils.to_categorical(x_test.label, num_classes=3)
+
+
+# Tokenised sequence length, default batch size and epoch count.
+max_length = 128
+batch_size = 32
+epochs = 2
+
+class DataLoader(tf.keras.utils.Sequence):
+    """Generates tokenised batches of sentence pairs.
+
+    Args:
+        sentences: Array of [premise, hypothesis] sentence pairs.
+        labels: Array of labels (may be ``None`` when ``include_labels=False``).
+        batch_size: Integer batch size.
+        shuffle: boolean, whether to shuffle the sample order (done once, at
+            construction, with a fixed seed).
+        include_labels: boolean, whether to include the labels.
+
+    Returns:
+        Tuples ``([input_ids, attention_mask, token_type_ids], labels)``
+        (or just ``[input_ids, attention_mask, token_type_ids]``
+        if ``include_labels=False``).
+    """
+
+    def __init__(
+        self,
+        sentences,
+        labels,
+        batch_size=batch_size,
+        shuffle=True,
+        include_labels=True,
+    ):
+        # Let the Keras base class initialise its own state.
+        super().__init__()
+        self.sentences = sentences
+        self.labels = labels
+        self.shuffle = shuffle
+        self.batch_size = batch_size
+        self.include_labels = include_labels
+        self.tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
+        self.indexes = np.arange(len(self.sentences))
+        self.shuffle_data()
+
+    def __len__(self):
+        """Number of full batches; a trailing partial batch is not counted."""
+        return len(self.sentences) // self.batch_size
+
+    def __getitem__(self, idx):
+        """Tokenise and return batch number ``idx``."""
+        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
+        sentences = self.sentences[indexes]
+        # Each row is a [sentence1, sentence2] pair and is encoded jointly.
+        # Explicit padding/truncation replaces the deprecated
+        # ``pad_to_max_length=True`` and keeps every row at ``max_length``.
+        encoded = self.tokenizer.batch_encode_plus(
+            sentences.tolist(),
+            add_special_tokens=True,
+            max_length=max_length,
+            return_attention_mask=True,
+            return_token_type_ids=True,
+            padding="max_length",
+            truncation=True,
+            return_tensors="tf",
+        )
+
+        sentence_inputs = np.array(encoded["input_ids"], dtype="int32")
+        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
+        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")
+
+        if self.include_labels:
+            labels = np.array(self.labels[indexes], dtype="int32")
+            return [sentence_inputs, attention_masks, token_type_ids], labels
+        else:
+            return [sentence_inputs, attention_masks, token_type_ids]
+
+    def shuffle_data(self):
+        """Shuffle the sample order in place with a fixed seed when enabled."""
+        if self.shuffle:
+            np.random.RandomState(50).shuffle(self.indexes)
+
+
+# Three int32 inputs of length max_length, in the same order DataLoader
+# returns them: token ids, attention mask, token type ids.
+sentence_inputs = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
+attention_masks = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="attention_masks")
+token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="token_type_ids")
+
+# Pretrained BERT encoder, left trainable so its weights can be updated too.
+bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased")
+bert_model.trainable = True
+bert_output = bert_model.bert(sentence_inputs, attention_mask=attention_masks, token_type_ids=token_type_ids)
+# Per-token hidden states (the pooled output is not used).
+sequence_output = bert_output.last_hidden_state
+
+# pooled_output = bert_output.pooler_output
+# Head: BiLSTM over the token states, average- and max-pooled over the
+# sequence, concatenated, dropout, then a 3-way softmax.
+lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(sequence_output)
+avg_pool = tf.keras.layers.GlobalAveragePooling1D()(lstm)
+max_pool = tf.keras.layers.GlobalMaxPooling1D()(lstm)
+x = tf.keras.layers.concatenate([avg_pool, max_pool])
+x = tf.keras.layers.Dropout(0.3)(x)
+out = tf.keras.layers.Dense(3, activation="softmax")(x)
+model = tf.keras.models.Model(inputs=[sentence_inputs, attention_masks, token_type_ids], outputs=out)
+
+# model.summary()
+
+# Batch generators over the [sentence1, sentence2] columns; only the training
+# generator shuffles its sample order.
+train_data = DataLoader(
+    x_train[["sentence1", "sentence2"]].values.astype("str"),
+    y_train,
+    batch_size=batch_size,
+    shuffle=True,
+)
+valid_data = DataLoader(
+    x_val[["sentence1", "sentence2"]].values.astype("str"),
+    y_val,
+    batch_size=batch_size,
+    shuffle=False,
+)
+test_data = DataLoader(
+    x_test[["sentence1", "sentence2"]].values.astype("str"),
+    y_test,
+    batch_size=batch_size,
+    shuffle=False,
+)
+
@@ -0,0 +1,96 @@
+import json
+import torch
+import torch.nn as nn
+from torch.optim.lr_scheduler import StepLR
+import random
+from tqdm import tqdm
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
+
+num_epochs = 8
+# Context managers close every file handle deterministically.
+with open('./supporting.json') as supporting_file:
+    supporting_texts = json.load(supporting_file)
+with open('./refuting.json') as refuting_file:
+    refuting_texts = json.load(refuting_file)
+
+
+# NOTE(review): not referenced anywhere in the code shown here.
+split = 0.8
+
+# Every example is "<text><prompt><verdict>": 'Yes.' for supporting texts,
+# 'Nope.' for refuting texts.
+_prompt = '\n\n\nThe evidence supports the claim:\n'
+train_list = [item + _prompt + 'Yes.' for item in supporting_texts]
+train_list += [item + _prompt + 'Nope.' for item in refuting_texts]
+random.shuffle(train_list)
+
+# Persist the shuffled examples; the file is closed (and so flushed) before it
+# is read back.
+with open('./train_list.json', 'w') as train_list_file:
+    json.dump(train_list, train_list_file)
+
+with open('./train_list.json') as train_list_file:
+    train_list = json.load(train_list_file)
+
+print(train_list[0])
+
+def chunks(lst, n):
+    """Yield consecutive slices of ``lst`` that hold at most ``n`` items each."""
+    starts = range(0, len(lst), n)
+    yield from (lst[begin:begin + n] for begin in starts)
+
+def batchify(data, n):
+    """Stack equal-length tensors into batches of at most ``n`` rows.
+
+    Items are grouped by ``item.shape[1]`` so that every batch can be stacked
+    without padding; groups keep the order in which their length first
+    appears in ``data``.
+
+    Args:
+        data: Iterable of tensors with at least two dimensions; only
+            ``item[0]`` of each tensor is kept (presumably items are shaped
+            ``(1, length)`` as produced by the tokenizer - confirm with caller).
+        n: Maximum number of items per batch.
+
+    Returns:
+        List of tensors, each the ``torch.stack`` of up to ``n`` rows of the
+        same length.
+    """
+    by_length = {}
+    for item in data:
+        # setdefault replaces the former bare ``except:`` that would also have
+        # hidden unrelated errors.
+        by_length.setdefault(item.shape[1], []).append(item)
+
+    batches = []
+    for same_length in by_length.values():
+        for chunk in chunks(same_length, n):
+            batches.append(torch.stack([item[0] for item in chunk]))
+
+    return batches
+
+
+# Pretrained GPT-2 tokenizer and language model; the model is moved to the
+# GPU, so a CUDA device is required.
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+model = GPT2LMHeadModel.from_pretrained('gpt2')
+model.cuda()
+# NOTE(review): criterion is handed to train(), but train() as visible in this
+# file takes its loss from the model output instead - confirm it is needed.
+criterion = nn.BCEWithLogitsLoss()
+optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
+
+# Maximum number of tokens per example (presumably GPT-2's context size -
+# TODO confirm); longer examples are skipped, not truncated.
+_limit = 1024
+data = []
+total_skipped = 0
+for item in train_list:
+    tokens = tokenizer.encode(item, return_tensors='pt')
+    if tokens.shape[1] > _limit:
+        total_skipped += 1
+        continue
+    data.append(tokens)
+print(f'Skipped {total_skipped} out of {len(train_list)}')
+
+# Batch size 1: every example becomes its own batch.
+train_batches = batchify(data, 1)
+
+def train(train_model, batches, optimizer, criterion):
+    """Run one training pass over ``batches`` and return the mean loss.
+
+    Each batch is used both as input and as ``labels``, so the loss is the
+    first element of the model's own output.
+
+    Args:
+        train_model: Model to optimise; called as
+            ``train_model(inputs, labels=inputs)``.
+        batches: Sized collection of input tensors (moved to the GPU here).
+        optimizer: Optimizer stepping ``train_model``'s parameters.
+        criterion: Unused; kept so existing callers keep working.
+
+    Returns:
+        Average loss per batch as a float, or ``0.0`` when ``batches`` is
+        empty.
+    """
+    if not batches:
+        # Nothing to train on; avoid dividing by zero below.
+        return 0.0
+
+    # Put the model that was passed in (not the module-level one) into
+    # training mode, once per pass.
+    train_model.train()
+
+    total_loss = 0.
+    for batch in tqdm(batches, total=len(batches)):
+        inputs = batch.cuda()
+        optimizer.zero_grad()
+        loss = train_model(inputs, labels=inputs)[0]
+        loss.backward()
+        # Clip the gradient norm at 0.5 before the optimizer step.
+        torch.nn.utils.clip_grad_norm_(train_model.parameters(), 0.5)
+        optimizer.step()
+        total_loss += loss.item()
+
+    return total_loss / len(batches)
+
+
+# Training driver: shuffle, train one pass per epoch, checkpoint, decay LR.
+random.shuffle(train_batches)
+# Multiply the learning rate by 0.8 every 2 epochs.
+scheduler = StepLR(optimizer, step_size=2, gamma=0.8)
+for epoch in range(num_epochs):
+    # New batch order for every epoch.
+    random.shuffle(train_batches)
+    loss = train(model, train_batches, optimizer, criterion)
+    print('Epoch:', epoch, 'Loss:', loss)
+    # One checkpoint file per epoch: 'save_fever0', 'save_fever1', ...
+    torch.save({'epoch': epoch,
+                'model_state_dict': model.state_dict()},
+                'save_fever' + str(epoch))
+    scheduler.step()
@@ -0,0 +1,21 @@
+import numpy as np
+import pandas as pd
+import math
+
+# Convert the raw 'BERT similarity' scores into whole-number percentages and
+# write the trimmed table to a new CSV.
+data = pd.read_csv('./CoQA_withGoldenR.csv')
+
+inputs = data['BERT similarity']
+new_multiply = []
+for i in inputs:
+    # Stored values look like " 0.95%": drop the padding and the "%" suffix
+    # instead of relying on a fixed-width slice.
+    values = float(str(i).strip().rstrip('%'))
+    # round() rather than math.ceil(): float error makes e.g. 0.07 * 100
+    # slightly larger than 7, which ceil() would turn into 8.
+    new_multiply.append(str(round(values * 100)) + "%")
+
+data['BERT similarity percentage'] = new_multiply
+del data['chatgpt_prompt']
+del data['answer_prompt']
+del data['BERT similarity']
+# Call head() so the first rows are printed, not the bound method.
+print(data.head())
+data.to_csv("CoQA_with_GR.csv", index=True)