Skip to content

Commit f232170

Browse files
committed
uploaded python files of processing and BERT QA
1 parent 1cf8095 commit f232170

9 files changed

+570560
-0
lines changed

BERT_similarity/SNLI_Corpus/snli_1.0_dev.csv

+10,001
Large diffs are not rendered by default.

BERT_similarity/SNLI_Corpus/snli_1.0_test.csv

+10,001
Large diffs are not rendered by default.

BERT_similarity/SNLI_Corpus/snli_1.0_train.csv

+550,153
Large diffs are not rendered by default.

BERT_similarity/pretrained_QA.py

+67
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import pandas as pd
2+
import numpy as np
3+
4+
import torch
5+
import torch.nn as nn
6+
from transformers import BertForQuestionAnswering
7+
from transformers import BertTokenizer
8+
9+
10+
def data_processing_coqa(dataset, output_path="CoQA_dataset.csv"):
    """Flatten a CoQA-style JSON dump into one row per question/answer turn.

    Args:
        dataset: Anything ``pd.read_json`` accepts (path, URL, file-like).
            The parsed frame must expose a ``data`` column whose entries
            hold a ``story`` plus parallel ``questions`` / ``answers`` lists
            of dicts with an ``input_text`` key.
        output_path: Destination passed to ``DataFrame.to_csv``. Defaults to
            ``"CoQA_dataset.csv"``, the file name that used to be hard-coded.

    Returns:
        The return value of ``DataFrame.to_csv`` for ``output_path``.

    Raises:
        IndexError: If an entry has more questions than answers.
    """
    coqa = pd.read_json(dataset)

    rows = []
    # Only the "data" column is consumed. Other top-level keys (such as
    # "version") are simply ignored, so a dump without them no longer fails.
    for entry in coqa["data"]:
        story = entry["story"]
        answers = entry["answers"]
        for turn, question in enumerate(entry["questions"]):
            rows.append(
                [story, question["input_text"], answers[turn]["input_text"]]
            )

    coqa_new = pd.DataFrame(rows, columns=["text", "question", "answer"])
    return coqa_new.to_csv(output_path, index=False)
25+
26+
27+
28+
def evaluating_QA_BERT(question, text):
    """Extract the answer to ``question`` from ``text`` with BERT and print it.

    The SQuAD-fine-tuned checkpoint and its tokenizer are loaded on every
    call. The answer span is taken from the arg-max of the start and end
    logits, word pieces are merged back into words, and the result is
    printed capitalised. A fallback message is printed when the span starts
    at ``[CLS]`` or when the predicted end lies before the predicted start.

    Args:
        question: Question string.
        text: Passage the answer is extracted from.

    Returns:
        None. The answer is written to stdout.
    """
    model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
    load_model = BertForQuestionAnswering.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(model_name)

    input_ids = tokenizer.encode(question, text)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Segment A runs up to and including the first [SEP]; the rest is B.
    num_seg_a = input_ids.index(tokenizer.sep_token_id) + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b
    assert len(segment_ids) == len(input_ids)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = load_model(
            torch.tensor([input_ids]),
            token_type_ids=torch.tensor([segment_ids]),
        )

    answer_start = int(torch.argmax(output.start_logits))
    answer_end = int(torch.argmax(output.end_logits))

    not_found = "Unable to find the answer to your question."
    if answer_end >= answer_start:
        answer = tokens[answer_start]
        for token in tokens[answer_start + 1:answer_end + 1]:
            if token.startswith("##"):
                # Continuation word piece: glue it onto the previous token.
                answer += token[2:]
            else:
                answer += " " + token
    else:
        # Previously `answer` stayed unbound here and the next statement
        # raised UnboundLocalError.
        answer = not_found

    if answer.startswith("[CLS]"):
        answer = not_found
    print("\nAnswer:\n{}".format(answer.capitalize()))
55+
56+
57+
# Sample passage and question for trying out evaluating_QA_BERT by hand.
text = """
My name is Aman. I am 22 years old.
I am an out-of-the-box thinker with excellent programming and analytical skills.
I create deployable, learning-enabled, and resource-constrained systems.
My research interests include computer vision, image processing, deep learning, and human-computer interaction.
I love traveling to new places, eating different food, and connecting with new people. Tech enthusiast, explorer, and pacifist.
"""
question = "What are the research interests of Aman?"

# The demo call is left disabled.
# evaluating_QA_BERT(question, text)
67+

BERT_similarity/test_BERT.py

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import numpy as np
2+
import pandas as pd
3+
import tensorflow as tf
4+
import transformers
5+
from train_BERT import DataLoader
6+
7+
# Keras model restored from an HDF5 file in the working directory; it is the
# model check_similarity calls predict() on.
load_model = tf.keras.models.load_model('./sematic-similarity.h5')
# Class names, looked up by the arg-max position of the model output.
labels = ["contradiction", "entailment", "neutral"]
9+
def check_similarity(sentence1, sentence2):
    """Classify a sentence pair with the loaded model and print the label.

    Args:
        sentence1: First sentence; converted with ``str()`` before encoding.
        sentence2: Second sentence; converted with ``str()`` before encoding.

    Returns:
        Tuple ``(proba, pred, sentence1, sentence2)``. ``pred`` is the class
        name with the highest score. ``proba`` is that raw score formatted
        with a leading space, two decimals and a trailing ``%`` (the value
        is not multiplied by 100).
    """
    pair = np.array([[str(sentence1), str(sentence2)]])
    loader = DataLoader(
        pair,
        labels=None,
        batch_size=1,
        shuffle=False,
        include_labels=False,
    )

    # The loader holds exactly one batch containing the single pair.
    scores = load_model.predict(loader[0])[0]
    best = np.argmax(scores)
    pred = labels[best]
    proba = f"{scores[best]: .2f}%"
    print(pred)
    return proba, pred, sentence1, sentence2
20+
21+
# Score every (gold response, ChatGPT response) pair in the DialFact export
# and write the results next to the original columns.
df = pd.read_csv('../DialFact_withResponse.csv')
# df = df.iloc[10:]
# df["answer_prompt"] = df["context"].astype(str) + " " + df["question"].astype(str) + " The answer is "+ df["answer"].astype(str)
# df["chatgpt_prompt"] = df["text"].astype(str) + " " + df["question"].astype(str) + " The answer is "+ df["chatgpt_response"].astype(str)

answer_prompt = df['response'].tolist()
# The old loop appended the stale variable `i` (the last gold response)
# instead of each ChatGPT response, so every pair compared the wrong text.
chatgpt_prompt = df['chatgpt_response'].tolist()

similarity_p = []
label_p = []
for gold, generated in zip(answer_prompt, chatgpt_prompt):
    # One model call per pair; the old code ran the prediction twice.
    similarity, label, _, _ = check_similarity(gold, generated)
    print(label)
    similarity_p.append(similarity)
    label_p.append(label)

df["BERT similarity"] = similarity_p
df["NLI Label"] = label_p

df.to_csv("DialFact_withGoldenR_label.csv", index=True)

BERT_similarity/train_BERT.py

+145
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
import numpy as np
2+
import pandas as pd
3+
import tensorflow as tf
4+
import transformers
5+
6+
7+
# Class names; a class id is the position of its name in this list.
labels = ["contradiction", "entailment", "neutral"]

x_train = pd.read_csv("./SNLI_Corpus/snli_1.0_train.csv", nrows=100000)
x_val = pd.read_csv("./SNLI_Corpus/snli_1.0_dev.csv")
x_test = pd.read_csv("./SNLI_Corpus/snli_1.0_test.csv")
x_train.dropna(axis=0, inplace=True)

# Rows whose `similarity` is "-" belong to none of the three classes.
# Train and validation rows are additionally shuffled with a fixed seed.
x_train = (
    x_train[x_train.similarity != "-"]
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)
x_val = (
    x_val[x_val.similarity != "-"]
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)
# The test split used to be left unfiltered, so its "-" rows fell through to
# the fallback id below and were counted as "neutral". File order is kept.
x_test = x_test[x_test.similarity != "-"].reset_index(drop=True)

# print(f"Total train samples : {x_train.shape[0]}")
# print(f"Total validation samples: {x_val.shape[0]}")
# print(f"Total test samples: {x_test.shape[0]}")
# print(x_train.similarity.value_counts())

_label_ids = {name: position for position, name in enumerate(labels)}


def _encode_label(name):
    """Return the class id for ``name``; unknown values keep the old fallback 2."""
    return _label_ids.get(name, 2)


x_train["label"] = x_train["similarity"].apply(_encode_label)
x_val["label"] = x_val["similarity"].apply(_encode_label)
x_test["label"] = x_test["similarity"].apply(_encode_label)

# One-hot targets matching the 3-unit softmax output.
y_train = tf.keras.utils.to_categorical(x_train.label, num_classes=3)
y_val = tf.keras.utils.to_categorical(x_val.label, num_classes=3)
y_test = tf.keras.utils.to_categorical(x_test.label, num_classes=3)


# Token length every encoded pair is padded to, loader batch size, and the
# epoch count (not referenced in the visible part of this file).
max_length = 128
batch_size = 32
epochs = 2
42+
43+
class DataLoader(tf.keras.utils.Sequence):
    """Generates batches of BERT-encoded sentence pairs.

    Args:
        sentences: Array of (premise, hypothesis) input sentence pairs.
        labels: Array of labels; may be ``None`` when ``include_labels`` is
            ``False``.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data.
        include_labels: boolean, whether to include the labels.

    Returns:
        Tuples ``([input_ids, attention_mask, token_type_ids], labels)``
        (or just ``[input_ids, attention_mask, token_type_ids]``
        if ``include_labels=False``).
    """

    def __init__(
        self,
        sentences,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_labels=True,
    ):
        # Let the Keras base class initialise itself.
        super().__init__()
        self.sentences = sentences
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_labels = include_labels
        self.tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
        self.indexes = np.arange(len(self.sentences))
        self.shuffle_data()

    def __len__(self):
        # Number of full batches; a trailing partial batch is dropped.
        return len(self.sentences) // self.batch_size

    def __getitem__(self, idx):
        """Return batch number ``idx`` as int32 NumPy arrays."""
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentences = self.sentences[indexes]
        # `padding="max_length"` replaces the deprecated
        # `pad_to_max_length=True`; truncation is requested explicitly so
        # every row comes out exactly `max_length` tokens long.
        encoded = self.tokenizer.batch_encode_plus(
            sentences.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_tensors="tf",
        )

        sentence_inputs = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        if self.include_labels:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [sentence_inputs, attention_masks, token_type_ids], labels
        else:
            return [sentence_inputs, attention_masks, token_type_ids]

    def shuffle_data(self):
        """Permute the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.RandomState(50).shuffle(self.indexes)

    def on_epoch_end(self):
        """Reshuffle between epochs via the Keras end-of-epoch hook."""
        self.shuffle_data()
105+
106+
107+
# Three integer inputs of length `max_length`, one per tokenizer output.
sentence_inputs = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
attention_masks = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="attention_masks")
token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="token_type_ids")

# Pretrained encoder with its weights left trainable.
bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased")
bert_model.trainable = True
# The encoder's `.bert` attribute is called directly on the symbolic inputs.
bert_output = bert_model.bert(sentence_inputs, attention_mask=attention_masks, token_type_ids=token_type_ids)
sequence_output = bert_output.last_hidden_state

# pooled_output = bert_output.pooler_output
# Head: BiLSTM over the sequence output, average- and max-pooled, concatenated,
# then dropout and a 3-way softmax.
lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(sequence_output)
avg_pool = tf.keras.layers.GlobalAveragePooling1D()(lstm)
max_pool = tf.keras.layers.GlobalMaxPooling1D()(lstm)
x = tf.keras.layers.concatenate([avg_pool, max_pool])
x = tf.keras.layers.Dropout(0.3)(x)
out = tf.keras.layers.Dense(3, activation="softmax")(x)
model = tf.keras.models.Model(inputs=[sentence_inputs, attention_masks, token_type_ids], outputs=out)
124+
125+
# model.summary()
126+
127+
def _make_loader(frame, targets, shuffle):
    """Wrap the sentence-pair columns of ``frame`` and ``targets`` in a DataLoader."""
    pairs = frame[["sentence1", "sentence2"]].values.astype("str")
    return DataLoader(pairs, targets, batch_size=batch_size, shuffle=shuffle)


# Only the training loader shuffles; validation and test keep their order.
train_data = _make_loader(x_train, y_train, shuffle=True)
valid_data = _make_loader(x_val, y_val, shuffle=False)
test_data = _make_loader(x_test, y_test, shuffle=False)
145+

fact_check.py

+96
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import json
2+
import torch
3+
import torch.nn as nn
4+
from torch.optim.lr_scheduler import StepLR
5+
import random
6+
from tqdm import tqdm
7+
from transformers import GPT2LMHeadModel, GPT2Tokenizer
8+
9+
num_epochs = 8

# Context managers close each handle; the bare open() calls used before
# never closed their files.
with open('./supporting.json') as handle:
    supporting_texts = json.load(handle)
with open('./refuting.json') as handle:
    refuting_texts = json.load(handle)


# Not referenced anywhere in the visible part of this file.
split = 0.8

# Every example is its source text followed by this prompt and the verdict
# ('Yes.' for supporting texts, 'Nope.' for refuting ones).
_prompt = '\n\n\nThe evidence supports the claim:\n'
train_list = [item + _prompt + 'Yes.' for item in supporting_texts]
train_list += [item + _prompt + 'Nope.' for item in refuting_texts]
random.shuffle(train_list)

# Persist the shuffled list. The write handle is closed before the file is
# read back, so the reload always sees the complete contents.
with open('./train_list.json', 'w') as handle:
    json.dump(train_list, handle)

with open('./train_list.json') as handle:
    train_list = json.load(handle)

print(train_list[0])
26+
27+
def chunks(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))
30+
31+
def batchify(data, n):
    """Group equally long token tensors and stack them into batches.

    Args:
        data: Iterable of tensors. ``item.shape[1]`` is the grouping key and
            ``item[0]`` is the row that gets stacked.
        n: Maximum number of rows per batch.

    Returns:
        List of stacked tensors. Every batch only contains rows of one
        length; groups appear in order of first occurrence.
    """
    by_length = {}
    for item in data:
        # setdefault replaces the bare `except:` used for the first insert,
        # which would also have hidden unrelated errors.
        by_length.setdefault(item.shape[1], []).append(item)

    batches = []
    for vectors in by_length.values():
        for chunk in chunks(vectors, n):
            batches.append(torch.stack([item[0] for item in chunk]))

    return batches
51+
52+
53+
# Pretrained GPT-2 tokenizer and LM-head model; the model is moved to the GPU.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.cuda()
# `criterion` is created here and handed to train() further down.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# Encode every example and keep only those within the token limit.
_limit = 1024
data = []
total_skipped = 0
for item in train_list:
    tokens = tokenizer.encode(item, return_tensors='pt')
    if tokens.shape[1] <= _limit:
        data.append(tokens)
    else:
        total_skipped += 1
print(f'Skipped {total_skipped} out of {len(train_list)}')

# Batches of a single example each.
train_batches = batchify(data, 1)
71+
72+
def train(train_model, batches, optimizer, criterion):
    """Run one optimisation pass over ``batches`` and return the mean loss.

    Args:
        train_model: Model called as ``train_model(inputs, labels=inputs)``;
            the first element of its output is used as the loss.
        batches: Sequence of input tensors, one per optimisation step.
        optimizer: Optimizer stepped once per batch.
        criterion: Unused; kept so existing call sites stay valid.

    Returns:
        Mean of the per-batch loss values, or ``0.0`` when ``batches`` is
        empty (this used to raise ZeroDivisionError).
    """
    if len(batches) == 0:
        return 0.0

    # Switch the model that was passed in to training mode; the old code
    # called the global `model` here instead.
    train_model.train()
    # Send inputs to wherever the model lives rather than assuming CUDA.
    device = next(train_model.parameters()).device

    total_loss = 0.
    for inputs in tqdm(batches, total=len(batches)):
        # One transfer, reused as both input and target.
        inputs = inputs.to(device)
        optimizer.zero_grad()
        loss = train_model(inputs, labels=inputs)[0]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(train_model.parameters(), 0.5)
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(batches)
85+
86+
87+
# Training driver: shuffle, train for one epoch, report, checkpoint, then
# advance the learning-rate schedule.
random.shuffle(train_batches)
scheduler = StepLR(optimizer, step_size=2, gamma=0.8)
for epoch in range(num_epochs):
    random.shuffle(train_batches)
    loss = train(model, train_batches, optimizer, criterion)
    print('Epoch:', epoch, 'Loss:', loss)
    # One checkpoint file per epoch, named by the epoch number.
    checkpoint = {'epoch': epoch, 'model_state_dict': model.state_dict()}
    torch.save(checkpoint, 'save_fever' + str(epoch))
    scheduler.step()

post_process.py

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import numpy as np
2+
import pandas as pd
3+
import math
4+
5+
# Turn the raw "BERT similarity" strings into whole-number percentages and
# drop the helper columns before exporting.
data = pd.read_csv('./CoQA_withGoldenR.csv')

inputs = data['BERT similarity']
new_multiply = []
for raw in inputs:
    # Entries are a fraction with an optional leading space and a trailing
    # "%". Stripping both is sturdier than the fixed [1:5] slice.
    fraction = float(str(raw).strip().rstrip('%'))
    # round() instead of math.ceil(): 0.07 * 100 evaluates to
    # 7.000000000000001, which ceil() turned into 8.
    new_multiply.append(f"{round(fraction * 100)}%")

data['BERT similarity percentage'] = new_multiply
del data['chatgpt_prompt']
del data['answer_prompt']
del data['BERT similarity']
# Call head(); printing `data.head` only showed the bound method.
print(data.head())
data.to_csv("CoQA_with_GR.csv", index=True)

0 commit comments

Comments
 (0)