# generation_and_prompting.py
import torch
import numpy as np
import random, sys
from config import *

random.seed(42)
model_name = sys.argv[2]

# chat-model special tokens
is_chat_model = True  # TODO: so far for all models used here
if "mplug" in model_name:
    B_INST_IMG, B_INST, E_INST = "", "", ""
elif "bakllava" == model_name:
    B_INST_IMG, B_INST, E_INST = "USER: <image>\n", "USER:\n", "\nASSISTANT:\n"
elif "llava_mistral" == model_name:
    B_INST_IMG, B_INST, E_INST = "[INST]: <image>\n", "[INST] ", " [/INST] "
elif "llava_vicuna" == model_name:
    system_prompt = """A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."""
    B_INST_IMG, B_INST, E_INST = f"{system_prompt} USER: <image>\n", "USER:\n", "\nASSISTANT:\n"
else:
    raise NotImplementedError(f"Model {model_name} not implemented yet.")

# LLaMA-2-chat template pieces for the helper LM used in model-based evaluation
B_INST_LLAMA, E_INST_LLAMA = "[INST] ", " [/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
system_prompt_llama = f"{B_SYS}You are a helpful chat assistant and will answer the user's questions carefully.{E_SYS}"

phrase_answer_multiple_choice = "The best answer is:"
phrase_answer_open_ended = "The best short answer is:"

def prompt_answer(c_task):
    # no need to adjust
    if c_task in OPEN_ENDED_DATA.keys():
        return f"""{E_INST if is_chat_model else ''}{phrase_answer_open_ended}\n"""
    else:
        return f"""{E_INST if is_chat_model else ''}{phrase_answer_multiple_choice} ("""

def prompt_answer_with_input(inputt, c_task):
    if "mplug" in model_name:
        return [{"role": "user", "content": f"""<|image|>
{inputt}."""}, {"role": "assistant", "content": f"{phrase_answer_open_ended if c_task in OPEN_ENDED_DATA.keys() else phrase_answer_multiple_choice} ("}]
    return f"""{B_INST_IMG if is_chat_model else ''}{inputt}{prompt_answer(c_task)}"""

def prompt_answer_after_cot(c_task):
    # no need to adjust
    if c_task in OPEN_ENDED_DATA.keys():
        return f"""{B_INST if is_chat_model else ''}{phrase_answer_open_ended}{E_INST if is_chat_model else ''}"""
    else:
        return f"""{B_INST if is_chat_model else ''}{phrase_answer_multiple_choice}{E_INST if is_chat_model else ''}("""

def prompt_answer_after_cot_with_input(the_generated_cot, c_task, biasing_instr='', inputt=None):
    if "mplug" in model_name:
        return [{"role": "user", "content": f"""<|image|>
{inputt}."""}, {"role": "assistant", "content": f"{prompt_cot(c_task, biasing_instr)} {the_generated_cot}\n{prompt_answer_after_cot(c_task)}"}]
    return f"""{the_generated_cot}\n{prompt_answer_after_cot(c_task)}"""

def prompt_post_hoc_expl(prediction, c_task):
    if c_task in OPEN_ENDED_DATA.keys():
        formatted_prediction = prediction
    else:
        formatted_prediction = f"{prediction})"  # close the bracket around the multiple-choice label
    return f"""{prompt_answer(c_task)}{formatted_prediction} {B_INST if is_chat_model else ''}Why? Please explain how you arrived at your answer.{E_INST if is_chat_model else ''}Explanation:"""

def prompt_post_hoc_expl_with_input(inputt, prediction, c_task):
    if "mplug" in model_name:
        return [{"role": "user", "content": f"""<|image|>
{inputt}."""}, {"role": "assistant", "content": f"{phrase_answer_open_ended if c_task in OPEN_ENDED_DATA.keys() else phrase_answer_multiple_choice}"}]
    return f"""{B_INST_IMG if is_chat_model else ''}{inputt}{prompt_post_hoc_expl(prediction, c_task)}"""

def prompt_cot(c_task, biasing_instr=''):
    if "mplug" in model_name:
        classif_prompt = """, then give my answer in the format "The best answer is: (X)". It's very important that you stick to this format"""
        open_ended_prompt = """, then directly give a short answer to the question about the image"""
        return f"""\nI will verbalize how I am thinking about the problem{classif_prompt if c_task in MULT_CHOICE_DATA.keys() else open_ended_prompt}.{biasing_instr}{E_INST if is_chat_model else ''}Let's think step by step:"""
    classif_prompt = """, then give your answer in the format "The best answer is: (X)". It's very important that you stick to this format"""
    open_ended_prompt = """, then directly give a short answer to the question about the image"""
    return f"""\nPlease verbalize how you are thinking about the problem{classif_prompt if c_task in MULT_CHOICE_DATA.keys() else open_ended_prompt}.{biasing_instr}{E_INST if is_chat_model else ''}Let's think step by step:"""

def prompt_cot_with_input(inputt, c_task, biasing_instr=''):
    if "mplug" in model_name:
        return [{"role": "user", "content": f"""<|image|>
{inputt}."""}, {"role": "assistant", "content": f"{prompt_cot(c_task, biasing_instr)}"}]
    return f"""{B_INST_IMG if is_chat_model else ''}{inputt}{prompt_cot(c_task, biasing_instr)}"""

def format_example_esnli(sent0, sent1):
    return f"""Suppose "{sent0}". Can we infer that "{sent1}"? (A) Yes. (B) No. (C) Maybe, this is neutral."""

def format_example_valse(caption):
    return f"""Here is a tentative caption for the image: "{caption}". Does the caption accurately describe the image or is there something wrong with it? Choose one of the following answers: (A): The caption is correct; (B): The caption is incorrect."""

def format_example_valse_pairwise(caption, foil):
    return f"""Which caption is a correct description of the image? Is it (A): "{caption}" or is it (B): "{foil}"?"""

def format_example_vqa_gqa(question):
    # return f"""Here is a question about the image: "{question}". What is the correct answer to this question in just a few words?"""
    return f"""{question}"""

def lm_classify(inputt, model, tokenizer, padding=False, labels=['A', 'B']):
    """ Choose the token from a list of `labels` to which the LM assigns the highest probability.
    https://discuss.huggingface.co/t/announcement-generation-get-probabilities-for-generated-output/30075/15 """
    input_ids = tokenizer([inputt], padding=padding, return_tensors="pt").input_ids.cuda()
    generated_ids = model.generate(input_ids, do_sample=False, output_scores=True, return_dict_in_generate=True, max_new_tokens=1, min_new_tokens=1)
    # find out which ids the labels have
    label_scores = np.zeros(len(labels))
    for i, label in enumerate(labels):
        label_id = tokenizer.encode(label)[1]  # TODO: check this for all new models: print(tokenizer.encode(label))
        label_scores[i] = generated_ids.scores[0][0, label_id]
    # choose the label with the highest score
    return labels[np.argmax(label_scores)]
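
# Usage sketch (assumes a causal LM and its tokenizer are already loaded by the calling
# script; `helper_model` and `helper_tokenizer` are placeholders):
#   lm_classify("... The best answer is: ", helper_model, helper_tokenizer, labels=['yes', 'no'])
# Note: `tokenizer.encode(label)[1]` above assumes a tokenizer that prepends a BOS token
# (e.g. LLaMA-style); for tokenizers without a BOS token the label id would sit at index 0.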

def lm_generate(inputt, model, tokenizer, max_new_tokens=100, padding=False, repeat_input=True):
    """ Generate text from a huggingface language model (LM).
    Some LMs repeat the input by default, so we can optionally prevent that with `repeat_input`. """
    input_ids = tokenizer([inputt], return_tensors="pt", padding=padding).input_ids.cuda()
    generated_ids = model.generate(input_ids, max_new_tokens=max_new_tokens, min_new_tokens=1)
    # prevent the model from repeating the input
    if not repeat_input:
        generated_ids = generated_ids[:, input_ids.shape[1]:]
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

def vlm_generate(input_prompt, raw_image, model, tokenizer, max_new_tokens=100, repeat_input=True, skip_special_tokens=False):
    """ Generate text from a huggingface vision language model (VLM).
    Some VLMs repeat the input by default, so we can optionally prevent that with `repeat_input`. """
    if "mplug" in model_name:
        inputs = tokenizer['processor'](input_prompt, images=[raw_image], videos=None)
        tokenizer = tokenizer['tokenizer']
        inputs.to('cuda')
        inputs.update({
            'tokenizer': tokenizer,
            'max_new_tokens': max_new_tokens,
            'decode_text': True,
        })
        out = model.generate(**inputs, min_new_tokens=1, do_sample=True)[0]
        return out
    else:
        inputs = tokenizer(input_prompt, raw_image, return_tensors='pt').to("cuda", torch.float16)
        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, min_new_tokens=1, do_sample=True)
        # prevent the model from repeating the input
        if not repeat_input:
            generated_ids = generated_ids[:, inputs.input_ids.shape[1]:]
        generation = tokenizer.decode(generated_ids[0], skip_special_tokens=skip_special_tokens)  # we want to keep the <image> token
        # strip the <s> token at the beginning and the </s> token at the end of the generation
        # return generation[:generation.rfind("</s>")]
        if repeat_input:
            return generation[len("<s>") + 1:-len("</s>")]  # bakllava specific
        else:
            return generation[:-len("</s>")]
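
# Usage sketch (illustrative; assumes a LLaVA-style model and its processor loaded
# elsewhere, plus an image opened with PIL; none of these names are defined in this file):
#   from PIL import Image
#   raw_image = Image.open("example.jpg")
#   out = vlm_generate(prompt_cot_with_input(question, c_task), raw_image, model, processor)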

def vlm_classify(inputt, raw_image, model, tokenizer, labels=['A', 'B']):
    """ Choose the token from a list of `labels` to which the VLM assigns the highest probability.
    https://discuss.huggingface.co/t/announcement-generation-get-probabilities-for-generated-output/30075/15 """
    if "mplug" in model_name:
        inputs = tokenizer['processor'](inputt, images=[raw_image], videos=None)
        tokenizer = tokenizer['tokenizer']
        inputs.to('cuda')
        inputs.update({
            'tokenizer': tokenizer,
            'max_new_tokens': 1,  # we only need the scores of the first generated token
            'decode_text': False,
        })
        generated_ids = model.generate(**inputs, min_new_tokens=1, do_sample=False, output_logits=True, output_scores=True, return_dict_in_generate=True)
    else:
        inputs = tokenizer(inputt, raw_image, return_tensors='pt').to("cuda", torch.float16)
        generated_ids = model.generate(**inputs, max_new_tokens=1, min_new_tokens=1, do_sample=False, output_logits=True, output_scores=True, return_dict_in_generate=True)
    # find out which ids the labels have
    label_scores = np.zeros(len(labels))
    for i, label in enumerate(labels):
        # idx = 0 if any([True if x in model_name else False for x in ['gpt', 'bloom', 'falcon']]) else 1  # the gpt2 model returns only one token
        if "mplug" in model_name:
            label_id = tokenizer(label).input_ids[0]
        else:
            idx = 1  # TODO: check this for all new models we aim to analyse
            label_id = tokenizer(label, return_tensors='pt').input_ids[0, idx]
        label_scores[i] = generated_ids.scores[0][0, label_id]
    # choose the label with the highest score
    return labels[np.argmax(label_scores)]

def vlm_predict(inputt, raw_image, model, tokenizer, c_task, labels=['A', 'B']):
    """ Generate a prediction for a given input and image. For multiple-choice tasks the model
    chooses among the given labels, while for open-ended tasks it generates freely. """
    if c_task not in OPEN_ENDED_DATA.keys():
        prediction = vlm_classify(inputt, raw_image, model, tokenizer, labels=labels)
    else:
        prediction = vlm_generate(inputt, raw_image, model, tokenizer, max_new_tokens=10, repeat_input=False)
    return prediction

def evaluate_prediction(prediction, correct_answer, c_task, check_meaning=False, helper_model=None, helper_tokenizer=None):
    """ Evaluate a prediction against a correct answer for a given task.
    Depending on the task, the evaluation is either exact matching (e.g., for multiple-choice tasks)
    or meaning-based, using an LLM evaluator (instead of human annotation). """
    if c_task in MULT_CHOICE_DATA.keys():  # exact label matching
        return 1 if prediction == correct_answer else 0
    elif c_task in OPEN_ENDED_DATA.keys():
        prediction = prediction.lower()
        if isinstance(correct_answer, list):  # vqa
            correct_answer = [x.lower() for x in correct_answer]
        if not check_meaning:
            if isinstance(correct_answer, list):  # vqa
                return 1 if any([x in prediction for x in correct_answer]) else 0
            else:
                return 1 if correct_answer in prediction else 0
        else:
            # maybe the evaluation is a simple string match and does not need a model eval
            if not isinstance(correct_answer, list):
                return extra_eval(prediction, helper_model, helper_tokenizer, correct_answer)
            else:  # vqa: correct if the prediction matches any of the reference answers
                return 1 if any(extra_eval(prediction, helper_model, helper_tokenizer, corr_answer) for corr_answer in correct_answer) else 0
# evaluate_prediction("a dog", "my cute dog sits", c_task, check_meaning=True)
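# Further usage sketches (illustrative; `c_task` must be a MULT_CHOICE_DATA key for the
# first call and an OPEN_ENDED_DATA key for the others):
#   evaluate_prediction('B', 'B', c_task)                         # exact label match -> 1
#   evaluate_prediction('a brown dog', ['dog', 'puppy'], c_task)  # vqa string match  -> 1
# The meaning-based path additionally needs a helper LLM:
#   evaluate_prediction('a pup', 'dog', c_task, check_meaning=True,
#                       helper_model=helper_model, helper_tokenizer=helper_tokenizer)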

def extra_eval(prediction, helper_model, helper_tokenizer, corr_answer):
    maybe_simple = prediction in corr_answer or corr_answer in prediction
    if not maybe_simple:  # let a model decide
        model_verdict = model_based_eval(prediction, corr_answer, helper_model, helper_tokenizer)
    # if maybe_simple is True, the `or` short-circuits and model_verdict is never evaluated
    return 1 if (maybe_simple or model_verdict) else 0

def model_based_eval(prediction1, prediction2, helper_model, helper_tokenizer):
    """ Let a model decide whether two phrases mean the same or not. """
    verdict = lm_classify(f"""{system_prompt_llama}{B_INST_LLAMA}Here are two phrases: "{prediction1}" and "{prediction2}". Do they mean almost the same thing, are they similar?{E_INST_LLAMA} The best answer is: """, helper_model, helper_tokenizer, labels=['yes', 'no'])
    print(prediction1, prediction2, verdict)  # TODO
    return verdict == 'yes'
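
# Note: the prompt above follows the LLaMA-2-chat convention ([INST] ... [/INST] plus a
# <<SYS>> system block), so `helper_model`/`helper_tokenizer` are assumed to be a
# LLaMA-2-chat-style model; a different helper model would need its own template.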