|
38 | 38 | ]
|
39 | 39 | },
|
40 | 40 | {
|
41 |
| - "cell_type": "code", |
42 |
| - "execution_count": null, |
43 |
| - "id": "0734d7a0-ad5d-43b6-b9ce-92cf924e2a1a", |
| 41 | + "cell_type": "markdown", |
| 42 | + "id": "13d12331-5d4b-422f-a376-9c321431036b", |
44 | 43 | "metadata": {
|
45 | 44 | "tags": []
|
46 | 45 | },
|
47 |
| - "outputs": [], |
48 |
| - "source": [] |
| 46 | + "source": [ |
| 47 | + "### Main Results" |
| 48 | + ] |
49 | 49 | },
|
50 | 50 | {
|
51 | 51 | "cell_type": "code",
|
|
73 | 73 | "|mplug_owl | 0.634/0.644 | 0.409/0.427 | 0.241/0.271 | 0.437/0.487 | 0.148/0.180 | 0.687/0.711 | 0.466/0.486| 0.432/0.458|\n",
|
74 | 74 | "|otter_v1 | 0.436/0.441 | 0.406/0.406 | 0.143/0.142 | -0.008/0.018 | 0.254/0.264 | 0.475/0.481 | 0.557/0.577| 0.323/0.333|\n",
|
75 | 75 | "|qwen-vl | 0.676/0.669 | 0.470/0.546 | 0.298/0.338 | 0.504/0.532 | 0.273/0.284 | 0.617/0.686 | 0.486/0.486| 0.475/0.506|\n",
|
76 |
| - "|shikra | 0.327/0.337 | 0.314/0.307 | 0.222/0.227 | 0.322/0.336 | 0.198/0.201 | 0.640/0.661 | 0.324/0.332| 0.335/0.343|\n" |
| 76 | + "|shikra | 0.327/0.337 | 0.314/0.307 | 0.222/0.227 | 0.322/0.336 | 0.198/0.201 | 0.640/0.661 | 0.324/0.332| 0.335/0.343|\n", |
| 77 | + "|visualglm | 0.498/0.507 | 0.247/0.234 | 0.146/0.154 | 0.110/0.116 | 0.209/0.183 | 0.342/0.349 | 0.127/0.131| 0.240/0.239|\n" |
77 | 78 | ]
|
78 | 79 | }
|
79 | 80 | ],
|
80 | 81 | "source": [
|
| 82 | + "## Official Results\n", |
| 83 | + "\n", |
81 | 84 | "import json, glob\n",
|
82 | 85 | "models = glob.glob(\"*/\")\n",
|
83 | 86 | "\n",
|
|
120 | 123 | " else:\n",
|
121 | 124 | " s = (spearmanr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d])[0])\n",
|
122 | 125 | " p = (pearsonr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d])[0])\n",
|
| 126 | + " else:\n", |
| 127 | + " if json_ == json_prefix + \"cgi.json\":\n", |
| 128 | + " # as in paper\n", |
| 129 | + " d1, d2 = d[:3000], d[3000:6000]\n", |
| 130 | + " s = (spearmanr([float(di[\"gt_score\"]) for di in d1], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d1])[0])\n", |
| 131 | + " p = (pearsonr([float(di[\"gt_score\"]) for di in d1], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d1])[0])\n", |
| 132 | + " s += (spearmanr([float(di[\"gt_score\"]) for di in d2], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d2])[0])\n", |
| 133 | + " p += (pearsonr([float(di[\"gt_score\"]) for di in d2], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d2])[0])\n", |
| 134 | + " s /= 2\n", |
| 135 | + " p /= 2\n", |
| 136 | + " else:\n", |
| 137 | + " s = (spearmanr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d])[0])\n", |
| 138 | + " p = (pearsonr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d])[0])\n", |
| 139 | + " stri += \" | {:.3f}/{:.3f}\".format(s, p)\n", |
| 140 | + " avg_s += s\n", |
| 141 | + " avg_p += p\n", |
123 | 142 | " \n",
|
| 143 | + " print(\"|\"+stri+\"|\"+\" {:.3f}/{:.3f}|\".format(avg_s/7, avg_p/7))" |
| 144 | + ] |
| 145 | + }, |
| 146 | + { |
| 147 | + "cell_type": "markdown", |
| 148 | + "id": "6da07745-2572-4559-9f05-75e991282efd", |
| 149 | + "metadata": {}, |
| 150 | + "source": [ |
| 151 | + "#### What if we do not use the proposed softmax strategy in Q-Bench?" |
| 152 | + ] |
| 153 | + }, |
| 154 | + { |
| 155 | + "cell_type": "code", |
| 156 | + "execution_count": 4, |
| 157 | + "id": "c5b57fe9-30d2-461c-8d3b-4126c2e58829", |
| 158 | + "metadata": { |
| 159 | + "tags": [] |
| 160 | + }, |
| 161 | + "outputs": [ |
| 162 | + { |
| 163 | + "name": "stdout", |
| 164 | + "output_type": "stream", |
| 165 | + "text": [ |
| 166 | + "Results NaN/NaN means that the argmax(logit_good, logit_poor) is always one value, that the model constantly predict good/bad.\n", |
| 167 | + "| **Model Name**| SPAQ| KoNIQ-10k| LIVE-FB| LIVE-itw| CGIQA-6K| AGIQA-3K| KADID-10K| average| \n", |
| 168 | + "| -| -| -| -| -| -| -| -| -| \n", |
| 169 | + "|clip_vit_l14 | 0.269/0.269 | 0.383/0.427 | 0.163/0.185 | 0.246/0.226 | 0.030/0.031 | 0.167/0.191 | 0.271/0.272| 0.219/0.229|\n", |
| 170 | + "|idefics | 0.119/0.127 | 0.040/0.050 | 0.050/0.073 | 0.029/0.028 | 0.066/0.069 | 0.254/0.302 | 0.020/0.020| 0.083/0.096|\n" |
| 171 | + ] |
| 172 | + }, |
| 173 | + { |
| 174 | + "name": "stderr", |
| 175 | + "output_type": "stream", |
| 176 | + "text": [ |
| 177 | + "/home/ps/anaconda3/lib/python3.11/site-packages/scipy/stats/_stats_py.py:4916: ConstantInputWarning: An input array is constant; the correlation coefficient is not defined.\n", |
| 178 | + " warnings.warn(stats.ConstantInputWarning(warn_msg))\n", |
| 179 | + "/home/ps/anaconda3/lib/python3.11/site-packages/scipy/stats/_stats_py.py:4424: ConstantInputWarning: An input array is constant; the correlation coefficient is not defined.\n", |
| 180 | + " warnings.warn(stats.ConstantInputWarning(msg))\n" |
| 181 | + ] |
| 182 | + }, |
| 183 | + { |
| 184 | + "name": "stdout", |
| 185 | + "output_type": "stream", |
| 186 | + "text": [ |
| 187 | + "|instructblip_t5 | -0.010/-0.010 | 0.007/0.003 | 0.011/0.010 | -0.034/-0.033 | nan/nan | -0.015/-0.015 | 0.011/0.011| nan/nan|\n", |
| 188 | + "|instructblip_vicuna | 0.663/0.664 | 0.284/0.353 | 0.156/0.250 | 0.196/0.264 | 0.214/0.222 | 0.506/0.567 | 0.305/0.307| 0.332/0.375|\n", |
| 189 | + "|kosmos_2 | 0.533/0.535 | 0.074/0.085 | 0.084/0.095 | 0.152/0.173 | 0.065/0.066 | 0.159/0.182 | 0.186/0.186| 0.179/0.189|\n", |
| 190 | + "|llama_adapter_v2 | 0.417/0.423 | 0.218/0.237 | 0.223/0.257 | 0.205/0.239 | 0.200/0.200 | 0.545/0.579 | 0.228/0.230| 0.291/0.309|\n", |
| 191 | + "|llava_v1.5 | 0.481/0.484 | 0.311/0.341 | 0.244/0.270 | 0.306/0.346 | 0.228/0.227 | 0.607/0.667 | 0.251/0.253| 0.347/0.370|\n", |
| 192 | + "|llava_v1 | 0.101/0.108 | 0.038/0.045 | 0.036/0.055 | 0.059/0.075 | 0.066/0.079 | 0.240/0.297 | 0.051/0.051| 0.084/0.101|\n", |
| 193 | + "|minigpt4_13b | 0.009/0.010 | 0.009/0.013 | 0.019/0.019 | 0.025/0.035 | nan/nan | 0.055/0.066 | nan/nan| nan/nan|\n", |
| 194 | + "|mplug_owl | 0.463/0.469 | 0.111/0.154 | 0.081/0.124 | 0.170/0.237 | nan/nan | 0.410/0.466 | 0.203/0.204| nan/nan|\n", |
| 195 | + "|otter_v1 | 0.108/0.108 | 0.101/0.117 | 0.082/0.087 | -0.007/0.009 | 0.109/0.115 | 0.422/0.434 | 0.463/0.465| 0.183/0.191|\n", |
| 196 | + "|qwen-vl | 0.128/0.127 | 0.262/0.251 | 0.223/0.216 | 0.345/0.327 | 0.231/0.236 | 0.427/0.440 | 0.387/0.389| 0.286/0.284|\n", |
| 197 | + "|shikra | 0.277/0.281 | 0.178/0.202 | 0.152/0.170 | 0.248/0.267 | 0.093/0.100 | 0.513/0.563 | 0.245/0.246| 0.244/0.261|\n", |
| 198 | + "|visualglm | 0.415/0.418 | 0.139/0.138 | 0.088/0.091 | 0.051/0.044 | 0.055/0.057 | 0.300/0.319 | 0.063/0.063| 0.159/0.161|\n" |
| 199 | + ] |
| 200 | + } |
| 201 | + ], |
| 202 | + "source": [ |
| 203 | + "## Ablation Results for Using 'Argmax' between \"good\" and \"poor\"\n", |
| 204 | + "\n", |
| 205 | + "print(\"Results NaN/NaN means that the argmax(logit_good, logit_poor) is always one value, that the model constantly predict good/bad.\")\n", |
| 206 | + "datasets = [\"\", \"**Model Name**\", \"SPAQ\",\"KoNIQ-10k\",\"LIVE-FB\",\"LIVE-itw\",\"CGIQA-6K\", \"AGIQA-3K\", \"KADID-10K\", \"average\", \"\"]\n", |
| 207 | + "print(\"| \".join(datasets))\n", |
| 208 | + "lst = [\"\"] + [\"-\" for i in datasets[1:-1]] + [\"\"]\n", |
| 209 | + "print(\"| \".join(lst))\n", |
| 210 | + "for json_prefix in sorted(models):\n", |
| 211 | + " jsons = [\n", |
| 212 | + " json_prefix + \"spaq.json\",\n", |
| 213 | + " json_prefix + \"koniq.json\",\n", |
| 214 | + " json_prefix + \"flive.json\",\n", |
| 215 | + " json_prefix + \"livec.json\",\n", |
| 216 | + " json_prefix + \"cgi.json\",\n", |
| 217 | + " json_prefix + \"agi.json\",\n", |
| 218 | + " json_prefix + \"kadid.json\",\n", |
| 219 | + " ]\n", |
| 220 | + " stri = json_prefix[:-1]\n", |
| 221 | + " avg_s, avg_p = 0., 0.\n", |
| 222 | + " for json_ in jsons:\n", |
| 223 | + " if not glob.glob(json_):\n", |
| 224 | + " print(json_)\n", |
| 225 | + " continue\n", |
| 226 | + " with open(json_) as f:\n", |
| 227 | + " s = f.read().replace(\"}{\", \"},{\")\n", |
| 228 | + " if s[0] != \"[\":\n", |
| 229 | + " s = \"[\" + s + \"]\"\n", |
| 230 | + " d = json.loads(s)\n", |
| 231 | + " if json_prefix == \"instructblip_t5/\":\n", |
| 232 | + " if json_ == json_prefix + \"cgi.json\":\n", |
| 233 | + " # as in paper\n", |
| 234 | + " d1, d2 = d[:3000], d[3000:6000]\n", |
| 235 | + " s = (spearmanr([float(di[\"gt_score\"]) for di in d1], [argmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d1])[0])\n", |
| 236 | + " p = (pearsonr([float(di[\"gt_score\"]) for di in d1], [argmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d1])[0])\n", |
| 237 | + " s += (spearmanr([float(di[\"gt_score\"]) for di in d2], [argmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d2])[0])\n", |
| 238 | + " p += (pearsonr([float(di[\"gt_score\"]) for di in d2], [argmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d2])[0])\n", |
| 239 | + " s /= 2\n", |
| 240 | + " p /= 2\n", |
| 241 | + " else:\n", |
| 242 | + " s = (spearmanr([float(di[\"gt_score\"]) for di in d], [argmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d])[0])\n", |
| 243 | + " p = (pearsonr([float(di[\"gt_score\"]) for di in d], [argmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d])[0])\n", |
124 | 244 | " elif json_prefix == \"qwen-vl\":\n",
|
125 | 245 | " print('qw')\n",
|
126 | 246 | " if json_ == json_prefix + \"cgi.json\":\n",
|
127 | 247 | " # as in paper\n",
|
128 | 248 | " d1, d2 = d[:3000], d[3000:6000]\n",
|
129 |
| - " s = (spearmanr([float(di[\"gt_score\"]) for di in d1], [softmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d1])[0])\n", |
130 |
| - " p = (pearsonr([float(di[\"gt_score\"]) for di in d1], [softmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d1])[0])\n", |
131 |
| - " s += (spearmanr([float(di[\"gt_score\"]) for di in d2], [softmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d2])[0])\n", |
132 |
| - " p += (pearsonr([float(di[\"gt_score\"]) for di in d2], [softmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d2])[0])\n", |
| 249 | + " s = (spearmanr([float(di[\"gt_score\"]) for di in d1], [argmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d1])[0])\n", |
| 250 | + " p = (pearsonr([float(di[\"gt_score\"]) for di in d1], [argmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d1])[0])\n", |
| 251 | + " s += (spearmanr([float(di[\"gt_score\"]) for di in d2], [argmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d2])[0])\n", |
| 252 | + " p += (pearsonr([float(di[\"gt_score\"]) for di in d2], [argmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d2])[0])\n", |
133 | 253 | " s /= 2\n",
|
134 | 254 | " p /= 2\n",
|
135 | 255 | " else:\n",
|
136 |
| - " s = (spearmanr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d])[0])\n", |
137 |
| - " p = (pearsonr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d])[0])\n", |
| 256 | + " s = (spearmanr([float(di[\"gt_score\"]) for di in d], [argmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d])[0])\n", |
| 257 | + " p = (pearsonr([float(di[\"gt_score\"]) for di in d], [argmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d])[0])\n", |
138 | 258 | " \n",
|
139 | 259 | " else:\n",
|
140 | 260 | " if json_ == json_prefix + \"cgi.json\":\n",
|
141 | 261 | " # as in paper\n",
|
142 | 262 | " d1, d2 = d[:3000], d[3000:6000]\n",
|
143 |
| - " s = (spearmanr([float(di[\"gt_score\"]) for di in d1], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d1])[0])\n", |
144 |
| - " p = (pearsonr([float(di[\"gt_score\"]) for di in d1], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d1])[0])\n", |
145 |
| - " s += (spearmanr([float(di[\"gt_score\"]) for di in d2], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d2])[0])\n", |
146 |
| - " p += (pearsonr([float(di[\"gt_score\"]) for di in d2], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d2])[0])\n", |
| 263 | + " s = (spearmanr([float(di[\"gt_score\"]) for di in d1], [argmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d1])[0])\n", |
| 264 | + " p = (pearsonr([float(di[\"gt_score\"]) for di in d1], [argmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d1])[0])\n", |
| 265 | + " s += (spearmanr([float(di[\"gt_score\"]) for di in d2], [argmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d2])[0])\n", |
| 266 | + " p += (pearsonr([float(di[\"gt_score\"]) for di in d2], [argmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d2])[0])\n", |
147 | 267 | " s /= 2\n",
|
148 | 268 | " p /= 2\n",
|
149 | 269 | " else:\n",
|
150 |
| - " s = (spearmanr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d])[0])\n", |
151 |
| - " p = (pearsonr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d])[0])\n", |
| 270 | + " s = (spearmanr([float(di[\"gt_score\"]) for di in d], [argmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d])[0])\n", |
| 271 | + " p = (pearsonr([float(di[\"gt_score\"]) for di in d], [argmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d])[0])\n", |
152 | 272 | " stri += \" | {:.3f}/{:.3f}\".format(s, p)\n",
|
153 | 273 | " avg_s += s\n",
|
154 | 274 | " avg_p += p\n",
|
155 | 275 | " \n",
|
156 | 276 | " print(\"|\"+stri+\"|\"+\" {:.3f}/{:.3f}|\".format(avg_s/7, avg_p/7))"
|
157 | 277 | ]
|
158 | 278 | },
|
| 279 | + { |
| 280 | + "cell_type": "markdown", |
| 281 | + "id": "aa22635f-acb4-4e58-b532-f45395ce428a", |
| 282 | + "metadata": { |
| 283 | + "tags": [] |
| 284 | + }, |
| 285 | + "source": [ |
| 286 | + "### What if we do not follow the LLMs' preferred output?" |
| 287 | + ] |
| 288 | + }, |
159 | 289 | {
|
160 | 290 | "cell_type": "code",
|
161 |
| - "execution_count": 4, |
| 291 | + "execution_count": 6, |
162 | 292 | "id": "3d228f01-eda9-4f47-8506-bf227ebb142f",
|
163 | 293 | "metadata": {
|
164 | 294 | "tags": []
|
|
235 | 365 | " print(spearmanr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d])[0])\n",
|
236 | 366 | " print(pearsonr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d])[0])"
|
237 | 367 | ]
|
| 368 | + }, |
| 369 | + { |
| 370 | + "cell_type": "code", |
| 371 | + "execution_count": null, |
| 372 | + "id": "fa5bac51-c353-4bab-848b-e786a57a92a6", |
| 373 | + "metadata": {}, |
| 374 | + "outputs": [], |
| 375 | + "source": [] |
238 | 376 | }
|
239 | 377 | ],
|
240 | 378 | "metadata": {
|
|
0 commit comments