Skip to content

Commit 5ae6be7

Browse files
committed
leaderboard at oct 13
1 parent ea88228 commit 5ae6be7

File tree

9 files changed

+176
-28
lines changed

9 files changed

+176
-28
lines changed

leaderboards/IQA_outputs/eval.ipynb

+157-19
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,14 @@
3838
]
3939
},
4040
{
41-
"cell_type": "code",
42-
"execution_count": null,
43-
"id": "0734d7a0-ad5d-43b6-b9ce-92cf924e2a1a",
41+
"cell_type": "markdown",
42+
"id": "13d12331-5d4b-422f-a376-9c321431036b",
4443
"metadata": {
4544
"tags": []
4645
},
47-
"outputs": [],
48-
"source": []
46+
"source": [
47+
"### Main Results"
48+
]
4949
},
5050
{
5151
"cell_type": "code",
@@ -73,11 +73,14 @@
7373
"|mplug_owl | 0.634/0.644 | 0.409/0.427 | 0.241/0.271 | 0.437/0.487 | 0.148/0.180 | 0.687/0.711 | 0.466/0.486| 0.432/0.458|\n",
7474
"|otter_v1 | 0.436/0.441 | 0.406/0.406 | 0.143/0.142 | -0.008/0.018 | 0.254/0.264 | 0.475/0.481 | 0.557/0.577| 0.323/0.333|\n",
7575
"|qwen-vl | 0.676/0.669 | 0.470/0.546 | 0.298/0.338 | 0.504/0.532 | 0.273/0.284 | 0.617/0.686 | 0.486/0.486| 0.475/0.506|\n",
76-
"|shikra | 0.327/0.337 | 0.314/0.307 | 0.222/0.227 | 0.322/0.336 | 0.198/0.201 | 0.640/0.661 | 0.324/0.332| 0.335/0.343|\n"
76+
"|shikra | 0.327/0.337 | 0.314/0.307 | 0.222/0.227 | 0.322/0.336 | 0.198/0.201 | 0.640/0.661 | 0.324/0.332| 0.335/0.343|\n",
77+
"|visualglm | 0.498/0.507 | 0.247/0.234 | 0.146/0.154 | 0.110/0.116 | 0.209/0.183 | 0.342/0.349 | 0.127/0.131| 0.240/0.239|\n"
7778
]
7879
}
7980
],
8081
"source": [
82+
"## Official Results\n",
83+
"\n",
8184
"import json, glob\n",
8285
"models = glob.glob(\"*/\")\n",
8386
"\n",
@@ -120,45 +123,172 @@
120123
" else:\n",
121124
" s = (spearmanr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d])[0])\n",
122125
" p = (pearsonr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d])[0])\n",
126+
" else:\n",
127+
" if json_ == json_prefix + \"cgi.json\":\n",
128+
" # as in paper\n",
129+
" d1, d2 = d[:3000], d[3000:6000]\n",
130+
" s = (spearmanr([float(di[\"gt_score\"]) for di in d1], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d1])[0])\n",
131+
" p = (pearsonr([float(di[\"gt_score\"]) for di in d1], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d1])[0])\n",
132+
" s += (spearmanr([float(di[\"gt_score\"]) for di in d2], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d2])[0])\n",
133+
" p += (pearsonr([float(di[\"gt_score\"]) for di in d2], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d2])[0])\n",
134+
" s /= 2\n",
135+
" p /= 2\n",
136+
" else:\n",
137+
" s = (spearmanr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d])[0])\n",
138+
" p = (pearsonr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d])[0])\n",
139+
" stri += \" | {:.3f}/{:.3f}\".format(s, p)\n",
140+
" avg_s += s\n",
141+
" avg_p += p\n",
123142
" \n",
143+
" print(\"|\"+stri+\"|\"+\" {:.3f}/{:.3f}|\".format(avg_s/7, avg_p/7))"
144+
]
145+
},
146+
{
147+
"cell_type": "markdown",
148+
"id": "6da07745-2572-4559-9f05-75e991282efd",
149+
"metadata": {},
150+
"source": [
151+
"#### What if we do not use the proposed softmax strategy in Q-Bench?"
152+
]
153+
},
154+
{
155+
"cell_type": "code",
156+
"execution_count": 4,
157+
"id": "c5b57fe9-30d2-461c-8d3b-4126c2e58829",
158+
"metadata": {
159+
"tags": []
160+
},
161+
"outputs": [
162+
{
163+
"name": "stdout",
164+
"output_type": "stream",
165+
"text": [
166+
"Results NaN/NaN means that the argmax(logit_good, logit_poor) is always one value, that the model constantly predict good/bad.\n",
167+
"| **Model Name**| SPAQ| KoNIQ-10k| LIVE-FB| LIVE-itw| CGIQA-6K| AGIQA-3K| KADID-10K| average| \n",
168+
"| -| -| -| -| -| -| -| -| -| \n",
169+
"|clip_vit_l14 | 0.269/0.269 | 0.383/0.427 | 0.163/0.185 | 0.246/0.226 | 0.030/0.031 | 0.167/0.191 | 0.271/0.272| 0.219/0.229|\n",
170+
"|idefics | 0.119/0.127 | 0.040/0.050 | 0.050/0.073 | 0.029/0.028 | 0.066/0.069 | 0.254/0.302 | 0.020/0.020| 0.083/0.096|\n"
171+
]
172+
},
173+
{
174+
"name": "stderr",
175+
"output_type": "stream",
176+
"text": [
177+
"/home/ps/anaconda3/lib/python3.11/site-packages/scipy/stats/_stats_py.py:4916: ConstantInputWarning: An input array is constant; the correlation coefficient is not defined.\n",
178+
" warnings.warn(stats.ConstantInputWarning(warn_msg))\n",
179+
"/home/ps/anaconda3/lib/python3.11/site-packages/scipy/stats/_stats_py.py:4424: ConstantInputWarning: An input array is constant; the correlation coefficient is not defined.\n",
180+
" warnings.warn(stats.ConstantInputWarning(msg))\n"
181+
]
182+
},
183+
{
184+
"name": "stdout",
185+
"output_type": "stream",
186+
"text": [
187+
"|instructblip_t5 | -0.010/-0.010 | 0.007/0.003 | 0.011/0.010 | -0.034/-0.033 | nan/nan | -0.015/-0.015 | 0.011/0.011| nan/nan|\n",
188+
"|instructblip_vicuna | 0.663/0.664 | 0.284/0.353 | 0.156/0.250 | 0.196/0.264 | 0.214/0.222 | 0.506/0.567 | 0.305/0.307| 0.332/0.375|\n",
189+
"|kosmos_2 | 0.533/0.535 | 0.074/0.085 | 0.084/0.095 | 0.152/0.173 | 0.065/0.066 | 0.159/0.182 | 0.186/0.186| 0.179/0.189|\n",
190+
"|llama_adapter_v2 | 0.417/0.423 | 0.218/0.237 | 0.223/0.257 | 0.205/0.239 | 0.200/0.200 | 0.545/0.579 | 0.228/0.230| 0.291/0.309|\n",
191+
"|llava_v1.5 | 0.481/0.484 | 0.311/0.341 | 0.244/0.270 | 0.306/0.346 | 0.228/0.227 | 0.607/0.667 | 0.251/0.253| 0.347/0.370|\n",
192+
"|llava_v1 | 0.101/0.108 | 0.038/0.045 | 0.036/0.055 | 0.059/0.075 | 0.066/0.079 | 0.240/0.297 | 0.051/0.051| 0.084/0.101|\n",
193+
"|minigpt4_13b | 0.009/0.010 | 0.009/0.013 | 0.019/0.019 | 0.025/0.035 | nan/nan | 0.055/0.066 | nan/nan| nan/nan|\n",
194+
"|mplug_owl | 0.463/0.469 | 0.111/0.154 | 0.081/0.124 | 0.170/0.237 | nan/nan | 0.410/0.466 | 0.203/0.204| nan/nan|\n",
195+
"|otter_v1 | 0.108/0.108 | 0.101/0.117 | 0.082/0.087 | -0.007/0.009 | 0.109/0.115 | 0.422/0.434 | 0.463/0.465| 0.183/0.191|\n",
196+
"|qwen-vl | 0.128/0.127 | 0.262/0.251 | 0.223/0.216 | 0.345/0.327 | 0.231/0.236 | 0.427/0.440 | 0.387/0.389| 0.286/0.284|\n",
197+
"|shikra | 0.277/0.281 | 0.178/0.202 | 0.152/0.170 | 0.248/0.267 | 0.093/0.100 | 0.513/0.563 | 0.245/0.246| 0.244/0.261|\n",
198+
"|visualglm | 0.415/0.418 | 0.139/0.138 | 0.088/0.091 | 0.051/0.044 | 0.055/0.057 | 0.300/0.319 | 0.063/0.063| 0.159/0.161|\n"
199+
]
200+
}
201+
],
202+
"source": [
203+
"## Ablation Results for Using '''Argmax''' between \"good\" and \"poor\"\n",
204+
"\n",
205+
"print(\"Results NaN/NaN means that the argmax(logit_good, logit_poor) is always one value, that the model constantly predict good/bad.\")\n",
206+
"datasets = [\"\", \"**Model Name**\", \"SPAQ\",\"KoNIQ-10k\",\"LIVE-FB\",\"LIVE-itw\",\"CGIQA-6K\", \"AGIQA-3K\", \"KADID-10K\", \"average\", \"\"]\n",
207+
"print(\"| \".join(datasets))\n",
208+
"lst = [\"\"] + [\"-\" for i in datasets[1:-1]] + [\"\"]\n",
209+
"print(\"| \".join(lst))\n",
210+
"for json_prefix in sorted(models):\n",
211+
" jsons = [\n",
212+
" json_prefix + \"spaq.json\",\n",
213+
" json_prefix + \"koniq.json\",\n",
214+
" json_prefix + \"flive.json\",\n",
215+
" json_prefix + \"livec.json\",\n",
216+
" json_prefix + \"cgi.json\",\n",
217+
" json_prefix + \"agi.json\",\n",
218+
" json_prefix + \"kadid.json\",\n",
219+
" ]\n",
220+
" stri = json_prefix[:-1]\n",
221+
" avg_s, avg_p = 0., 0.\n",
222+
" for json_ in jsons:\n",
223+
" if not glob.glob(json_):\n",
224+
" print(json_)\n",
225+
" continue\n",
226+
" with open(json_) as f:\n",
227+
" s = f.read().replace(\"}{\", \"},{\")\n",
228+
" if s[0] != \"[\":\n",
229+
" s = \"[\" + s + \"]\"\n",
230+
" d = json.loads(s)\n",
231+
" if json_prefix == \"instructblip_t5/\":\n",
232+
" if json_ == json_prefix + \"cgi.json\":\n",
233+
" # as in paper\n",
234+
" d1, d2 = d[:3000], d[3000:6000]\n",
235+
" s = (spearmanr([float(di[\"gt_score\"]) for di in d1], [argmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d1])[0])\n",
236+
" p = (pearsonr([float(di[\"gt_score\"]) for di in d1], [argmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d1])[0])\n",
237+
" s += (spearmanr([float(di[\"gt_score\"]) for di in d2], [argmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d2])[0])\n",
238+
" p += (pearsonr([float(di[\"gt_score\"]) for di in d2], [argmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d2])[0])\n",
239+
" s /= 2\n",
240+
" p /= 2\n",
241+
" else:\n",
242+
" s = (spearmanr([float(di[\"gt_score\"]) for di in d], [argmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d])[0])\n",
243+
" p = (pearsonr([float(di[\"gt_score\"]) for di in d], [argmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d])[0])\n",
124244
" elif json_prefix == \"qwen-vl\":\n",
125245
" print('qw')\n",
126246
" if json_ == json_prefix + \"cgi.json\":\n",
127247
" # as in paper\n",
128248
" d1, d2 = d[:3000], d[3000:6000]\n",
129-
" s = (spearmanr([float(di[\"gt_score\"]) for di in d1], [softmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d1])[0])\n",
130-
" p = (pearsonr([float(di[\"gt_score\"]) for di in d1], [softmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d1])[0])\n",
131-
" s += (spearmanr([float(di[\"gt_score\"]) for di in d2], [softmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d2])[0])\n",
132-
" p += (pearsonr([float(di[\"gt_score\"]) for di in d2], [softmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d2])[0])\n",
249+
" s = (spearmanr([float(di[\"gt_score\"]) for di in d1], [argmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d1])[0])\n",
250+
" p = (pearsonr([float(di[\"gt_score\"]) for di in d1], [argmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d1])[0])\n",
251+
" s += (spearmanr([float(di[\"gt_score\"]) for di in d2], [argmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d2])[0])\n",
252+
" p += (pearsonr([float(di[\"gt_score\"]) for di in d2], [argmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d2])[0])\n",
133253
" s /= 2\n",
134254
" p /= 2\n",
135255
" else:\n",
136-
" s = (spearmanr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d])[0])\n",
137-
" p = (pearsonr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d])[0])\n",
256+
" s = (spearmanr([float(di[\"gt_score\"]) for di in d], [argmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d])[0])\n",
257+
" p = (pearsonr([float(di[\"gt_score\"]) for di in d], [argmax(di[\"logit_excellent\"], di[\"logit_poor\"]) for di in d])[0])\n",
138258
" \n",
139259
" else:\n",
140260
" if json_ == json_prefix + \"cgi.json\":\n",
141261
" # as in paper\n",
142262
" d1, d2 = d[:3000], d[3000:6000]\n",
143-
" s = (spearmanr([float(di[\"gt_score\"]) for di in d1], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d1])[0])\n",
144-
" p = (pearsonr([float(di[\"gt_score\"]) for di in d1], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d1])[0])\n",
145-
" s += (spearmanr([float(di[\"gt_score\"]) for di in d2], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d2])[0])\n",
146-
" p += (pearsonr([float(di[\"gt_score\"]) for di in d2], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d2])[0])\n",
263+
" s = (spearmanr([float(di[\"gt_score\"]) for di in d1], [argmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d1])[0])\n",
264+
" p = (pearsonr([float(di[\"gt_score\"]) for di in d1], [argmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d1])[0])\n",
265+
" s += (spearmanr([float(di[\"gt_score\"]) for di in d2], [argmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d2])[0])\n",
266+
" p += (pearsonr([float(di[\"gt_score\"]) for di in d2], [argmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d2])[0])\n",
147267
" s /= 2\n",
148268
" p /= 2\n",
149269
" else:\n",
150-
" s = (spearmanr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d])[0])\n",
151-
" p = (pearsonr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d])[0])\n",
270+
" s = (spearmanr([float(di[\"gt_score\"]) for di in d], [argmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d])[0])\n",
271+
" p = (pearsonr([float(di[\"gt_score\"]) for di in d], [argmax(di[\"logit_good\"], di[\"logit_poor\"]) for di in d])[0])\n",
152272
" stri += \" | {:.3f}/{:.3f}\".format(s, p)\n",
153273
" avg_s += s\n",
154274
" avg_p += p\n",
155275
" \n",
156276
" print(\"|\"+stri+\"|\"+\" {:.3f}/{:.3f}|\".format(avg_s/7, avg_p/7))"
157277
]
158278
},
279+
{
280+
"cell_type": "markdown",
281+
"id": "aa22635f-acb4-4e58-b532-f45395ce428a",
282+
"metadata": {
283+
"tags": []
284+
},
285+
"source": [
286+
"### What if we do not follow the LLMs' preferred output?"
287+
]
288+
},
159289
{
160290
"cell_type": "code",
161-
"execution_count": 4,
291+
"execution_count": 6,
162292
"id": "3d228f01-eda9-4f47-8506-bf227ebb142f",
163293
"metadata": {
164294
"tags": []
@@ -235,6 +365,14 @@
235365
" print(spearmanr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d])[0])\n",
236366
" print(pearsonr([float(di[\"gt_score\"]) for di in d], [softmax(di[\"logit_high\"], di[\"logit_low\"]) for di in d])[0])"
237367
]
368+
},
369+
{
370+
"cell_type": "code",
371+
"execution_count": null,
372+
"id": "fa5bac51-c353-4bab-848b-e786a57a92a6",
373+
"metadata": {},
374+
"outputs": [],
375+
"source": []
238376
}
239377
],
240378
"metadata": {

leaderboards/IQA_outputs/visualglm/agi.json

+1
Large diffs are not rendered by default.

leaderboards/IQA_outputs/visualglm/cgi.json

+1
Large diffs are not rendered by default.

leaderboards/IQA_outputs/visualglm/flive.json

+1
Large diffs are not rendered by default.

leaderboards/IQA_outputs/visualglm/kadid.json

+1
Large diffs are not rendered by default.

leaderboards/IQA_outputs/visualglm/koniq.json

+1
Large diffs are not rendered by default.

leaderboards/IQA_outputs/visualglm/livec.json

+1
Large diffs are not rendered by default.

leaderboards/IQA_outputs/visualglm/spaq.json

+1
Large diffs are not rendered by default.

leaderboards/README.md

+12-9
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Leaderboards
1+
# Hot Leaderboards @ Oct 14
22

33
<div align="center">
44

@@ -7,18 +7,18 @@ _Join the competition for low-level vision now!_
77
</div>
88

99
<div>
10-
_version_: v0.1.1012wip; _Timeliness_: Updated on 12nd Oct.
10+
_version_: v0.1.1013wip; _Timeliness_: Updated on 13th Oct.
1111
</div>
1212

1313
</div>
1414

1515
## Leaderboards for (A1): Perception
1616

17-
*New! Result of LLaVA-v1.5/Qwen-VL-Chat is out!*
1817

1918
About the partition of `dev` and `test` subsets, please see [our dataset release notes](../data_release/). As some models excel on original testing pipeline while some others perform better under PPL-based testing, we maintain two leaderboards for two different testing methods. See [examples](../example_code_for_idefics) for their different settings.
2019

2120
### Original Testing Pipeline
21+
- 13 models tested
2222
- via Multi-Choice Questions
2323

2424
#### Accuracies on Open-set (`dev`)
@@ -37,7 +37,7 @@ About the partition of `dev` and `test` subsets, please see [our dataset release
3737
| otter_v1 | 0.5709 | 0.4071 | 0.3955 | 0.4222 | 0.4931 | 0.4408 | 0.5265 | 0.4635 |
3838
| qwen_vl | 0.6309 | 0.5819 | 0.5639 | 0.5058 | 0.6273 | 0.5789 | 0.7388 | 0.5940 |
3939
| shikra | 0.6564 | 0.4735 | 0.4909 | 0.4883 | 0.5949 | 0.5000 | 0.6408 | 0.5465 |
40-
40+
| visualglm | 0.6018 | 0.5420 | 0.4625 | 0.5175 | 0.5440 | 0.5362 | 0.5714 | 0.5378 |
4141

4242

4343

@@ -57,11 +57,14 @@ About the partition of `dev` and `test` subsets, please see [our dataset release
5757
| otter_v1 | 0.5766 | 0.3970 | 0.4259 | 0.4212 | 0.4893 | 0.4760 | 0.5417 | 0.4722 |
5858
| qwen_vl | 0.6533 | 0.6074 | 0.5844 | 0.5413 | 0.6635 | 0.5822 | 0.7300 | 0.6167 |
5959
| shikra | 0.6909 | 0.4793 | 0.4671 | 0.4731 | 0.6086 | 0.5308 | 0.6477 | 0.5532 |
60+
| visualglm | 0.6131 | 0.5358 | 0.4403 | 0.4856 | 0.5489 | 0.5548 | 0.5779 | 0.5331 |
61+
6062

6163
### (*Additional*) PPL-based Testing Pipeline
6264

65+
- 11 models tested
6366
- via Losses of Different Answers
64-
- *non-finalized in progress version, may update*
67+
- *non-finalized work-in-progress version, may update*
6568

6669
*No options are provided in prompts!*
6770

@@ -99,7 +102,6 @@ shikra | 0.6515 | 0.4729 | 0.5021 | 0.4269 | 0.6205 | 0.5034 | 0.7197 | 0.5478 |
99102

100103
## Leaderboards for (A2): Description
101104

102-
*New! Result of LLaVA-v1.5/Qwen-VL-Chat is out!*
103105

104106
Abbreviations for dimensions: *comp: completeness, prec: precision, rele: relevance*
105107

@@ -117,15 +119,15 @@ Abbreviations for dimensions: *comp: completeness, prec: precision, rele: releva
117119
| otter_v1 | 22.38% | 59.36% | 18.25% | 0.96/2.00 | 40.68% | 35.99% | 23.33% | 0.83/2.00 | 1.95% | 13.20% | 84.85% | 1.83/2.00 | 3.61/6.00 |
118120
| qwen_vl | 26.34% | 49.13% | 24.53% | 0.98/2.00 | 50.62% | 23.44% | 25.94% | 0.75/2.00 | 0.73% | 35.56% | 63.72% | 1.63/2.00 | 3.36/6.00 |
119121
| shikra | 21.14% | 68.33% | 10.52% | 0.89/2.00 | 30.33% | 28.30% | 41.37% | 1.11/2.00 | 1.14% | 64.36% | 34.50% | 1.33/2.00 | 3.34/6.00 |
120-
122+
| visualglm | 30.75% | 56.64% | 12.61% | 0.82/2.00 | 38.64% | 26.18% | 35.18% | 0.97/2.00 | 6.14% | 67.15% | 26.71% | 1.21/2.00 | 2.99/6.00 |
121123

122124

123125
## Leaderboards for (A3): Assessment
124126

125-
*New! Result of LLaVA-v1.5/QWen-VL-Chat is out!*
126-
127127
The datasets can be found [here](../a3_iqa_databases/).
128128

129+
See [IQA_outputs/eval.ipynb](IQA_outputs/eval.ipynb) for our ablation experiments.
130+
129131

130132
| **Model Name**| SPAQ| KoNIQ-10k| LIVE-FB| LIVE-itw| CGIQA-6K| AGIQA-3K| KADID-10K| average|
131133
| -| -| -| -| -| -| -| -| -|
@@ -142,6 +144,7 @@ The datasets can be found [here](../a3_iqa_databases/).
142144
|otter_v1 | 0.436/0.441 | 0.406/0.406 | 0.143/0.142 | -0.008/0.018 | 0.254/0.264 | 0.475/0.481 | 0.557/0.577| 0.323/0.333|
143145
|qwen-vl | 0.676/0.669 | **0.470/0.546** (rank 1) | 0.298/0.338 | **0.504/0.532** (rank 1) | 0.273/0.284 | 0.617/0.686 | **0.486/0.486** (rank 1) | **0.475/0.506** (rank 1) |
144146
|shikra | 0.327/0.337 | 0.314/0.307 | 0.222/0.227 | 0.322/0.336 | 0.198/0.201 | 0.640/0.661 | 0.324/0.332| 0.335/0.343|
147+
|visualglm | 0.498/0.507 | 0.247/0.234 | 0.146/0.154 | 0.110/0.116 | 0.209/0.183 | 0.342/0.349 | 0.127/0.131| 0.240/0.239|
145148

146149
Overall, `qwen-vl` has the best IQA performance among the models. (12th Oct); meanwhile, `llava-v1.5` (2nd rank overall) tops on AIGC/CGI images.
147150

0 commit comments

Comments
 (0)