<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<!-- Meta tags for social media banners; these should be filled in appropriately as they are your "business card" -->
<!-- Replace the content tag with appropriate information -->
<meta name="description"
content="Introducing PCM-Net for zero-shot image captioning by leveraging a novel Patch-wise Cross-modal feature Mix-up mechanism and CLIP-weighted cross-entropy loss, significantly outperforming state-of-the-art methods on MSCOCO and Flickr30k datasets.">
<meta property="og:title" content="Unleashing Text-to-Image Diffusion Prior for Zero-Shot Image Captioning" />
<meta property="og:description"
content="Discover PCM-Net's novel approach to zero-shot image captioning, achieving state-of-the-art results by synthesizing image-caption pairs with fewer defective elements and prioritizing high-quality data." />
<meta property="og:url" content="https://jianjieluo.github.io/SynthImgCap" />
<!-- Path to banner image, should be in the path listed below. Optimal dimensions are 1200x630 -->
<meta property="og:image" content="static/image/pcm_net_banner.png" />
<meta property="og:image:width" content="1200" />
<meta property="og:image:height" content="630" />
<meta name="twitter:title" content="Explore PCM-Net for Zero-Shot Image Captioning">
<meta name="twitter:description"
content="PCM-Net introduces a groundbreaking approach to zero-shot image captioning, blending synthetic image-caption pairs for enhanced accuracy and detail.">
<!-- Path to banner image, should be in the path listed below. Optimal dimensions are 1200x600 -->
<meta name="twitter:image" content="static/images/pcm_net_twitter_banner.png">
<meta name="twitter:card" content="summary_large_image">
<!-- Keywords for your paper to be indexed by-->
<meta name="keywords"
content="Zero-shot Image Captioning, CLIP, Attention Mechanism, PCM-Net, Synthetic Dataset, SynthImgCap Dataset, Image-Text Alignment, Encoder-Decoder Frameworks, MSCOCO, Flickr30k">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Unleashing Text-to-Image Diffusion Prior for Zero-Shot Image Captioning</title>
<link rel="icon" type="image/x-icon" href="static/images/favicon.ico">
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
<link rel="stylesheet" href="static/css/bulma.min.css">
<link rel="stylesheet" href="static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="static/css/bulma-slider.min.css">
<link rel="stylesheet" href="static/css/fontawesome.all.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="static/css/index.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
<script defer src="static/js/fontawesome.all.min.js"></script>
<script src="static/js/bulma-carousel.min.js"></script>
<script src="static/js/bulma-slider.min.js"></script>
<script src="static/js/index.js"></script>
</head>
<body>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title">Unleashing Text-to-Image Diffusion Prior for Zero-Shot Image
Captioning</h1>
<div class="is-size-5 publication-authors">
<!-- Paper authors -->
<span class="author-block">
<a href="https://jianjieluo.github.io/" target="_blank">Jianjie Luo</a><sup>1</sup>,</span>
<span class="author-block">
<a href="https://scholar.google.com/citations?user=RKEiP70AAAAJ&hl=en" target="_blank">Jingwen Chen</a><sup>2</sup>,</span>
<span class="author-block">
<a href="http://yehaoli.deepfun.club/" target="_blank">Yehao Li</a><sup>2</sup>,</span>
<span class="author-block">
<a href="https://scholar.google.com/citations?user=2RxXFPoAAAAJ&hl=en" target="_blank">Yingwei Pan</a><sup>2</sup>,</span>
<span class="author-block">
<a href="https://cse.sysu.edu.cn/content/2511" target="_blank">Jianlin Feng</a><sup>1</sup>,</span>
<span class="author-block">
<a href="https://scholar.google.com/citations?user=qnbpG6gAAAAJ&hl=en" target="_blank">Hongyang Chao</a><sup>1</sup>,</span>
<span class="author-block">
<a href="http://tingyao.deepfun.club/" target="_blank">Ting Yao</a><sup>2</sup>
</span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block"><sup>1</sup>Sun Yat-sen University, <sup>2</sup>HiDream.ai
Inc.<br>ECCV 2024</span>
</div>
<!-- PDF link -->
<span class="link-block">
<a href="static/pdfs/ECCV2024_main_CameraReady.pdf" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>PDF</span>
</a>
</span>
<!-- Github link -->
<span class="link-block">
<a href="https://github.com/jianjieluo/PCM-Net" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>
</span>
<!-- Dataset Link. -->
<span class="link-block">
<a href="#SynthImgCap" class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="far fa-images"></i>
</span>
<span>Dataset</span>
</a>
</span>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- Teaser video-->
<!-- Paper abstract -->
<section class="section hero is-light">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
Recently, zero-shot image captioning (ZIC), where only text data is available for training, has gained
increasing attention. The remarkable progress of text-to-image diffusion models offers the potential to
resolve this task by employing synthetic image-caption pairs generated by such a pre-trained prior.
Nonetheless, the defective details in the salient regions of the synthetic images introduce semantic
misalignment between the synthetic image and text, leading to compromised results. To address this
challenge, we propose a novel <b>P</b>atch-wise <b>C</b>ross-modal feature <b>M</b>ix-up (PCM) mechanism
to adaptively mitigate unfaithful content in a fine-grained manner during training; it can be
integrated into most encoder-decoder frameworks, yielding our PCM-Net. Specifically, for each input
image, salient visual concepts are first detected based on the image-text similarity in
CLIP space. Next, the patch-wise visual features of the input image are selectively fused with the textual
features of the salient visual concepts, leading to a mixed-up feature map with fewer defective elements.
Finally, a visual-semantic encoder is employed to refine the derived feature map, which is further
incorporated into the sentence decoder for caption generation. Additionally, to facilitate model
training with synthetic data, a novel CLIP-weighted cross-entropy loss is devised to prioritize the
high-quality image-text pairs over the low-quality counterparts. Extensive experiments on MSCOCO and
Flickr30k datasets demonstrate the superiority of our PCM-Net over state-of-the-art VLM-based
approaches, and the synthetic dataset <b>SynthImgCap</b> is released for further research in
vision-language representation learning.
</p>
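<p>
As an illustration of the two key ingredients above, the snippet below gives a minimal PyTorch-style sketch of
the patch-wise cross-modal feature mix-up and the CLIP-weighted cross-entropy loss. It is not the official
PCM-Net implementation: the sigmoid-based mixing rule, the weight normalization, and the hyper-parameters
(e.g. <code>tau</code>, <code>pad_id</code>) are illustrative assumptions; please consult the paper and the
released code for the exact formulation.
</p>
<pre><code># Minimal sketch of the PCM mix-up and the CLIP-weighted cross-entropy loss.
# NOT the official PCM-Net code: the mixing rule and hyper-parameters below
# (tau, pad_id, the softmax weight normalization) are illustrative assumptions.
import torch
import torch.nn.functional as F

def patchwise_crossmodal_mixup(patch_feats, concept_feats, tau=0.1):
    """Fuse CLIP patch features with text features of salient concepts.

    patch_feats:   (N, D) patch-wise visual features of one synthetic image
    concept_feats: (K, D) CLIP text features of the detected salient concepts
    Returns an (N, D) mixed-up feature map in which patches that align poorly
    with every concept rely more on the textual features.
    """
    v = F.normalize(patch_feats, dim=-1)
    t = F.normalize(concept_feats, dim=-1)
    sim = v @ t.t()                              # (N, K) patch-concept similarity
    best_sim, best_idx = sim.max(dim=-1)         # most related concept per patch
    # Low similarity suggests a defective patch, so lean on the textual feature.
    alpha = torch.sigmoid(best_sim / tau).unsqueeze(-1)            # (N, 1)
    return alpha * patch_feats + (1.0 - alpha) * concept_feats[best_idx]

def clip_weighted_cross_entropy(logits, targets, clip_scores, pad_id=0):
    """Cross-entropy where each synthetic pair is weighted by its CLIP
    image-text similarity, so higher-quality pairs dominate training.

    logits:      (B, L, V) sentence-decoder outputs
    targets:     (B, L)    ground-truth caption token ids
    clip_scores: (B,)      CLIP similarity of each synthetic image-caption pair
    """
    ce = F.cross_entropy(logits.transpose(1, 2), targets,
                         ignore_index=pad_id, reduction="none")    # (B, L)
    w = torch.softmax(clip_scores, dim=0) * clip_scores.numel()    # mean weight = 1
    return (w.unsqueeze(-1) * ce).sum() / (targets != pad_id).sum()
</code></pre>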
</div>
</div>
</div>
</div>
</section>
<!-- End paper abstract -->
<!-- SynthImgCap Dataset Introduction -->
<section class="hero is-small">
<div class="hero-body">
<div class="container is-max-desktop content">
<h2 class="title is-3" id="SynthImgCap">SynthImgCap Dataset</h2>
<div class="content has-text-justified">
<p>
The SynthImgCap dataset introduces fine-grained, high-quality synthetic image-caption pairs for zero-shot
(text-only) image captioning research, providing a cost-effective alternative to labor-intensive,
human-annotated paired data traditionally used in Vision-and-Language tasks.
It can also serve as a benchmark, inspiring the use of extensive web-scale image-text pairs to create even
larger synthetic image-text datasets for multimodal learning. Such synthetic data mitigates copyright issues
associated with web-crawled datasets, making it more conducive to open-source contributions and academic
development.
</p>
<p>
SynthImgCap is built with a text-to-image diffusion model that generates one synthetic image for each
sentence in the training corpus, yielding 542,401 and 144,541 synthetic image-text pairs for the MSCOCO-SD
and Flickr30k-SD datasets, respectively. The dataset is available on Google Drive and BaiduYun; a minimal
sketch of the generation loop is given below the download links.
</p>
<a href="https://drive.google.com/drive/folders/19ASNWUL9-WaCj2RJTQBP2vxh25VlPWS3?usp=sharing"
class="button is-primary">Google Drive</a>
<a href="https://pan.baidu.com/s/1ntmgvPO9hPBOLS0ecJtfWA" class="button is-primary">BaiduYun (extract code:
`vpkx`)</a>
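<p>
For reference, the snippet below is a minimal sketch of the caption-to-image generation loop used to build a
dataset of this kind, assuming the Hugging Face <code>diffusers</code> library and Stable Diffusion v1.5. The
actual diffusion model, resolution, and sampling settings behind SynthImgCap are not specified here, and the
file names (<code>captions.json</code>, <code>synthetic_pairs.json</code>) are hypothetical.
</p>
<pre><code># Sketch of generating one synthetic image per training caption.
# Assumptions: Hugging Face diffusers with Stable Diffusion v1.5; the model,
# resolution and sampler used for the released SynthImgCap may differ.
import json
from pathlib import Path

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Hypothetical input: a list of {"id": ..., "caption": ...} records.
captions = json.loads(Path("captions.json").read_text())
out_dir = Path("synthetic_images")
out_dir.mkdir(exist_ok=True)

pairs = []
for item in captions:
    image = pipe(item["caption"], num_inference_steps=50, guidance_scale=7.5).images[0]
    img_path = out_dir / f"{item['id']}.jpg"
    image.save(img_path)
    pairs.append({"image": str(img_path), "caption": item["caption"]})

Path("synthetic_pairs.json").write_text(json.dumps(pairs, indent=2))
</code></pre>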
</div>
</div>
</div>
</section>
<!--End SynthImgCap Dataset -->
<!-- Framework -->
<section class="hero is-small">
<div class="hero-body">
<div class="container is-max-desktop content">
<h2 class="title is-3">Framework</h2>
<img src="./static/images/framework.jpg" class="interpolation-image" alt="PCM-Net Framework." />
</div>
</div>
</section>
<!--Framework -->
<!-- Image carousel -->
<!-- Youtube video -->
<!-- Video carousel -->
<!-- Paper poster -->
<!--BibTex citation -->
<section class="section" id="BibTeX">
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<pre><code>@inproceedings{luo2024unleashing,
title = {Unleashing Text-to-Image Diffusion Prior for Zero-Shot Image Captioning},
author = {Luo, Jianjie and Chen, Jingwen and Li, Yehao and Pan, Yingwei and Feng, Jianlin and Chao, Hongyang and Yao, Ting},
booktitle = {European Conference on Computer Vision (ECCV)},
year = {2024}
}</code></pre>
</div>
</section>
<!--End BibTex citation -->
<!-- Map Widget Code -->
<div style="clear: both;"> </div>
<div align="center">
<a href='https://clustrmaps.com/site/1c1ar' title='Visit tracker'><img src='//clustrmaps.com/map_v2.png?cl=cbf0ff&w=a&t=tt&d=4NrtPK0qFvAx6oJNHIeD7RzAuXpoQ77NBRpLbSbOBoQ&co=32aaff'/></a>
</div>
<footer class="footer">
<div class="container">
<div class="columns is-centered">
<div class="column is-8">
<div class="content">
<p>
This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template"
target="_blank">Academic Project Page Template</a> which was adopted from the <a
href="https://nerfies.github.io" target="_blank">Nerfies</a> project page.
<br> This website is licensed under a <a rel="license"
href="http://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
Commons Attribution-ShareAlike 4.0 International License</a>.
</p>
</div>
</div>
</div>
</div>
</footer>
<!-- Statcounter tracking code -->
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->
<!-- End of Statcounter Code -->
</body>
</html>