<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<!-- Meta tags for social media banners; these should be filled in appropriately as they are your "business card" -->
<!-- Replace the content tag with appropriate information -->
<meta name="description"
content="Introducing PCM-Net for zero-shot image captioning by leveraging a novel Patch-wise Cross-modal feature Mix-up mechanism and CLIP-weighted cross-entropy loss, significantly outperforming state-of-the-art methods on MSCOCO and Flickr30k datasets.">
<meta property="og:title" content="Unleashing Text-to-Image Diffusion Prior for Zero-Shot Image Captioning" />
<meta property="og:description"
content="Discover PCM-Net's novel approach to zero-shot image captioning, achieving state-of-the-art results by synthesizing image-caption pairs with fewer defective elements and prioritizing high-quality data." />
<meta property="og:url" content="https://jianjieluo.github.io/SynthImgCap" />
<!-- Path to banner image, should be in the path listed below. Optimal dimensions are 1200x630 -->
<meta property="og:image" content="static/image/pcm_net_banner.png" />
<meta property="og:image:width" content="1200" />
<meta property="og:image:height" content="630" />
<meta name="twitter:title" content="Explore PCM-Net for Zero-Shot Image Captioning">
<meta name="twitter:description"
content="PCM-Net introduces a groundbreaking approach to zero-shot image captioning, blending synthetic image-caption pairs for enhanced accuracy and detail.">
<!-- Path to banner image, should be in the path listed below. Optimal dimensions are 1200x600 -->
<meta name="twitter:image" content="static/images/pcm_net_twitter_banner.png">
<meta name="twitter:card" content="summary_large_image">
<!-- Keywords for your paper to be indexed by-->
<meta name="keywords"
content="Zero-shot Image Captioning, CLIP, Attention Mechanism, PCM-Net, Synthetic Dataset, SynthImgCap Dataset, Image-Text Alignment, Encoder-Decoder Frameworks, MSCOCO, Flickr30k">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Unleashing Text-to-Image Diffusion Prior for Zero-Shot Image Captioning</title>
<link rel="icon" type="image/x-icon" href="static/images/favicon.ico">
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
<link rel="stylesheet" href="static/css/bulma.min.css">
<link rel="stylesheet" href="static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="static/css/bulma-slider.min.css">
<link rel="stylesheet" href="static/css/fontawesome.all.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="static/css/index.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
<script defer src="static/js/fontawesome.all.min.js"></script>
<script src="static/js/bulma-carousel.min.js"></script>
<script src="static/js/bulma-slider.min.js"></script>
<script src="static/js/index.js"></script>
</head>
<body>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title">Unleashing Text-to-Image Diffusion Prior for Zero-Shot Image
Captioning</h1>
<div class="is-size-5 publication-authors">
<!-- Paper authors -->
<span class="author-block">
<a href="https://jianjieluo.github.io/" target="_blank">Jianjie Luo</a><sup>1</sup>,</span>
<span class="author-block">
<a href="https://scholar.google.com/citations?user=RKEiP70AAAAJ&hl=en" target="_blank">Jingwen Chen</a><sup>2</sup>,</span>
<span class="author-block">
<a href="http://yehaoli.deepfun.club/" target="_blank">Yehao Li</a><sup>2</sup>,</span>
<span class="author-block">
<a href="https://scholar.google.com/citations?user=2RxXFPoAAAAJ&hl=en" target="_blank">Yingwei Pan</a><sup>2</sup>,</span>
<span class="author-block">
<a href="https://cse.sysu.edu.cn/content/2511" target="_blank">Jianlin Feng</a><sup>1</sup>,</span>
<span class="author-block">
<a href="https://scholar.google.com/citations?user=qnbpG6gAAAAJ&hl=en" target="_blank">Hongyang Chao</a><sup>1</sup>,</span>
<span class="author-block">
<a href="http://tingyao.deepfun.club/" target="_blank">Ting Yao</a><sup>2</sup>
</span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block"><sup>1</sup>Sun Yat-sen University, <sup>2</sup>HiDream.ai
Inc.<br>ECCV 2024</span>
</div>
<!-- PDF link -->
<span class="link-block">
<a href="static/pdfs/ECCV2024_main_CameraReady.pdf" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>PDF</span>
</a>
</span>
<!-- Github link -->
<span class="link-block">
<a href="https://github.com/jianjieluo/PCM-Net" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>
</span>
<!-- Dataset Link. -->
<span class="link-block">
<a href="#SynthImgCap" class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="far fa-images"></i>
</span>
<span>Dataset</span>
</a>
</span>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- Teaser video-->
<!-- Paper abstract -->
<section class="section hero is-light">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
Recently, zero-shot image captioning (ZIC), where only text data is available for training, has gained
increasing attention. The remarkable progress of text-to-image diffusion models offers the potential to
resolve this task by employing synthetic image-caption pairs generated by such a pre-trained prior.
Nonetheless, the defective details in the salient regions of the synthetic images introduce semantic
misalignment between the synthetic image and text, leading to compromised results. To address this
challenge, we propose a novel <b>P</b>atch-wise <b>C</b>ross-modal feature <b>M</b>ix-up (PCM) mechanism
to adaptively mitigate unfaithful content in a fine-grained manner during training; it can be
integrated into most encoder-decoder frameworks, yielding our PCM-Net. Specifically, for each input
image, salient visual concepts are first detected based on the image-text similarity in
CLIP space. Next, the patch-wise visual features of the input image are selectively fused with the textual
features of the salient visual concepts, leading to a mixed-up feature map with fewer defective elements.
Finally, a visual-semantic encoder is employed to refine the derived feature map, which is further
incorporated into the sentence decoder for caption generation. Additionally, to facilitate model
training with synthetic data, a novel CLIP-weighted cross-entropy loss is devised to prioritize the
high-quality image-text pairs over the low-quality counterparts. Extensive experiments on MSCOCO and
Flickr30k datasets demonstrate the superiority of our PCM-Net over state-of-the-art VLM-based
approaches, and the synthetic dataset <b>SynthImgCap</b> is released for further research in
vision-language representation learning.
</p>
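<p>
As an illustration of the two key ingredients above, the snippet below gives a minimal PyTorch-style sketch of
the patch-wise cross-modal feature mix-up and the CLIP-weighted cross-entropy loss. It is not the official
PCM-Net implementation: the sigmoid-based mixing rule, the weight normalization, and the hyper-parameters
(e.g. <code>tau</code>, <code>pad_id</code>) are illustrative assumptions; please consult the paper and the
released code for the exact formulation.
</p>
<pre><code># Minimal sketch of the PCM mix-up and the CLIP-weighted cross-entropy loss.
# NOT the official PCM-Net code: the mixing rule and hyper-parameters below
# (tau, pad_id, the softmax weight normalization) are illustrative assumptions.
import torch
import torch.nn.functional as F

def patchwise_crossmodal_mixup(patch_feats, concept_feats, tau=0.1):
    """Fuse CLIP patch features with text features of salient concepts.

    patch_feats:   (N, D) patch-wise visual features of one synthetic image
    concept_feats: (K, D) CLIP text features of the detected salient concepts
    Returns an (N, D) mixed-up feature map in which patches that align poorly
    with every concept rely more on the textual features.
    """
    v = F.normalize(patch_feats, dim=-1)
    t = F.normalize(concept_feats, dim=-1)
    sim = v @ t.t()                              # (N, K) patch-concept similarity
    best_sim, best_idx = sim.max(dim=-1)         # most related concept per patch
    # Low similarity suggests a defective patch, so lean on the textual feature.
    alpha = torch.sigmoid(best_sim / tau).unsqueeze(-1)            # (N, 1)
    return alpha * patch_feats + (1.0 - alpha) * concept_feats[best_idx]

def clip_weighted_cross_entropy(logits, targets, clip_scores, pad_id=0):
    """Cross-entropy where each synthetic pair is weighted by its CLIP
    image-text similarity, so higher-quality pairs dominate training.

    logits:      (B, L, V) sentence-decoder outputs
    targets:     (B, L)    ground-truth caption token ids
    clip_scores: (B,)      CLIP similarity of each synthetic image-caption pair
    """
    ce = F.cross_entropy(logits.transpose(1, 2), targets,
                         ignore_index=pad_id, reduction="none")    # (B, L)
    w = torch.softmax(clip_scores, dim=0) * clip_scores.numel()    # mean weight = 1
    return (w.unsqueeze(-1) * ce).sum() / (targets != pad_id).sum()
</code></pre>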
</div>
</div>
</div>
</div>
</section>
<!-- End paper abstract -->
<!-- SynthImgCap Dataset Introduction -->
<section class="hero is-small">
<div class="hero-body">
<div class="container is-max-desktop content">
<h2 class="title is-3" id="SynthImgCap">SynthImgCap Dataset</h2>
<div class="content has-text-justified">
<p>
The SynthImgCap dataset introduces fine-grained, high-quality synthetic image-caption pairs for zero-shot
(text-only) image captioning research, providing a cost-effective alternative to labor-intensive,
human-annotated paired data traditionally used in Vision-and-Language tasks.
It can also serve as a benchmark, inspiring the use of extensive web-scale image-text pairs to create even
larger synthetic image-text datasets for multimodal learning. Such synthetic data mitigates copyright issues
associated with web-crawled datasets, making it more conducive to open-source contributions and academic
development.
</p>
<p>
SynthImgCap is built with a text-to-image diffusion model that generates one synthetic image for each
sentence in the training corpus, yielding 542,401 and 144,541 synthetic image-text pairs for the MSCOCO-SD
and Flickr30k-SD datasets, respectively. The dataset is available on Google Drive and BaiduYun; a minimal
sketch of the generation loop is given below the download links.
</p>
<a href="https://drive.google.com/drive/folders/19ASNWUL9-WaCj2RJTQBP2vxh25VlPWS3?usp=sharing"
class="button is-primary">Google Drive</a>
<a href="https://pan.baidu.com/s/1ntmgvPO9hPBOLS0ecJtfWA" class="button is-primary">BaiduYun (extract code:
`vpkx`)</a>
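<p>
For reference, the snippet below is a minimal sketch of the caption-to-image generation loop used to build a
dataset of this kind, assuming the Hugging Face <code>diffusers</code> library and Stable Diffusion v1.5. The
actual diffusion model, resolution, and sampling settings behind SynthImgCap are not specified here, and the
file names (<code>captions.json</code>, <code>synthetic_pairs.json</code>) are hypothetical.
</p>
<pre><code># Sketch of generating one synthetic image per training caption.
# Assumptions: Hugging Face diffusers with Stable Diffusion v1.5; the model,
# resolution and sampler used for the released SynthImgCap may differ.
import json
from pathlib import Path

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Hypothetical input: a list of {"id": ..., "caption": ...} records.
captions = json.loads(Path("captions.json").read_text())
out_dir = Path("synthetic_images")
out_dir.mkdir(exist_ok=True)

pairs = []
for item in captions:
    image = pipe(item["caption"], num_inference_steps=50, guidance_scale=7.5).images[0]
    img_path = out_dir / f"{item['id']}.jpg"
    image.save(img_path)
    pairs.append({"image": str(img_path), "caption": item["caption"]})

Path("synthetic_pairs.json").write_text(json.dumps(pairs, indent=2))
</code></pre>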
</div>
</div>
</div>
</section>
<!--End SynthImgCap Dataset -->
<!-- Framework -->
<section class="hero is-small">
<div class="hero-body">
<div class="container is-max-desktop content">
<h2 class="title is-3">Framework</h2>
<img src="./static/images/framework.jpg" class="interpolation-image" alt="PCM-Net Framework." />
</div>
</div>
</section>
<!--Framework -->
<!-- Image carousel -->
<!-- Youtube video -->
<!-- Video carousel -->
<!-- Paper poster -->
<!--BibTex citation -->
<section class="section" id="BibTeX">
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<pre><code>@inproceedings{luo2024unleashing,
title = {Unleashing Text-to-Image Diffusion Prior for Zero-Shot Image Captioning},
author = {Luo, Jianjie and Chen, Jingwen and Li, Yehao and Pan, Yingwei and Feng, Jianlin and Chao, Hongyang and Yao, Ting},
booktitle = {European Conference on Computer Vision (ECCV)},
year = {2024}
}</code></pre>
</div>
</section>
<!--End BibTex citation -->
<!-- Map Widget Code -->
<div style="clear: both;"> </div>
<div align="center">
<a href='https://clustrmaps.com/site/1c1ar' title='Visit tracker'><img src='//clustrmaps.com/map_v2.png?cl=cbf0ff&w=a&t=tt&d=4NrtPK0qFvAx6oJNHIeD7RzAuXpoQ77NBRpLbSbOBoQ&co=32aaff'/></a>
</div>
<footer class="footer">
<div class="container">
<div class="columns is-centered">
<div class="column is-8">
<div class="content">
<p>
This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template"
target="_blank">Academic Project Page Template</a> which was adopted from the <a
href="https://nerfies.github.io" target="_blank">Nerfies</a> project page.
<br> This website is licensed under a <a rel="license"
href="http://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
Commons Attribution-ShareAlike 4.0 International License</a>.
</p>
</div>
</div>
</div>
</div>
</footer>
<!-- Statcounter tracking code -->
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->
<!-- End of Statcounter Code -->
</body>
</html>