<!-- index.html -->

<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="utf-8">
  <meta name="description" content="EMMA: Your Text-to-Image Diffusion Model Can Secretly Accept Multi-Modal Prompts">
  <meta name="keywords" content="EMMA">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>EMMA: Your Text-to-Image Diffusion Model Can Secretly Accept Multi-Modal Prompts</title>

  <!-- Web fonts. -->
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">

  <!-- Stylesheets: Bulma and its plugins, icon fonts, then the page's own rules. -->
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <link rel="icon" href="./static/icon.png">

  <!-- Scripts: jQuery is loaded first, ahead of the Bulma plugins and index.js. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>

  <style>
    /* Horizontal strip of GIFs, centred on both axes with no gap between items. */
    .gif-row {
      display: flex;
      /* Centre the GIFs horizontally. */
      justify-content: center;
      /* Centre the GIFs vertically when their heights differ. */
      align-items: center;
      /* No spacing between adjacent GIFs. */
      gap: 0px;
    }

    /* Cap each GIF at 30% of the row width so the row scales with the viewport. */
    .gif-row img {
      max-width: 30%;
    }

    /* Vertical stack of GIFs, centred horizontally. */
    .gif-column {
      display: flex;
      flex-direction: column;
      align-items: center;
    }

    /* Stacked GIFs: no vertical margin, same 30% width cap as the row variant. */
    .gif-column img {
      margin: 0px 0;
      max-width: 30%;
    }
  </style>
</head>

<!-- 
[**Yucheng Han**$^{\ast}$](https://tingxueronghua.github.io/), [**Rui Wang**$^{\ast\dag}$](https://wrong.wang/), [**Chi Zhang**$^{\ast}$](https://icoz69.github.io/) (corresponding author), Juntao Hu, Pei Cheng,
<br>
Bin Fu, [Hanwang Zhang](https://personal.ntu.edu.sg/hanwangzhang/) -->

<body>
  <!-- Yucheng Han, Rui Wang, Chi Zhang, Juntao Hu, Pei Cheng, Bin Fu, Hanwang Zhang -->
  <!-- Hero: paper title, author list, affiliations, footnote legend, and resource links. -->
  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="title is-1 publication-title">EMMA: Your Text-to-Image Diffusion Model Can Secretly Accept Multi-Modal Prompts
            </h1>

            <!-- Authors. This block is closed before the affiliations so that the
                 affiliation, legend and link blocks are its siblings, not its children. -->
            <div class="is-size-5 publication-authors">
              <span class="author-block">
                <a href="https://tingxueronghua.github.io/">Yucheng Han*</a>,</span>
              <span class="author-block">
                <a href="https://wrong.wang/">Rui Wang*&dagger;</a>,</span>
              <span class="author-block">
                <a href="https://icoz69.github.io/">Chi Zhang*&#10022;</a>,</span>
              <span class="author-block">
                <a href="https://tencentqqgylab.github.io/EMMA">Juntao Hu</a>,</span>
              <span class="author-block">
                <a href="https://tencentqqgylab.github.io/EMMA">Pei Cheng</a>,</span>
              <span class="author-block">
                <a href="https://tencentqqgylab.github.io/EMMA">Bin Fu</a>,</span>
              <span class="author-block">
                <a href="https://personal.ntu.edu.sg/hanwangzhang/">Hanwang Zhang</a></span>
            </div>

            <!-- Affiliations. -->
            <div class="is-size-5 publication-authors">
              <span class="author-block">Nanyang Technological University, Tencent</span>
            </div>

            <!-- Legend for the author markers used above. -->
            <div class="is-size-5 publication-authors">
              <span class="author-block">(* Equal contributions, &dagger; Project Lead, &#10022; Corresponding Author)</span>
            </div>

            <div class="column has-text-centered">
              <div class="publication-links">
                <!-- PDF link. -->
                <span class="link-block">
                  <a href="https://arxiv.org/pdf/2406.09162.pdf"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                    </span>
                    <span>Paper</span>
                  </a>
                </span>
                <!-- arXiv abstract link. -->
                <span class="link-block">
                  <a href="https://arxiv.org/abs/2406.09162"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="ai ai-arxiv"></i>
                    </span>
                    <span>arXiv</span>
                  </a>
                </span>
                <!-- Code link. -->
                <span class="link-block">
                  <a href="https://github.com/TencentQQGYLab/ELLA"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Code</span>
                  </a>
                </span>
                <!-- Huggingface demo link (placeholder: currently points at the project page). -->
                <span class="link-block">
                  <a href="https://tencentqqgylab.github.io/EMMA"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Huggingface Demo (Coming soon)</span>
                  </a>
                </span>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>


  <section class="section">
    <div class="container is-max-desktop">
      <!-- Teaser video, centred horizontally and vertically in its flex wrapper. -->
      <div class="columns is-centered has-text-centered">
        <div class="column">
          <div class="content has-text-justified">
            <div class="center-image">
              <div style="display: flex; justify-content: center; align-items: center;">
                <video id="teaser" controls loop playsinline aria-label="EMMA teaser video">
                  <!-- Codecs are declared inside the MIME type; a standalone codecs attribute is not valid on source. -->
                  <source src="./DreamBooth_files/video_teaser_480.mp4" type='video/mp4; codecs="avc1.42E01E, mp4a.40.2"'>
                </video>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>


  <section class="section">
    <div class="container is-max-desktop">
      <!-- Abstract: heading followed by one justified paragraph. -->
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <h2 class="title is-3">Abstract</h2>
          <div class="content has-text-justified">
            <p>
              Recent advancements in image generation have enabled the creation of high-quality images from text conditions.
              However, when facing multi-modal conditions, such as text combined with reference appearances, existing methods
              struggle to balance multiple conditions effectively, typically showing a preference for one modality over others.
              To address this challenge, we introduce EMMA, a novel image generation model accepting multi-modal prompts built
              upon the state-of-the-art text-to-image (T2I) diffusion model, ELLA. EMMA seamlessly incorporates additional
              modalities alongside text to guide image generation through an innovative Multi-modal Feature Connector design,
              which effectively integrates textual and supplementary modal information using a special attention mechanism. By
              freezing all parameters in the original T2I diffusion model and only adjusting some additional layers, we reveal
              an interesting finding that the pre-trained T2I diffusion model can secretly accept multi-modal prompts. This
              interesting property facilitates easy adaptation to different existing frameworks, making EMMA a flexible and
              effective tool for producing personalized and context-aware images and even videos. Additionally, we introduce a
              strategy to assemble learned EMMA modules to produce images conditioned on multiple modalities simultaneously,
              eliminating the need for additional training with mixed multi-modal prompts. Extensive experiments demonstrate the
              effectiveness of EMMA in maintaining high fidelity and detail in generated images, showcasing its potential as a
              robust solution for advanced multi-modal conditional image generation tasks.
            </p>
          </div>
        </div>
      </div>
      <!-- End of abstract. -->
    </div>
  </section>
  

  <section class="section">
    <div class="container is-max-desktop">
      <!-- Method: heading and introductory paragraph. -->
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <h2 class="title is-3">Method</h2>
          <div class="content has-text-justified">
            <p>
              Our proposed EMMA is built upon the state-of-the-art text-conditioned
              diffusion model ELLA, which trains a transformer-like module, named Perceiver Resampler, to
              connect text embeddings from pre-trained text encoders and pre-trained diffusion models for better text-guided
              image generation.
              ELLA has strong text-to-image generation ability, and our proposed EMMA could merge information from other
              modalities into text features for
              guidance.
            </p>
          </div>
        </div>
      </div>
      <!-- End of method introduction. -->
    </div>
  </section>
  

  <section class="section">
    <div class="container is-max-desktop">
      <!-- Figure: EMMA pipeline. -->
      <div class="columns is-centered has-text-centered">
        <div class="column">
          <div class="content has-text-justified">
            <div class="center-image">
              <figure>
                <img src="./DreamBooth_files/emma_pipeline.png" class="interpolation-image"
                  alt="Overview diagram of the EMMA pipeline">
              </figure>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>

  
  <section class="section">
    <div class="container is-max-desktop">
      <!-- Method (continued): the AGPR module. Deliberately no heading here; this
           paragraph continues the "Method" section, so no empty heading is emitted. -->
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <div class="content has-text-justified">
            <p>
              In detail, to control the image generation process by modalities beyond text, EMMA incorporates our proposed
              Assemblable Gated Perceiver Resampler (AGPR), which leverages cross-attention to inject information from
              additional modalities beyond texts. In our design, the AGPR blocks are strategically interleaved with the
              blocks
              of the Perceiver Resampler of ELLA. This arrangement ensures an effective integration of multi-modal
              information.
              During training, we freeze the raw modules of ELLA to maintain the control ability of text conditions.
            </p>
          </div>
        </div>
      </div>
      <!-- End of AGPR description. -->
    </div>
  </section>

  <section class="section">
    <div class="container is-max-desktop">
      <!-- Figure: EMMA method diagram. -->
      <div class="columns is-centered has-text-centered">
        <div class="column">
          <div class="content has-text-justified">
            <div class="center-image">
              <figure>
                <img src="./DreamBooth_files/emma_methods.png" class="interpolation-image"
                  alt="Diagram of the EMMA method">
              </figure>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>

  <section class="section">
    <div class="container is-max-desktop">
      <!-- Method (continued): combining conditions through the gate mechanism.
           Deliberately no heading here, so no empty heading is emitted. -->
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <div class="content has-text-justified">
            <p>
              Notably, EMMA is inherently designed to handle multi-modal prompts as conditions, allowing for the
              straightforward combination of different multi-modal configurations. This is achieved by the gate mechanism in
              our
              AGPR, which could control the way of injecting information from other modalities into the textual features.
              This
              advantage enables diverse and complex inputs to be synthesized into a unified generation framework without the
              need for additional training.
            </p>
          </div>
        </div>
      </div>
      <!-- End of gate-mechanism description. -->
    </div>
  </section>

  <section class="section">
    <div class="container is-max-desktop">
      <!-- Figure: assembling EMMA modules. -->
      <div class="columns is-centered has-text-centered">
        <div class="column">
          <div class="content has-text-justified">
            <div class="center-image">
              <figure>
                <img src="./DreamBooth_files/emma_ensemble_methods.png" class="interpolation-image"
                  alt="Diagram of how learned EMMA modules are assembled to combine multiple conditions">
              </figure>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>

  <section class="section">
    <div class="container is-max-desktop">
      <!-- Results: text + portrait conditions (heading and description). -->
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <h2 class="title is-3">Image Generation with text + portrait conditions</h2>
          <div class="content has-text-justified">
            <p>
              Here, we present additional images generated by EMMA under text + portrait conditions. Various portraits, each
              with unique features, adhere to the same prompts, demonstrating our model's excellent control over text
              conditioning and its ability to preserve individual identities.
            </p>
          </div>
        </div>
      </div>
      <!-- End of text + portrait description. -->
    </div>
  </section>
  <section class="section">
    <div class="container is-max-desktop">
      <!-- Figure: text + portrait results. -->
      <div class="columns is-centered has-text-centered">
        <div class="column">
          <div class="content has-text-justified">
            <div class="center-image">
              <figure>
                <img src="./DreamBooth_files/res_main.png" class="interpolation-image"
                  alt="Images generated by EMMA from text and portrait conditions">
              </figure>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>


  <section class="section">
    <div class="container is-max-desktop">
      <!-- Results: text + portrait conditions with ToonYou (heading and description). -->
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <h2 class="title is-3">Image Generation with text + portrait conditions using ToonYou</h2>
          <div class="content has-text-justified">
            <p>
              Given a text prompt and a portrait, our proposed EMMA can integrate with various diffusion models to generate
              images in different styles. Here are the images created using EMMA in conjunction with ToonYou.
            </p>
          </div>
        </div>
      </div>
      <!-- End of ToonYou description. -->
    </div>
  </section>
  <section class="section">
    <div class="container is-max-desktop">
      <!-- Figure: ToonYou results. -->
      <div class="columns is-centered has-text-centered">
        <div class="column">
          <div class="content has-text-justified">
            <div class="center-image">
              <figure>
                <img src="./DreamBooth_files/res_toonyou.png" class="interpolation-image"
                  alt="Images generated by EMMA with ToonYou from text and portrait conditions">
              </figure>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>


  <section class="section">
    <div class="container is-max-desktop">
      <!-- Results: text + portrait conditions with AnimateDiff (heading and description). -->
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <h2 class="title is-3">Image Generation with text + portrait conditions using AnimateDiff</h2>
          <div class="content has-text-justified">
            <p>
              Given a portrait and a prompt, our proposed EMMA, combined with the AnimateDiff diffusion model, can generate
              images that preserve portrait details while adhering to text instructions.
            </p>
          </div>
        </div>
      </div>
      <!-- End of AnimateDiff description. -->
    </div>
  </section>
  <section class="section">
    <div class="container is-max-desktop">
      <!-- AnimateDiff results: three GIFs laid out side by side by the .gif-row rule. -->
      <div class="columns is-centered has-text-centered">
        <div class="column">
          <div class="content has-text-justified">
            <div class="center-image">
              <div class="gif-row">
                <img src="./DreamBooth_files/woman_blue_dress.gif"
                  alt="Animation generated by EMMA with AnimateDiff: woman in a blue dress">
                <img src="./DreamBooth_files/woman_green_dress.gif"
                  alt="Animation generated by EMMA with AnimateDiff: woman in a green dress">
                <img src="./DreamBooth_files/woman_purple_dress.gif"
                  alt="Animation generated by EMMA with AnimateDiff: woman in a purple dress">
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>

  <section class="section">
    <div class="container is-max-desktop">
      <!-- Story telling: heading and description. -->
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <h2 class="title is-3">Story Telling</h2>
          <div class="content has-text-justified">
            <p>
              Images generated by our EMMA with portrait conditions. Two sets of images are generated for two separate stories.
              The first set of images is about a mailing woman chased by a dog. The second set of images is about a man finding
              treasures.
            </p>
          </div>
        </div>
      </div>
      <!-- End of story telling description. -->
    </div>
  </section>
  <section class="section">
    <div class="container is-max-desktop">
      <!-- Figure: story telling results. -->
      <div class="columns is-centered has-text-centered">
        <div class="column">
          <div class="content has-text-justified">
            <div class="center-image">
              <figure>
                <img src="./DreamBooth_files/story_diffusion.png" class="interpolation-image"
                  alt="Two sets of images generated by EMMA with portrait conditions, each illustrating a story">
              </figure>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>


  <section class="section" id="BibTeX">
    <div class="container is-max-desktop content">
      <h2 class="title">BibTeX</h2>
      <!-- Citation entry. The text starts immediately after the opening tags and stays
           flush-left because preformatted content renders its whitespace verbatim.
           primaryClass holds only the arXiv category identifier. -->
      <pre><code>@misc{han2024emma,
  title={EMMA: Your Text-to-Image Diffusion Model Can Secretly Accept Multi-Modal Prompts},
  author={Yucheng Han and Rui Wang and Chi Zhang and Juntao Hu and Pei Cheng and Bin Fu and Hanwang Zhang},
  year={2024},
  eprint={2406.09162},
  archivePrefix={arXiv},
  primaryClass={cs.CV}
}</code></pre>
    </div>
  </section>


  <!-- Page footer: icon shortcuts to the paper and project page, plus licence notice. -->
  <footer class="footer">
    <div class="container">
      <div class="content has-text-centered">
        <!-- Icon-only links carry an aria-label so they have an accessible name. -->
        <a class="icon-link" href="https://arxiv.org/pdf/2406.09162.pdf" aria-label="Paper (PDF)">
          <i class="fas fa-file-pdf"></i>
        </a>
        <!-- Only the first class attribute was ever applied, so the duplicate is dropped;
             disabled is not a valid attribute on a link and is removed as well. -->
        <a class="icon-link" href="https://tencentqqgylab.github.io/EMMA" aria-label="EMMA project page">
          <i class="fab fa-github"></i>
        </a>
      </div>
      <div class="columns is-centered">
        <div class="column is-8">
          <div class="content">
            <p>
              This website is licensed under a <a rel="license"
                href="https://creativecommons.org/licenses/by-sa/4.0/">Creative
                Commons Attribution-ShareAlike 4.0 International License</a>.
            </p>
            <p>
              This means you are free to borrow the <a href="https://github.com/nerfies/nerfies.github.io">source
                code</a> of this website,
              we just ask that you link back to this page in the footer.
              Please remember to remove the analytics code included in the header of the website which
              you do not want on your website.
            </p>
          </div>
        </div>
      </div>
    </div>
  </footer>

</body>

</html>