From 08a182835fd654af397e0e25a7c7829470ca7b72 Mon Sep 17 00:00:00 2001 From: okotaku Date: Fri, 8 Sep 2023 12:14:35 +0900 Subject: [PATCH 001/139] add ip-adapter --- src/diffusers/pipelines/__init__.py | 1 + .../pipelines/ip_adapter/__init__.py | 2 + .../ip_adapter/attention_processor.py | 406 ++++++++++++++++++ .../pipelines/ip_adapter/image_projection.py | 24 ++ .../ip_adapter/pipeline_ip_adapter.py | 234 ++++++++++ 5 files changed, 667 insertions(+) create mode 100644 src/diffusers/pipelines/ip_adapter/__init__.py create mode 100644 src/diffusers/pipelines/ip_adapter/attention_processor.py create mode 100644 src/diffusers/pipelines/ip_adapter/image_projection.py create mode 100644 src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 28f42ce9fae9..6677a6e69179 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -63,6 +63,7 @@ IFPipeline, IFSuperResolutionPipeline, ) + from .ip_adapter import IPAdapterPipeline from .kandinsky import ( KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, diff --git a/src/diffusers/pipelines/ip_adapter/__init__.py b/src/diffusers/pipelines/ip_adapter/__init__.py new file mode 100644 index 000000000000..b3f2b3953782 --- /dev/null +++ b/src/diffusers/pipelines/ip_adapter/__init__.py @@ -0,0 +1,2 @@ +from .image_projection import ImageProjModel +from .pipeline_ip_adapter import IPAdapterPipeline diff --git a/src/diffusers/pipelines/ip_adapter/attention_processor.py b/src/diffusers/pipelines/ip_adapter/attention_processor.py new file mode 100644 index 000000000000..df12e387eebc --- /dev/null +++ b/src/diffusers/pipelines/ip_adapter/attention_processor.py @@ -0,0 +1,406 @@ +import warnings + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class IPAttnProcessor(nn.Module): + r""" + Attention processor for IP-Adapater. + Args: + hidden_size (`int`): + The hidden size of the attention layer. + cross_attention_dim (`int`): + The number of channels in the `encoder_hidden_states`. + text_context_len (`int`, defaults to 77): + The context length of the text features. + scale (`float`, defaults to 1.0): + the weight scale of image prompt. 
+ """ + + def __init__(self, hidden_size, cross_attention_dim=None, text_context_len=77, scale=1.0): + super().__init__() + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.text_context_len = text_context_len + self.scale = scale + + self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + temb=None, + scale=1.0, + ): + if scale != 1.0: + warnings.warn("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + # split hidden states + encoder_hidden_states, ip_hidden_states = ( + encoder_hidden_states[:, : self.text_context_len, :], + encoder_hidden_states[:, self.text_context_len :, :], + ) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # for ip-adapter + ip_key = self.to_k_ip(ip_hidden_states) + ip_value = self.to_v_ip(ip_hidden_states) + + ip_key = attn.head_to_batch_dim(ip_key) + ip_value = attn.head_to_batch_dim(ip_value) + + ip_attention_probs = attn.get_attention_scores(query, ip_key, None) + ip_hidden_states = torch.bmm(ip_attention_probs, ip_value) + ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states) + + hidden_states = hidden_states + self.scale * ip_hidden_states + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +class IPAttnProcessor2_0(torch.nn.Module): + r""" + Attention processor for IP-Adapater for PyTorch 2.0. + Args: + hidden_size (`int`): + The hidden size of the attention layer. + cross_attention_dim (`int`): + The number of channels in the `encoder_hidden_states`. + text_context_len (`int`, defaults to 77): + The context length of the text features. + scale (`float`, defaults to 1.0): + the weight scale of image prompt. 
+ """ + + def __init__(self, hidden_size, cross_attention_dim=None, text_context_len=77, scale=1.0): + super().__init__() + + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.text_context_len = text_context_len + self.scale = scale + + self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + temb=None, + scale=1.0, + ): + if scale != 1.0: + warnings.warn("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + # split hidden states + encoder_hidden_states, ip_hidden_states = ( + encoder_hidden_states[:, : self.text_context_len, :], + encoder_hidden_states[:, self.text_context_len :, :], + ) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + # for ip-adapter + ip_key = self.to_k_ip(ip_hidden_states) + ip_value = self.to_v_ip(ip_hidden_states) + + ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + ip_hidden_states = F.scaled_dot_product_attention( + query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False + ) + + ip_hidden_states = ip_hidden_states.transpose(1, 
2).reshape(batch_size, -1, attn.heads * head_dim) + ip_hidden_states = ip_hidden_states.to(query.dtype) + + hidden_states = hidden_states + self.scale * ip_hidden_states + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +## for controlnet +class CNAttnProcessor: + r""" + Default processor for performing attention-related computations. + """ + + def __init__(self, text_context_len=77): + self.text_context_len = text_context_len + + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + temb=None, + scale=1.0, + ): + if scale != 1.0: + warnings.warn("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = encoder_hidden_states[:, : self.text_context_len] # only use text + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +class CNAttnProcessor2_0: + r""" + Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). 
+ """ + + def __init__(self, text_context_len=77): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + self.text_context_len = text_context_len + + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + temb=None, + scale=1.0, + ): + if scale != 1.0: + warnings.warn("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = encoder_hidden_states[:, : self.text_context_len] + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states diff --git a/src/diffusers/pipelines/ip_adapter/image_projection.py b/src/diffusers/pipelines/ip_adapter/image_projection.py new file mode 100644 index 000000000000..c609dff16a73 --- /dev/null +++ b/src/diffusers/pipelines/ip_adapter/image_projection.py @@ -0,0 +1,24 @@ +from torch import nn + +from ...configuration_utils import ConfigMixin +from ...models.modeling_utils import ModelMixin + + +class ImageProjModel(ModelMixin, ConfigMixin): + """Projection Model""" + + def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4): + super().__init__() + + self.cross_attention_dim = cross_attention_dim + 
self.clip_extra_context_tokens = clip_extra_context_tokens + self.proj = nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim) + self.norm = nn.LayerNorm(cross_attention_dim) + + def forward(self, image_embeds): + embeds = image_embeds + clip_extra_context_tokens = self.proj(embeds).reshape( + -1, self.clip_extra_context_tokens, self.cross_attention_dim + ) + clip_extra_context_tokens = self.norm(clip_extra_context_tokens) + return clip_extra_context_tokens diff --git a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py new file mode 100644 index 000000000000..d87c659ff269 --- /dev/null +++ b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py @@ -0,0 +1,234 @@ +from typing import Any, Dict, List, Optional, Union + +import PIL +import torch +import torch.nn.functional as F +from torch import nn +from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection + +from ...models.attention_processor import AttnProcessor, AttnProcessor2_0 +from ...utils import ( + DIFFUSERS_CACHE, + HF_HUB_OFFLINE, + _get_model_file, +) +from ..pipeline_utils import DiffusionPipeline +from .attention_processor import CNAttnProcessor, CNAttnProcessor2_0, IPAttnProcessor, IPAttnProcessor2_0 +from .image_projection import ImageProjModel + + +class IPAdapterPipeline(DiffusionPipeline): + def __init__( + self, + pipeline: DiffusionPipeline, + image_projection: ImageProjModel, + image_encoder: CLIPVisionModelWithProjection, + image_processor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + pipeline=pipeline, + image_encoder=image_encoder, + image_processor=image_processor, + image_projection=image_projection, + ) + self._set_ip_adapter() + + def _set_ip_adapter(self): + unet = self.pipeline.unet + attn_procs = {} + for name in unet.attn_processors.keys(): + cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim + if name.startswith("mid_block"): + hidden_size = unet.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = unet.config.block_out_channels[block_id] + if cross_attention_dim is None: + attn_processor_class = ( + AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor + ) + attn_procs[name] = attn_processor_class() + else: + attn_processor_class = ( + IPAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAttnProcessor + ) + attn_procs[name] = attn_processor_class( + hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0 + ) + unet.set_attn_processor(attn_procs) + if hasattr(self.pipeline, "controlnet"): + attn_processor_class = ( + CNAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else CNAttnProcessor + ) + self.pipeline.controlnet.set_attn_processor(attn_processor_class()) + + def load_ip_adapter( + self, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], + **kwargs, + ): + """ + Parameters: + pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): + Can be either: + + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. 
+ - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved + with [`ModelMixin.save_pretrained`]. + - A [torch state + dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). + + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + use_auth_token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + subfolder (`str`, *optional*, defaults to `""`): + The subfolder location of a model file within a larger model repository on the Hub or locally. + """ + # Load the main state dict first which has the LoRA layers for either of + # UNet and text encoder or both. 
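+        # (The checkpoint resolved below is the IP-Adapter state dict; its "ip_adapter"
+        # entry holds the to_k_ip / to_v_ip weights that are loaded into the attention
+        # processors at the end of this method.)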
+ cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) + weight_name = kwargs.pop("weight_name", None) + + user_agent = { + "file_type": "attn_procs_weights", + "framework": "pytorch", + } + + if not isinstance(pretrained_model_name_or_path_or_dict, dict): + model_file = _get_model_file( + pretrained_model_name_or_path_or_dict, + weights_name=weight_name, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + ) + state_dict = torch.load(model_file, map_location="cpu") + else: + state_dict = pretrained_model_name_or_path_or_dict + ip_layers = torch.nn.ModuleList( + [ + module if isinstance(module, nn.Module) else nn.Identity() + for module in self.pipeline.unet.attn_processors.values() + ] + ) + ip_layers.load_state_dict(state_dict["ip_adapter"]) + + def set_scale(self, scale): + for attn_processor in self.pipeline.unet.attn_processors.values(): + if isinstance(attn_processor, IPAttnProcessor): + attn_processor.scale = scale + + def _encode_image(self, image, device, num_images_per_prompt): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.image_processor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + (image_embeddings,) = self.image_encoder(image).image_embeds + image_prompt_embeds = self.image_projection(image_embeddings) + uncond_image_prompt_embeds = self.image_projection(torch.zeros_like(image_embeddings)) + + # duplicate image embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = image_prompt_embeds.shape + image_prompt_embeds = image_prompt_embeds.repeat(1, num_images_per_prompt, 1) + image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_images_per_prompt, 1) + uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + return image_prompt_embeds, uncond_image_prompt_embeds + + def to(self, *args, **kwargs): + super(IPAdapterPipeline, self).to(*args, **kwargs) + self.pipeline.to(*args, **kwargs) + + @torch.no_grad() + def __call__( + self, + *args, + example_image: Union[torch.FloatTensor, PIL.Image.Image], + prompt: Union[str, List[str]] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ip_adapter_scale: float = 1.0, + **kwargs, + ): + # 0. Set IP Adapter scale + self.set_scale(ip_adapter_scale) + + # 1. Define call parameters + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . 
`guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 2. Encode input image + image_embeddings, uncond_image_embeddings = self._encode_image(example_image, device, num_images_per_prompt) + + # 3. Encode prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.pipeline.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + ) + prompt_embeds = torch.cat([prompt_embeds, image_embeddings], dim=1) + negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_embeddings], dim=1) + + return self.pipeline( + guidance_scale=guidance_scale, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + num_images_per_prompt=num_images_per_prompt, + cross_attention_kwargs=cross_attention_kwargs, + **kwargs, + ) From c4646f876ad881e5dd733ae4b95dcdb6edc15cb0 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 2 Nov 2023 12:51:22 +0530 Subject: [PATCH 002/139] modularize. --- .../ip_adapter/attention_processor.py | 39 +- .../pipelines/ip_adapter/image_projection.py | 19 +- .../ip_adapter/pipeline_ip_adapter.py | 413 ++++++++++++++++-- 3 files changed, 426 insertions(+), 45 deletions(-) diff --git a/src/diffusers/pipelines/ip_adapter/attention_processor.py b/src/diffusers/pipelines/ip_adapter/attention_processor.py index df12e387eebc..cdff79694e9f 100644 --- a/src/diffusers/pipelines/ip_adapter/attention_processor.py +++ b/src/diffusers/pipelines/ip_adapter/attention_processor.py @@ -1,13 +1,31 @@ -import warnings +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import torch import torch.nn as nn import torch.nn.functional as F +from ...utils import logging + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + class IPAttnProcessor(nn.Module): r""" Attention processor for IP-Adapater. + Args: hidden_size (`int`): The hidden size of the attention layer. @@ -40,7 +58,7 @@ def __call__( scale=1.0, ): if scale != 1.0: - warnings.warn("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") + logger.warning("`scale` of IPAttnProcessor should be set with `IPAdapterPipeline.set_scale`.") residual = hidden_states if attn.spatial_norm is not None: @@ -116,6 +134,7 @@ def __call__( class IPAttnProcessor2_0(torch.nn.Module): r""" Attention processor for IP-Adapater for PyTorch 2.0. + Args: hidden_size (`int`): The hidden size of the attention layer. 
@@ -131,7 +150,9 @@ def __init__(self, hidden_size, cross_attention_dim=None, text_context_len=77, s super().__init__() if not hasattr(F, "scaled_dot_product_attention"): - raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + raise ImportError( + f"{self.__class__.__name__} requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0." + ) self.hidden_size = hidden_size self.cross_attention_dim = cross_attention_dim @@ -151,7 +172,7 @@ def __call__( scale=1.0, ): if scale != 1.0: - warnings.warn("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") + logger.warning("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") residual = hidden_states if attn.spatial_norm is not None: @@ -243,7 +264,7 @@ def __call__( return hidden_states -## for controlnet +## for ControlNet class CNAttnProcessor: r""" Default processor for performing attention-related computations. @@ -262,7 +283,7 @@ def __call__( scale=1.0, ): if scale != 1.0: - warnings.warn("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") + logger.warning("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") residual = hidden_states if attn.spatial_norm is not None: @@ -324,7 +345,9 @@ class CNAttnProcessor2_0: def __init__(self, text_context_len=77): if not hasattr(F, "scaled_dot_product_attention"): - raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + raise ImportError( + f"{self.__class__.__name__} requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0." + ) self.text_context_len = text_context_len def __call__( @@ -337,7 +360,7 @@ def __call__( scale=1.0, ): if scale != 1.0: - warnings.warn("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") + logger.warning("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") residual = hidden_states if attn.spatial_norm is not None: diff --git a/src/diffusers/pipelines/ip_adapter/image_projection.py b/src/diffusers/pipelines/ip_adapter/image_projection.py index c609dff16a73..e46142f5716f 100644 --- a/src/diffusers/pipelines/ip_adapter/image_projection.py +++ b/src/diffusers/pipelines/ip_adapter/image_projection.py @@ -1,11 +1,26 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + from torch import nn from ...configuration_utils import ConfigMixin from ...models.modeling_utils import ModelMixin -class ImageProjModel(ModelMixin, ConfigMixin): - """Projection Model""" +class ImageProjectionModel(ModelMixin, ConfigMixin): + """Image Projection Model.""" def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4): super().__init__() diff --git a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py index d87c659ff269..3087c0e26e59 100644 --- a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py +++ b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py @@ -1,37 +1,79 @@ -from typing import Any, Dict, List, Optional, Union +# Copyright 2023 IP Adapter Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Callable, Dict, List, Optional, Union import PIL import torch import torch.nn.functional as F from torch import nn -from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import AttnProcessor, AttnProcessor2_0 -from ...utils import ( - DIFFUSERS_CACHE, - HF_HUB_OFFLINE, - _get_model_file, -) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import DIFFUSERS_CACHE, HF_HUB_OFFLINE, _get_model_file, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline -from .attention_processor import CNAttnProcessor, CNAttnProcessor2_0, IPAttnProcessor, IPAttnProcessor2_0 -from .image_projection import ImageProjModel +from ..stable_diffusion import StableDiffusionPipelineOutput +from .attention_processor import IPAttnProcessor, IPAttnProcessor2_0 +from .image_projection import ImageProjectionModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class IPAdapterPipeline(DiffusionPipeline): +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +class StableDiffusionIPAdapterPipeline(DiffusionPipeline): def __init__( self, - pipeline: DiffusionPipeline, - image_projection: ImageProjModel, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + image_projection: ImageProjectionModel, image_encoder: CLIPVisionModelWithProjection, image_processor: CLIPImageProcessor, + scheduler: KarrasDiffusionSchedulers, ): super().__init__() self.register_modules( - pipeline=pipeline, + vae=vae, + unet=unet, + tokenizer=tokenizer, + text_encoder=text_encoder, image_encoder=image_encoder, image_processor=image_processor, image_projection=image_projection, + scheduler=scheduler, ) self._set_ip_adapter() @@ -61,11 +103,44 @@ def _set_ip_adapter(self): hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0 ) unet.set_attn_processor(attn_procs) - if hasattr(self.pipeline, "controlnet"): - attn_processor_class = ( - CNAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else CNAttnProcessor - ) - self.pipeline.controlnet.set_attn_processor(attn_processor_class()) + + # TODO: create a separate pipeline for this: `StableDiffusionControlNetIPAdapterPipeline`. + # if hasattr(self.pipeline, "controlnet"): + # attn_processor_class = ( + # CNAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else CNAttnProcessor + # ) + # self.pipeline.controlnet.set_attn_processor(attn_processor_class()) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.vae.enable_tiling() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. 
+ """ + self.vae.disable_tiling() def load_ip_adapter( self, @@ -152,10 +227,170 @@ def load_ip_adapter( def set_scale(self, scale): for attn_processor in self.pipeline.unet.attn_processors.values(): - if isinstance(attn_processor, IPAttnProcessor): + if isinstance(attn_processor, (IPAttnProcessor, IPAttnProcessor2_0)): attn_processor.scale = scale - def _encode_image(self, image, device, num_images_per_prompt): + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + return prompt_embeds, negative_prompt_embeds + + def encode_image(self, image, device, num_images_per_prompt): dtype = next(self.image_encoder.parameters()).dtype if not isinstance(image, torch.Tensor): @@ -175,43 +410,75 @@ def _encode_image(self, image, device, num_images_per_prompt): return image_prompt_embeds, uncond_image_prompt_embeds - def to(self, *args, **kwargs): - super(IPAdapterPipeline, self).to(*args, **kwargs) - self.pipeline.to(*args, **kwargs) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents @torch.no_grad() def __call__( self, - *args, - example_image: Union[torch.FloatTensor, PIL.Image.Image], prompt: Union[str, List[str]] = None, + image: Union[torch.FloatTensor, PIL.Image.Image] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, guidance_scale: float = 7.5, + guidance_rescale: float = 0.0, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, ip_adapter_scale: float = 1.0, - **kwargs, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, ): # 0. Set IP Adapter scale self.set_scale(ip_adapter_scale) + # 1. Check inputs and raise error if needed. + # TODO + # 1. Define call parameters device = self._execution_device + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 # 2. Encode input image - image_embeddings, uncond_image_embeddings = self._encode_image(example_image, device, num_images_per_prompt) + image_embeddings, uncond_image_embeddings = self.encode_image(image, device, num_images_per_prompt) # 3. Encode prompt text_encoder_lora_scale = ( cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) - prompt_embeds, negative_prompt_embeds = self.pipeline.encode_prompt( + prompt_embeds, negative_prompt_embeds = self.encode_prompt( prompt, device, num_images_per_prompt, @@ -224,11 +491,87 @@ def __call__( prompt_embeds = torch.cat([prompt_embeds, image_embeddings], dim=1) negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_embeddings], dim=1) - return self.pipeline( - guidance_scale=guidance_scale, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - num_images_per_prompt=num_images_per_prompt, - cross_attention_kwargs=cross_attention_kwargs, - **kwargs, + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. 
Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + if do_classifier_free_guidance and guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + has_nsfw_concept = None + # image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # # Offload last model to CPU + # if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + # self.final_offload_hook.offload() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) From f3755d490566f3ffe66ba0818915e2da48cdb954 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 2 Nov 2023 13:00:08 +0530 Subject: [PATCH 003/139] add to inits. 
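
With the pipeline and the projection model now exported from the package roots, a rough
sketch of how the pieces are meant to be wired together follows. The checkpoint names and
the IP-Adapter weight file below are placeholders for illustration, not part of this change,
and the snippet only shows the intended wiring rather than a finished workflow:

    from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
    from diffusers import StableDiffusionPipeline, StableDiffusionIPAdapterPipeline
    from diffusers.pipelines.ip_adapter import ImageProjectionModel

    # Reuse the components of an existing Stable Diffusion checkpoint.
    base = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

    # CLIP vision tower whose pooled projection feeds the image projection model.
    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
        "h94/IP-Adapter", subfolder="models/image_encoder"
    )
    image_projection = ImageProjectionModel(
        cross_attention_dim=base.unet.config.cross_attention_dim,
        clip_embeddings_dim=image_encoder.config.projection_dim,
    )

    pipe = StableDiffusionIPAdapterPipeline(
        vae=base.vae,
        text_encoder=base.text_encoder,
        tokenizer=base.tokenizer,
        unet=base.unet,
        image_projection=image_projection,
        image_encoder=image_encoder,
        image_processor=CLIPImageProcessor(),
        scheduler=base.scheduler,
    )

    # Load the extra to_k_ip / to_v_ip attention weights into the UNet processors.
    pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")

    reference = ...  # a PIL image used as the image prompt
    result = pipe(prompt="a photo of a dog", image=reference, ip_adapter_scale=0.6)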
--- src/diffusers/__init__.py | 1 + src/diffusers/pipelines/__init__.py | 2 +- src/diffusers/pipelines/ip_adapter/__init__.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index d72c671671c1..aa5ca257dc71 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -80,6 +80,7 @@ PNDMPipeline, RePaintPipeline, ScoreSdeVePipeline, + StableDiffusionIPAdapterPipeline, ) from .schedulers import ( CMStochasticIterativeScheduler, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 6677a6e69179..631816e6382d 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -63,7 +63,7 @@ IFPipeline, IFSuperResolutionPipeline, ) - from .ip_adapter import IPAdapterPipeline + from .ip_adapter import StableDiffusionIPAdapterPipeline from .kandinsky import ( KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, diff --git a/src/diffusers/pipelines/ip_adapter/__init__.py b/src/diffusers/pipelines/ip_adapter/__init__.py index b3f2b3953782..389d0cb4c0c2 100644 --- a/src/diffusers/pipelines/ip_adapter/__init__.py +++ b/src/diffusers/pipelines/ip_adapter/__init__.py @@ -1,2 +1,2 @@ -from .image_projection import ImageProjModel -from .pipeline_ip_adapter import IPAdapterPipeline +from .image_projection import ImageProjectionModel +from .pipeline_ip_adapter import StableDiffusionIPAdapterPipeline From 5887af07e70590ae723a6516160257b1bfcb31e8 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 2 Nov 2023 14:07:24 +0530 Subject: [PATCH 004/139] fix --- src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py index 3087c0e26e59..4dc0870c21bc 100644 --- a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py +++ b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py @@ -78,7 +78,7 @@ def __init__( self._set_ip_adapter() def _set_ip_adapter(self): - unet = self.pipeline.unet + unet = self.unet attn_procs = {} for name in unet.attn_processors.keys(): cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim @@ -220,13 +220,13 @@ def load_ip_adapter( ip_layers = torch.nn.ModuleList( [ module if isinstance(module, nn.Module) else nn.Identity() - for module in self.pipeline.unet.attn_processors.values() + for module in self.unet.attn_processors.values() ] ) ip_layers.load_state_dict(state_dict["ip_adapter"]) def set_scale(self, scale): - for attn_processor in self.pipeline.unet.attn_processors.values(): + for attn_processor in self.unet.attn_processors.values(): if isinstance(attn_processor, (IPAttnProcessor, IPAttnProcessor2_0)): attn_processor.scale = scale From f9aaa54aa6aa07a9da0b2ca0f91c363c0f596bd5 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 2 Nov 2023 14:14:13 +0530 Subject: [PATCH 005/139] fix --- src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py index 4dc0870c21bc..facee0cd9965 100644 --- a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py +++ b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py @@ -75,6 +75,7 @@ def __init__( image_projection=image_projection, scheduler=scheduler, ) + self.vae_scale_factor = 2 
** (len(self.vae.config.block_out_channels) - 1) self._set_ip_adapter() def _set_ip_adapter(self): From a45292b4c92671c85271f01f9c23c0c38ce73b23 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 2 Nov 2023 14:16:10 +0530 Subject: [PATCH 006/139] fix --- src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py index facee0cd9965..24df788a958c 100644 --- a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py +++ b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py @@ -460,6 +460,9 @@ def __call__( # TODO # 1. Define call parameters + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + device = self._execution_device if prompt is not None and isinstance(prompt, str): batch_size = 1 From 023c2b7c20ff198c09fb09e1560d27c8fbe94ee5 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 2 Nov 2023 14:17:43 +0530 Subject: [PATCH 007/139] fix --- .../ip_adapter/pipeline_ip_adapter.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py index 24df788a958c..e4793a42ea2c 100644 --- a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py +++ b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py @@ -17,6 +17,7 @@ import PIL import torch import torch.nn.functional as F +import inspect from torch import nn from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection @@ -231,6 +232,24 @@ def set_scale(self, scale): if isinstance(attn_processor, (IPAttnProcessor, IPAttnProcessor2_0)): attn_processor.scale = scale + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt def encode_prompt( self, From 8fe3064c5f9f51fc786bd23942d7f9910598a53b Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 2 Nov 2023 14:17:58 +0530 Subject: [PATCH 008/139] fix --- src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py index e4793a42ea2c..d851f7e4abd5 100644 --- a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py +++ b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import inspect from typing import Any, Callable, Dict, List, Optional, Union import PIL import torch import torch.nn.functional as F -import inspect from torch import nn from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection From dded7c41ba40762097092638b5d30c00b7c2b1bf Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 2 Nov 2023 14:27:09 +0530 Subject: [PATCH 009/139] fix --- .../pipelines/ip_adapter/pipeline_ip_adapter.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py index d851f7e4abd5..102b858db433 100644 --- a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py +++ b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py @@ -15,12 +15,12 @@ import inspect from typing import Any, Callable, Dict, List, Optional, Union -import PIL import torch import torch.nn.functional as F from torch import nn from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import AttnProcessor, AttnProcessor2_0 @@ -61,7 +61,7 @@ def __init__( unet: UNet2DConditionModel, image_projection: ImageProjectionModel, image_encoder: CLIPVisionModelWithProjection, - image_processor: CLIPImageProcessor, + ip_adapter_image_processor: CLIPImageProcessor, scheduler: KarrasDiffusionSchedulers, ): super().__init__() @@ -72,11 +72,12 @@ def __init__( tokenizer=tokenizer, text_encoder=text_encoder, image_encoder=image_encoder, - image_processor=image_processor, + ip_adapter_image_processor=ip_adapter_image_processor, image_projection=image_projection, scheduler=scheduler, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) self._set_ip_adapter() def _set_ip_adapter(self): @@ -452,7 +453,7 @@ def 
prepare_latents(self, batch_size, num_channels_latents, height, width, dtype def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: PipelineImageInput = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -591,6 +592,7 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) # # Offload last model to CPU + # TODO # if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: # self.final_offload_hook.offload() From 651302b3521093223b38be432887c108561217e5 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 2 Nov 2023 14:32:06 +0530 Subject: [PATCH 010/139] fix --- src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py index 102b858db433..ac08eb0d8f13 100644 --- a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py +++ b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py @@ -415,7 +415,7 @@ def encode_image(self, image, device, num_images_per_prompt): dtype = next(self.image_encoder.parameters()).dtype if not isinstance(image, torch.Tensor): - image = self.image_processor(image, return_tensors="pt").pixel_values + image = self.ip_adapter_image_processor(image, return_tensors="pt").pixel_values image = image.to(device=device, dtype=dtype) (image_embeddings,) = self.image_encoder(image).image_embeds From f10eb255db898c33e3328e4ab5c28ad7e0d2f9b2 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 2 Nov 2023 14:50:26 +0530 Subject: [PATCH 011/139] fix --- src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py index ac08eb0d8f13..18a6c934c637 100644 --- a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py +++ b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py @@ -63,6 +63,7 @@ def __init__( image_encoder: CLIPVisionModelWithProjection, ip_adapter_image_processor: CLIPImageProcessor, scheduler: KarrasDiffusionSchedulers, + _initialize_ip_adapter_modules: bool = True ): super().__init__() @@ -78,7 +79,9 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) - self._set_ip_adapter() + + if _initialize_ip_adapter_modules: + self._set_ip_adapter() def _set_ip_adapter(self): unet = self.unet From f051c9ebaf9f67159ba73ba55a37658e2fd8c735 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 2 Nov 2023 15:05:48 +0530 Subject: [PATCH 012/139] fix --- src/diffusers/pipelines/ip_adapter/image_projection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/ip_adapter/image_projection.py b/src/diffusers/pipelines/ip_adapter/image_projection.py index e46142f5716f..37077bf042d1 100644 --- a/src/diffusers/pipelines/ip_adapter/image_projection.py +++ b/src/diffusers/pipelines/ip_adapter/image_projection.py @@ -15,13 +15,14 @@ from torch import nn -from ...configuration_utils import ConfigMixin +from ...configuration_utils import ConfigMixin, register_to_config from ...models.modeling_utils import ModelMixin class 
ImageProjectionModel(ModelMixin, ConfigMixin): """Image Projection Model.""" + @register_to_config def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4): super().__init__() From 6031383d805bc8eda3cc30bc9afcf7ed14f04fda Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 2 Nov 2023 15:11:04 +0530 Subject: [PATCH 013/139] device placement --- src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py index 18a6c934c637..87105c088ea4 100644 --- a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py +++ b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py @@ -228,7 +228,7 @@ def load_ip_adapter( module if isinstance(module, nn.Module) else nn.Identity() for module in self.unet.attn_processors.values() ] - ) + ).to(dtype=self.unet.dtype, device=self.unet.device) ip_layers.load_state_dict(state_dict["ip_adapter"]) def set_scale(self, scale): From 95e38ac87a8091a06a8d76ba3429193ba2fe1e30 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 2 Nov 2023 15:19:34 +0530 Subject: [PATCH 014/139] device placement --- src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py index 87105c088ea4..482ba8349894 100644 --- a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py +++ b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py @@ -229,6 +229,7 @@ def load_ip_adapter( for module in self.unet.attn_processors.values() ] ).to(dtype=self.unet.dtype, device=self.unet.device) + print(f"UNet dtype: {self.unet.dtype} UNet device: {self.unet.device}") ip_layers.load_state_dict(state_dict["ip_adapter"]) def set_scale(self, scale): From 3d696884fbb4df2fb30505726ba844bc5289aa0b Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 2 Nov 2023 15:25:08 +0530 Subject: [PATCH 015/139] device placement fix. 
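Load the IP-Adapter attention layers first and only then move them, so the freshly loaded weights end up on the UNet's device and dtype; this also drops the temporary debug print added in the previous commit. A minimal sketch of the intended ordering, using the same names as the hunk below:

    ip_layers = torch.nn.ModuleList(
        [
            module if isinstance(module, nn.Module) else nn.Identity()
            for module in self.unet.attn_processors.values()
        ]
    )
    ip_layers.load_state_dict(state_dict["ip_adapter"])
    # device/dtype placement happens after the state dict is loaded
    ip_layers.to(device=self.unet.device, dtype=self.unet.dtype)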
--- src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py index 482ba8349894..9f6893ae7ae6 100644 --- a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py +++ b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py @@ -228,9 +228,9 @@ def load_ip_adapter( module if isinstance(module, nn.Module) else nn.Identity() for module in self.unet.attn_processors.values() ] - ).to(dtype=self.unet.dtype, device=self.unet.device) - print(f"UNet dtype: {self.unet.dtype} UNet device: {self.unet.device}") + ) ip_layers.load_state_dict(state_dict["ip_adapter"]) + ip_layers.to(device=self.unet.device, dtype=self.unet.dtype) def set_scale(self, scale): for attn_processor in self.unet.attn_processors.values(): From 351180f7d0e11dca87ad58c9344c56df257dc51e Mon Sep 17 00:00:00 2001 From: okotaku Date: Fri, 3 Nov 2023 04:29:12 +0000 Subject: [PATCH 016/139] fix import --- src/diffusers/__init__.py | 3 ++- src/diffusers/pipelines/__init__.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 5bcf2255f403..46dcf1757c39 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -122,7 +122,6 @@ "PNDMPipeline", "RePaintPipeline", "ScoreSdeVePipeline", - "StableDiffusionIPAdapterPipeline", ] ) _import_structure["schedulers"].extend( @@ -214,6 +213,7 @@ "IFPipeline", "IFSuperResolutionPipeline", "ImageTextPipelineOutput", + "StableDiffusionIPAdapterPipeline", "KandinskyCombinedPipeline", "KandinskyImg2ImgCombinedPipeline", "KandinskyImg2ImgPipeline", @@ -558,6 +558,7 @@ IFPipeline, IFSuperResolutionPipeline, ImageTextPipelineOutput, + StableDiffusionIPAdapterPipeline, KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, KandinskyImg2ImgPipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index c48cb0cf9d25..08189f04e984 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -319,6 +319,7 @@ IFPipeline, IFSuperResolutionPipeline, ) + from .ip_adapter import StableDiffusionIPAdapterPipeline from .kandinsky import ( KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, From 2e83d6c1b101efe27af941d7229280d837dd7425 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 3 Nov 2023 13:57:53 +0530 Subject: [PATCH 017/139] composable ip adapter module --- src/diffusers/__init__.py | 4 +- src/diffusers/models/attention_processor.py | 408 +++++++++++++++++ src/diffusers/pipelines/__init__.py | 4 +- .../ip_adapter/attention_processor.py | 429 ------------------ .../pipelines/ip_adapter/image_projection.py | 3 +- .../ip_adapter/pipeline_ip_adapter.py | 49 +- 6 files changed, 446 insertions(+), 451 deletions(-) delete mode 100644 src/diffusers/pipelines/ip_adapter/attention_processor.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 46dcf1757c39..6368838ec2ba 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -213,7 +213,6 @@ "IFPipeline", "IFSuperResolutionPipeline", "ImageTextPipelineOutput", - "StableDiffusionIPAdapterPipeline", "KandinskyCombinedPipeline", "KandinskyImg2ImgCombinedPipeline", "KandinskyImg2ImgPipeline", @@ -253,6 +252,7 @@ "StableDiffusionInpaintPipeline", "StableDiffusionInpaintPipelineLegacy", "StableDiffusionInstructPix2PixPipeline", + "StableDiffusionIPAdapterPipeline", 
"StableDiffusionLatentUpscalePipeline", "StableDiffusionLDM3DPipeline", "StableDiffusionModelEditingPipeline", @@ -558,7 +558,6 @@ IFPipeline, IFSuperResolutionPipeline, ImageTextPipelineOutput, - StableDiffusionIPAdapterPipeline, KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, KandinskyImg2ImgPipeline, @@ -598,6 +597,7 @@ StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy, StableDiffusionInstructPix2PixPipeline, + StableDiffusionIPAdapterPipeline, StableDiffusionLatentUpscalePipeline, StableDiffusionLDM3DPipeline, StableDiffusionModelEditingPipeline, diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index efed305a0e96..5ae3b4997672 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -1969,6 +1969,412 @@ def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, *args, **k return attn.processor(attn, hidden_states, *args, **kwargs) +class IPAdapterAttnProcessor(nn.Module): + r""" + Attention processor for IP-Adapater. + + Args: + hidden_size (`int`): + The hidden size of the attention layer. + cross_attention_dim (`int`): + The number of channels in the `encoder_hidden_states`. + text_context_len (`int`, defaults to 77): + The context length of the text features. + scale (`float`, defaults to 1.0): + the weight scale of image prompt. + """ + + def __init__(self, hidden_size, cross_attention_dim=None, text_context_len=77, scale=1.0): + super().__init__() + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.text_context_len = text_context_len + self.scale = scale + + self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + temb=None, + scale=1.0, + ): + if scale != 1.0: + logger.warning("`scale` of IPAttnProcessor should be set with `IPAdapterPipeline.set_scale`.") + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + # split hidden states + encoder_hidden_states, ip_hidden_states = ( + encoder_hidden_states[:, : self.text_context_len, :], + encoder_hidden_states[:, self.text_context_len :, :], + ) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = 
attn.batch_to_head_dim(hidden_states) + + # for ip-adapter + ip_key = self.to_k_ip(ip_hidden_states) + ip_value = self.to_v_ip(ip_hidden_states) + + ip_key = attn.head_to_batch_dim(ip_key) + ip_value = attn.head_to_batch_dim(ip_value) + + ip_attention_probs = attn.get_attention_scores(query, ip_key, None) + ip_hidden_states = torch.bmm(ip_attention_probs, ip_value) + ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states) + + hidden_states = hidden_states + self.scale * ip_hidden_states + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +class IPAdapterAttnProcessor2_0(torch.nn.Module): + r""" + Attention processor for IP-Adapater for PyTorch 2.0. + + Args: + hidden_size (`int`): + The hidden size of the attention layer. + cross_attention_dim (`int`): + The number of channels in the `encoder_hidden_states`. + text_context_len (`int`, defaults to 77): + The context length of the text features. + scale (`float`, defaults to 1.0): + the weight scale of image prompt. + """ + + def __init__(self, hidden_size, cross_attention_dim=None, text_context_len=77, scale=1.0): + super().__init__() + + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError( + f"{self.__class__.__name__} requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0." + ) + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.text_context_len = text_context_len + self.scale = scale + + self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + temb=None, + scale=1.0, + ): + if scale != 1.0: + logger.warning("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + # split hidden states + encoder_hidden_states, ip_hidden_states = ( + encoder_hidden_states[:, : self.text_context_len, :], + encoder_hidden_states[:, self.text_context_len :, :], + ) + + key = 
attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + # for ip-adapter + ip_key = self.to_k_ip(ip_hidden_states) + ip_value = self.to_v_ip(ip_hidden_states) + + ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + ip_hidden_states = F.scaled_dot_product_attention( + query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False + ) + + ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + ip_hidden_states = ip_hidden_states.to(query.dtype) + + hidden_states = hidden_states + self.scale * ip_hidden_states + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +class IPAdapterControlNetAttnProcessor: + r""" + Default processor for performing attention-related computations. 
+ """ + + def __init__(self, text_context_len=77): + self.text_context_len = text_context_len + + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + temb=None, + scale=1.0, + ): + if scale != 1.0: + logger.warning("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = encoder_hidden_states[:, : self.text_context_len] # only use text + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +class IPAdapterControlNetAttnProcessor2_0: + r""" + Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). + """ + + def __init__(self, text_context_len=77): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError( + f"{self.__class__.__name__} requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0." 
+ ) + self.text_context_len = text_context_len + + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + temb=None, + scale=1.0, + ): + if scale != 1.0: + logger.warning("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = encoder_hidden_states[:, : self.text_context_len] + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + LORA_ATTENTION_PROCESSORS = ( LoRAAttnProcessor, LoRAAttnProcessor2_0, @@ -1992,6 +2398,8 @@ def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, *args, **k LoRAAttnProcessor, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor, + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, ) AttentionProcessor = Union[ diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 08189f04e984..f164250c1980 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -89,9 +89,7 @@ "IFPipeline", "IFSuperResolutionPipeline", ] - _import_structure["ip_adapter"] = [ - "StableDiffusionIPAdapterPipeline", - ] + _import_structure["ip_adapter"] = ["StableDiffusionIPAdapterPipeline"] _import_structure["kandinsky"] = [ "KandinskyCombinedPipeline", 
"KandinskyImg2ImgCombinedPipeline", diff --git a/src/diffusers/pipelines/ip_adapter/attention_processor.py b/src/diffusers/pipelines/ip_adapter/attention_processor.py deleted file mode 100644 index cdff79694e9f..000000000000 --- a/src/diffusers/pipelines/ip_adapter/attention_processor.py +++ /dev/null @@ -1,429 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from ...utils import logging - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class IPAttnProcessor(nn.Module): - r""" - Attention processor for IP-Adapater. - - Args: - hidden_size (`int`): - The hidden size of the attention layer. - cross_attention_dim (`int`): - The number of channels in the `encoder_hidden_states`. - text_context_len (`int`, defaults to 77): - The context length of the text features. - scale (`float`, defaults to 1.0): - the weight scale of image prompt. - """ - - def __init__(self, hidden_size, cross_attention_dim=None, text_context_len=77, scale=1.0): - super().__init__() - - self.hidden_size = hidden_size - self.cross_attention_dim = cross_attention_dim - self.text_context_len = text_context_len - self.scale = scale - - self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) - self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) - - def __call__( - self, - attn, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - temb=None, - scale=1.0, - ): - if scale != 1.0: - logger.warning("`scale` of IPAttnProcessor should be set with `IPAdapterPipeline.set_scale`.") - residual = hidden_states - - if attn.spatial_norm is not None: - hidden_states = attn.spatial_norm(hidden_states, temb) - - input_ndim = hidden_states.ndim - - if input_ndim == 4: - batch_size, channel, height, width = hidden_states.shape - hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) - - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - - if attn.group_norm is not None: - hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) - - query = attn.to_q(hidden_states) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - # split hidden states - encoder_hidden_states, ip_hidden_states = ( - encoder_hidden_states[:, : self.text_context_len, :], - encoder_hidden_states[:, self.text_context_len :, :], - ) - - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - attention_probs = 
attn.get_attention_scores(query, key, attention_mask) - hidden_states = torch.bmm(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - - # for ip-adapter - ip_key = self.to_k_ip(ip_hidden_states) - ip_value = self.to_v_ip(ip_hidden_states) - - ip_key = attn.head_to_batch_dim(ip_key) - ip_value = attn.head_to_batch_dim(ip_value) - - ip_attention_probs = attn.get_attention_scores(query, ip_key, None) - ip_hidden_states = torch.bmm(ip_attention_probs, ip_value) - ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states) - - hidden_states = hidden_states + self.scale * ip_hidden_states - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - if input_ndim == 4: - hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) - - if attn.residual_connection: - hidden_states = hidden_states + residual - - hidden_states = hidden_states / attn.rescale_output_factor - - return hidden_states - - -class IPAttnProcessor2_0(torch.nn.Module): - r""" - Attention processor for IP-Adapater for PyTorch 2.0. - - Args: - hidden_size (`int`): - The hidden size of the attention layer. - cross_attention_dim (`int`): - The number of channels in the `encoder_hidden_states`. - text_context_len (`int`, defaults to 77): - The context length of the text features. - scale (`float`, defaults to 1.0): - the weight scale of image prompt. - """ - - def __init__(self, hidden_size, cross_attention_dim=None, text_context_len=77, scale=1.0): - super().__init__() - - if not hasattr(F, "scaled_dot_product_attention"): - raise ImportError( - f"{self.__class__.__name__} requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0." - ) - - self.hidden_size = hidden_size - self.cross_attention_dim = cross_attention_dim - self.text_context_len = text_context_len - self.scale = scale - - self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) - self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) - - def __call__( - self, - attn, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - temb=None, - scale=1.0, - ): - if scale != 1.0: - logger.warning("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") - residual = hidden_states - - if attn.spatial_norm is not None: - hidden_states = attn.spatial_norm(hidden_states, temb) - - input_ndim = hidden_states.ndim - - if input_ndim == 4: - batch_size, channel, height, width = hidden_states.shape - hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) - - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - - if attention_mask is not None: - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - # scaled_dot_product_attention expects attention_mask shape to be - # (batch, heads, source_length, target_length) - attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) - - if attn.group_norm is not None: - hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) - - query = attn.to_q(hidden_states) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - # split hidden states - encoder_hidden_states, ip_hidden_states = ( - 
encoder_hidden_states[:, : self.text_context_len, :], - encoder_hidden_states[:, self.text_context_len :, :], - ) - - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - inner_dim = key.shape[-1] - head_dim = inner_dim // attn.heads - - query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - - key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - - # the output of sdp = (batch, num_heads, seq_len, head_dim) - # TODO: add support for attn.scale when we move to Torch 2.1 - hidden_states = F.scaled_dot_product_attention( - query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False - ) - - hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) - hidden_states = hidden_states.to(query.dtype) - - # for ip-adapter - ip_key = self.to_k_ip(ip_hidden_states) - ip_value = self.to_v_ip(ip_hidden_states) - - ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - - # the output of sdp = (batch, num_heads, seq_len, head_dim) - # TODO: add support for attn.scale when we move to Torch 2.1 - ip_hidden_states = F.scaled_dot_product_attention( - query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False - ) - - ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) - ip_hidden_states = ip_hidden_states.to(query.dtype) - - hidden_states = hidden_states + self.scale * ip_hidden_states - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - if input_ndim == 4: - hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) - - if attn.residual_connection: - hidden_states = hidden_states + residual - - hidden_states = hidden_states / attn.rescale_output_factor - - return hidden_states - - -## for ControlNet -class CNAttnProcessor: - r""" - Default processor for performing attention-related computations. 
- """ - - def __init__(self, text_context_len=77): - self.text_context_len = text_context_len - - def __call__( - self, - attn, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - temb=None, - scale=1.0, - ): - if scale != 1.0: - logger.warning("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") - residual = hidden_states - - if attn.spatial_norm is not None: - hidden_states = attn.spatial_norm(hidden_states, temb) - - input_ndim = hidden_states.ndim - - if input_ndim == 4: - batch_size, channel, height, width = hidden_states.shape - hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) - - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - - if attn.group_norm is not None: - hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) - - query = attn.to_q(hidden_states) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = encoder_hidden_states[:, : self.text_context_len] # only use text - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - attention_probs = attn.get_attention_scores(query, key, attention_mask) - hidden_states = torch.bmm(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - if input_ndim == 4: - hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) - - if attn.residual_connection: - hidden_states = hidden_states + residual - - hidden_states = hidden_states / attn.rescale_output_factor - - return hidden_states - - -class CNAttnProcessor2_0: - r""" - Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). - """ - - def __init__(self, text_context_len=77): - if not hasattr(F, "scaled_dot_product_attention"): - raise ImportError( - f"{self.__class__.__name__} requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0." 
- ) - self.text_context_len = text_context_len - - def __call__( - self, - attn, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - temb=None, - scale=1.0, - ): - if scale != 1.0: - logger.warning("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") - residual = hidden_states - - if attn.spatial_norm is not None: - hidden_states = attn.spatial_norm(hidden_states, temb) - - input_ndim = hidden_states.ndim - - if input_ndim == 4: - batch_size, channel, height, width = hidden_states.shape - hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) - - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - - if attention_mask is not None: - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - # scaled_dot_product_attention expects attention_mask shape to be - # (batch, heads, source_length, target_length) - attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) - - if attn.group_norm is not None: - hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) - - query = attn.to_q(hidden_states) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = encoder_hidden_states[:, : self.text_context_len] - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - inner_dim = key.shape[-1] - head_dim = inner_dim // attn.heads - - query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - - key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - - # the output of sdp = (batch, num_heads, seq_len, head_dim) - # TODO: add support for attn.scale when we move to Torch 2.1 - hidden_states = F.scaled_dot_product_attention( - query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False - ) - - hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) - hidden_states = hidden_states.to(query.dtype) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - if input_ndim == 4: - hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) - - if attn.residual_connection: - hidden_states = hidden_states + residual - - hidden_states = hidden_states / attn.rescale_output_factor - - return hidden_states diff --git a/src/diffusers/pipelines/ip_adapter/image_projection.py b/src/diffusers/pipelines/ip_adapter/image_projection.py index 37077bf042d1..e90b95214d24 100644 --- a/src/diffusers/pipelines/ip_adapter/image_projection.py +++ b/src/diffusers/pipelines/ip_adapter/image_projection.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- from torch import nn from ...configuration_utils import ConfigMixin, register_to_config @@ -23,7 +22,7 @@ class ImageProjectionModel(ModelMixin, ConfigMixin): """Image Projection Model.""" @register_to_config - def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4): + def __init__(self, cross_attention_dim=768, clip_embeddings_dim=1024, clip_extra_context_tokens=4): super().__init__() self.cross_attention_dim = cross_attention_dim diff --git a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py index 9f6893ae7ae6..cce7a6c2a5a9 100644 --- a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py +++ b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py @@ -23,14 +23,18 @@ from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel -from ...models.attention_processor import AttnProcessor, AttnProcessor2_0 +from ...models.attention_processor import ( + AttnProcessor, + AttnProcessor2_0, + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, +) from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import DIFFUSERS_CACHE, HF_HUB_OFFLINE, _get_model_file, logging from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion import StableDiffusionPipelineOutput -from .attention_processor import IPAttnProcessor, IPAttnProcessor2_0 from .image_projection import ImageProjectionModel @@ -59,11 +63,9 @@ def __init__( text_encoder: CLIPTextModel, tokenizer: CLIPTokenizer, unet: UNet2DConditionModel, - image_projection: ImageProjectionModel, - image_encoder: CLIPVisionModelWithProjection, ip_adapter_image_processor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection, scheduler: KarrasDiffusionSchedulers, - _initialize_ip_adapter_modules: bool = True ): super().__init__() @@ -74,14 +76,10 @@ def __init__( text_encoder=text_encoder, image_encoder=image_encoder, ip_adapter_image_processor=ip_adapter_image_processor, - image_projection=image_projection, scheduler=scheduler, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) - - if _initialize_ip_adapter_modules: - self._set_ip_adapter() def _set_ip_adapter(self): unet = self.unet @@ -103,11 +101,12 @@ def _set_ip_adapter(self): attn_procs[name] = attn_processor_class() else: attn_processor_class = ( - IPAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAttnProcessor + IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor ) attn_procs[name] = attn_processor_class( hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0 - ) + ).to(dtype=unet.dtype, device=unet.device) + unet.set_attn_processor(attn_procs) # TODO: create a separate pipeline for this: `StableDiffusionControlNetIPAdapterPipeline`. @@ -189,8 +188,9 @@ def load_ip_adapter( subfolder (`str`, *optional*, defaults to `""`): The subfolder location of a model file within a larger model repository on the Hub or locally. """ - # Load the main state dict first which has the LoRA layers for either of - # UNet and text encoder or both. 
+ self._set_ip_adapter() + + # Load the main state dict first/ cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) @@ -223,6 +223,22 @@ def load_ip_adapter( state_dict = torch.load(model_file, map_location="cpu") else: state_dict = pretrained_model_name_or_path_or_dict + + keys = list(state_dict.keys()) + if keys != ["image_proj", "ip_adapter"]: + raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing.") + + # Handle image projection layers. + clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1] + cross_attention_dim = state_dict["image_proj"]["proj.weight"].shape[0] // 4 + image_projection = ImageProjectionModel( + cross_attention_dim=cross_attention_dim, clip_embeddings_dim=clip_embeddings_dim + ) + image_projection.to(dtype=self.unet.dtype, device=self.unet.device) + image_projection.load_state_dict(state_dict["image_proj"]) + self.image_projection = image_projection + + # Handle IP-Adapter cross-attention layers. ip_layers = torch.nn.ModuleList( [ module if isinstance(module, nn.Module) else nn.Identity() @@ -230,11 +246,10 @@ def load_ip_adapter( ] ) ip_layers.load_state_dict(state_dict["ip_adapter"]) - ip_layers.to(device=self.unet.device, dtype=self.unet.dtype) def set_scale(self, scale): for attn_processor in self.unet.attn_processors.values(): - if isinstance(attn_processor, (IPAttnProcessor, IPAttnProcessor2_0)): + if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): attn_processor.scale = scale # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -481,6 +496,10 @@ def __call__( self.set_scale(ip_adapter_scale) # 1. Check inputs and raise error if needed. + if hasattr(self, "image_projection") and getattr(self, "image_projection") is None: + raise ( + "This pipeline cannot be called without having an `image_projection` module. Did you call `load_ip_adapter()` before running the pipeline?" + ) # TODO # 1. Define call parameters From 1d64cb847f968bf2dec8943069a5f7bfbe48ee01 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 7 Nov 2023 19:11:15 +0000 Subject: [PATCH 018/139] add image_encoder to sd as optional components --- .../pipeline_stable_diffusion.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 3b86da9ad54a..b7b5e4e38499 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -17,7 +17,7 @@ import torch from packaging import version -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor @@ -103,7 +103,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
""" model_cpu_offload_seq = "text_encoder->unet->vae" - _optional_components = ["safety_checker", "feature_extractor"] + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] def __init__( @@ -111,6 +111,7 @@ def __init__( vae: AutoencoderKL, text_encoder: CLIPTextModel, tokenizer: CLIPTokenizer, + image_encoder: CLIPVisionModelWithProjection, unet: UNet2DConditionModel, scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, @@ -191,6 +192,7 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, + image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) @@ -438,6 +440,19 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds + def encode_image(self, image, device, num_images_per_prompt): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + uncond_image_embeds = torch.zeros_like(image_embeds) + return image_embeds, uncond_image_embeds + def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: has_nsfw_concept = None From 70fae5c28871eb9ebed15b361c11209f3faa0938 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 7 Nov 2023 22:21:02 +0000 Subject: [PATCH 019/139] add image_prompt arg --- .../stable_diffusion/pipeline_stable_diffusion.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index b7b5e4e38499..cbc62562055c 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -20,7 +20,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...configuration_utils import FrozenDict -from ...image_processor import VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder @@ -590,6 +590,7 @@ def disable_freeu(self): def __call__( self, prompt: Union[str, List[str]] = None, + image_prompt: PipelineImageInput = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -720,6 +721,11 @@ def __call__( # to avoid doing two forward passes if do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if image_prompt is not None: + image_embeds, negative_image_embeds = self.image_encoder(image, device, num_images_per_prompt) + if do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) # 4. 
Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) @@ -749,12 +755,16 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + if image_prompt is not None: + added_cond_kwags = {"image_embeds": image_embeds} + # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] From 3aaaa236c056f6b7b22f2bf969b667962bd640fd Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 8 Nov 2023 00:42:56 +0000 Subject: [PATCH 020/139] move image_projection to unet, refactor --- src/diffusers/models/unet_2d_condition.py | 7 + .../pipeline_stable_diffusion.py | 189 +++++++++++++++++- 2 files changed, 194 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index f248b243f376..65bd17d08ce6 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -1022,6 +1022,13 @@ def forward( ) image_embeds = added_cond_kwargs.get("image_embeds") encoder_hidden_states = self.encoder_hid_proj(image_embeds) + elif self.encoder_hid_proj is not None and any("to_k_ip" in k for k in self.state_dict().keys()): + + image_embeds = added_cond_kwargs.get("image_embeds") + image_embeds = image_embeds.to(encoder_hidden_states.dtype) + image_embeds = self.encoder_hid_proj(image_embeds) + encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1) + # 2. pre-process sample = self.conv_in(sample) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index cbc62562055c..a570cb4d1b7a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -39,6 +39,20 @@ from .safety_checker import StableDiffusionSafetyChecker +## ip-adapter related imports +from ...models.embeddings import ImageProjection +import torch.nn.functional as F + +from ...models.attention_processor import ( + AttnProcessor, + AttnProcessor2_0, + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, +) + +from ...utils import DIFFUSERS_CACHE, HF_HUB_OFFLINE, _get_model_file +from torch import nn + logger = logging.get_logger(__name__) # pylint: disable=invalid-name EXAMPLE_DOC_STRING = """ @@ -723,7 +737,7 @@ def __call__( prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) if image_prompt is not None: - image_embeds, negative_image_embeds = self.image_encoder(image, device, num_images_per_prompt) + image_embeds, negative_image_embeds = self.encode_image(image_prompt, device, num_images_per_prompt) if do_classifier_free_guidance: image_embeds = torch.cat([negative_image_embeds, image_embeds]) @@ -756,7 +770,7 @@ def __call__( latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) if image_prompt is not None: - added_cond_kwags = {"image_embeds": image_embeds} + added_cond_kwargs = {"image_embeds": image_embeds} # predict the noise residual noise_pred = self.unet( @@ -808,3 +822,174 @@ def __call__( return (image, has_nsfw_concept) return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) + + # yiyi notes and todo: put here for testing for now, make it 
mixin later + def _set_ip_adapter(self): + unet = self.unet + attn_procs = {} + for name in unet.attn_processors.keys(): + cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim + if name.startswith("mid_block"): + hidden_size = unet.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = unet.config.block_out_channels[block_id] + if cross_attention_dim is None: + attn_processor_class = ( + AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor + ) + attn_procs[name] = attn_processor_class() + else: + attn_processor_class = ( + IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor + ) + attn_procs[name] = attn_processor_class( + hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0 + ).to(dtype=unet.dtype, device=unet.device) + + unet.set_attn_processor(attn_procs) + + # TODO: create a separate pipeline for this: `StableDiffusionControlNetIPAdapterPipeline`. + # if hasattr(self.pipeline, "controlnet"): + # attn_processor_class = ( + # CNAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else CNAttnProcessor + # ) + # self.pipeline.controlnet.set_attn_processor(attn_processor_class()) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + # yiyi notes and todo: put here for testing for now, make it mixin later + def load_ip_adapter( + self, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], + **kwargs, + ): + """ + Parameters: + pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): + Can be either: + + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved + with [`ModelMixin.save_pretrained`]. + - A [torch state + dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). + + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. 
+ use_auth_token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + subfolder (`str`, *optional*, defaults to `""`): + The subfolder location of a model file within a larger model repository on the Hub or locally. + """ + self._set_ip_adapter() + + # Load the main state dict first/ + cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) + weight_name = kwargs.pop("weight_name", None) + + user_agent = { + "file_type": "attn_procs_weights", + "framework": "pytorch", + } + + if not isinstance(pretrained_model_name_or_path_or_dict, dict): + model_file = _get_model_file( + pretrained_model_name_or_path_or_dict, + weights_name=weight_name, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + ) + state_dict = torch.load(model_file, map_location="cpu") + else: + state_dict = pretrained_model_name_or_path_or_dict + + keys = list(state_dict.keys()) + if keys != ["image_proj", "ip_adapter"]: + raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing.") + + # Handle image projection layers. + clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1] + cross_attention_dim = state_dict["image_proj"]["proj.weight"].shape[0] // 4 + image_projection = ImageProjection( + cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim, num_image_text_embeds=4 + ) + image_projection.to(dtype=self.unet.dtype, device=self.unet.device) + + # yiyi notes: move this section to conversion script! + diffusers_state_dict = {} + + diffusers_state_dict.update( + { + "image_embeds.weight": state_dict["image_proj"]["proj.weight"], + "image_embeds.bias": state_dict["image_proj"]["proj.bias"], + "norm.weight": state_dict["image_proj"]["norm.weight"], + "norm.bias": state_dict["image_proj"]["norm.bias"], + } + ) + + image_projection.load_state_dict(diffusers_state_dict) + # yiyi notes: this actually changes the unet config, have to refactor! + self.unet.encoder_hid_proj = image_projection + self.unet.encoder_hid_proj.to(self.unet.device, self.unet.dtype) + + # Handle IP-Adapter cross-attention layers. 
+ ip_layers = torch.nn.ModuleList( + [ + module if isinstance(module, nn.Module) else nn.Identity() + for module in self.unet.attn_processors.values() + ] + ) + ip_layers.load_state_dict(state_dict["ip_adapter"]) + + # yiyi notes and todo: put here for testing for now, make it mixin later + def set_scale(self, scale): + for attn_processor in self.unet.attn_processors.values(): + if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): + attn_processor.scale = scale + + +def assert_param_count(model_1, model_2): + count_1 = sum(p.numel() for p in model_1.parameters()) + count_2 = sum(p.numel() for p in model_2.parameters()) + assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}" From 2154d0190c2eb78a1f6be02e2b93511f2b10a721 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 8 Nov 2023 01:50:39 +0000 Subject: [PATCH 021/139] update comments --- .../stable_diffusion/pipeline_stable_diffusion.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index a570cb4d1b7a..28dd637b2ad2 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -39,7 +39,7 @@ from .safety_checker import StableDiffusionSafetyChecker -## ip-adapter related imports +## YiYi notes: ip-adapter related imports, will move to mixin file if needed from ...models.embeddings import ImageProjection import torch.nn.functional as F @@ -951,12 +951,13 @@ def load_ip_adapter( # Handle image projection layers. clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1] cross_attention_dim = state_dict["image_proj"]["proj.weight"].shape[0] // 4 + + # yiyi notes: we use `ImageProjection` class in diffusers instead, and directly updated `unet.encoder_hid_proj` image_projection = ImageProjection( cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim, num_image_text_embeds=4 ) image_projection.to(dtype=self.unet.dtype, device=self.unet.device) - # yiyi notes: move this section to conversion script! diffusers_state_dict = {} diffusers_state_dict.update( @@ -969,7 +970,7 @@ def load_ip_adapter( ) image_projection.load_state_dict(diffusers_state_dict) - # yiyi notes: this actually changes the unet config, have to refactor! + # yiyi notes: this actually changes the unet config, need to refactor! self.unet.encoder_hid_proj = image_projection self.unet.encoder_hid_proj.to(self.unet.device, self.unet.dtype) @@ -987,9 +988,3 @@ def set_scale(self, scale): for attn_processor in self.unet.attn_processors.values(): if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): attn_processor.scale = scale - - -def assert_param_count(model_1, model_2): - count_1 = sum(p.numel() for p in model_1.parameters()) - count_2 = sum(p.numel() for p in model_2.parameters()) - assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}" From bc528109c54ca69c7cbf5a45101fbcc6f2a65088 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 09:39:10 +0530 Subject: [PATCH 022/139] make image_encoder default to None. 
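With `image_encoder` moved to the end of the signature and defaulting to `None`, plain Stable Diffusion checkpoints keep loading unchanged and only IP-Adapter users need to supply an encoder. A minimal sketch of both cases (the model ids and paths are illustrative, not part of this patch):

    from transformers import CLIPVisionModelWithProjection
    from diffusers import StableDiffusionPipeline

    # Regular use: no image encoder is required, the new argument stays None.
    pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

    # IP-Adapter use: pass a CLIP vision encoder explicitly (path is hypothetical).
    image_encoder = CLIPVisionModelWithProjection.from_pretrained("path/to/clip_image_encoder")
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder
    )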
--- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index d62f65ac2fa3..6a18eb604ef6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -126,11 +126,11 @@ def __init__( vae: AutoencoderKL, text_encoder: CLIPTextModel, tokenizer: CLIPTokenizer, - image_encoder: CLIPVisionModelWithProjection, unet: UNet2DConditionModel, scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, requires_safety_checker: bool = True, ): super().__init__() From eaf94bb2c915bfbcbb8ae8bfeccd53b5034aa566 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 09:49:00 +0530 Subject: [PATCH 023/139] fully delegate the image encoding logic. --- src/diffusers/models/unet_2d_condition.py | 5 ---- .../pipeline_stable_diffusion.py | 29 +++++++++++++------ 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 739470b4f51d..da1f91280a11 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -1022,11 +1022,6 @@ def forward( ) image_embeds = added_cond_kwargs.get("image_embeds") encoder_hidden_states = self.encoder_hid_proj(image_embeds) - elif self.encoder_hid_proj is not None and any("to_k_ip" in k for k in self.state_dict().keys()): - image_embeds = added_cond_kwargs.get("image_embeds") - image_embeds = image_embeds.to(encoder_hidden_states.dtype) - image_embeds = self.encoder_hid_proj(image_embeds) - encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1) # 2. pre-process sample = self.conv_in(sample) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 6a18eb604ef6..234caeef3432 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -455,7 +455,9 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds - def encode_image(self, image, device, num_images_per_prompt): + # Note (sayakpaul): Name it this way to not mess up with other functions like _encode_image() + # common in imag2image pipelines. 
+ def encode_image_ip_adapter(self, image, device, num_images_per_prompt): dtype = next(self.image_encoder.parameters()).dtype if not isinstance(image, torch.Tensor): @@ -463,10 +465,16 @@ def encode_image(self, image, device, num_images_per_prompt): image = image.to(device=device, dtype=dtype) image_embeds = self.image_encoder(image).image_embeds - image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_prompt_embeds = self.image_projection(torch.zeros_like(image_embeds)) - uncond_image_embeds = torch.zeros_like(image_embeds) - return image_embeds, uncond_image_embeds + # duplicate image embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = image_embeds.shape + image_embeds = image_embeds.repeat(1, num_images_per_prompt, 1) + image_embeds = image_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_images_per_prompt, 1) + uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + return image_embeds, uncond_image_prompt_embeds def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: @@ -794,17 +802,20 @@ def __call__( lora_scale=lora_scale, clip_skip=self.clip_skip, ) + + if image_prompt is not None: + image_embeds, negative_image_embeds = self.encode_image_ip_adapter( + image_prompt, device, num_images_per_prompt + ) + prompt_embeds = torch.cat([prompt_embeds, image_embeds], dim=1) + negative_prompt_embeds = torch.cat([negative_prompt_embeds, negative_image_embeds], dim=1) + # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - if image_prompt is not None: - image_embeds, negative_image_embeds = self.encode_image(image_prompt, device, num_images_per_prompt) - if self.do_classifier_free_guidance: - image_embeds = torch.cat([negative_image_embeds, image_embeds]) - # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps = self.scheduler.timesteps From 7cf7f707a9c7473b730271ef215ac41922353981 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 09:58:25 +0530 Subject: [PATCH 024/139] debug --- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 234caeef3432..5d2988d4fa10 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -458,12 +458,14 @@ def encode_prompt( # Note (sayakpaul): Name it this way to not mess up with other functions like _encode_image() # common in imag2image pipelines. 
def encode_image_ip_adapter(self, image, device, num_images_per_prompt): + print(f"Inside encode_image_ip_adapter: {self.image_encoder.device}, {self.image_encoder.dtype}") dtype = next(self.image_encoder.parameters()).dtype if not isinstance(image, torch.Tensor): image = self.feature_extractor(image, return_tensors="pt").pixel_values image = image.to(device=device, dtype=dtype) + print(f"Inside encode_image_ip_adapter: {image.device}, {image.dtype}") image_embeds = self.image_encoder(image).image_embeds uncond_image_prompt_embeds = self.image_projection(torch.zeros_like(image_embeds)) From 03e2961c4b4fbe654b4e5e5ad4be9facab717a63 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 10:03:12 +0530 Subject: [PATCH 025/139] fix --- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 5d2988d4fa10..73529f036100 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -1059,8 +1059,7 @@ def load_ip_adapter( image_projection.load_state_dict(diffusers_state_dict) # yiyi notes: this actually changes the unet config, need to refactor! - self.unet.encoder_hid_proj = image_projection - self.unet.encoder_hid_proj.to(self.unet.device, self.unet.dtype) + self.image_projection = image_projection.to(device=self.unet.device, dtype=self.unet.dtype) # Handle IP-Adapter cross-attention layers. ip_layers = torch.nn.ModuleList( From 982a557ee9d91ba407e014bf660f622af816dc51 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 10:05:35 +0530 Subject: [PATCH 026/139] fix --- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 73529f036100..e1c3bb19793a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -211,6 +211,7 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.image_projection = None self.register_to_config(requires_safety_checker=requires_safety_checker) def enable_vae_slicing(self): From 6059099200572e6b8de92b6be110107c8756d261 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 10:07:43 +0530 Subject: [PATCH 027/139] fix: --- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index e1c3bb19793a..03ce6720935b 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -211,7 +211,6 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - self.image_projection = None self.register_to_config(requires_safety_checker=requires_safety_checker) def enable_vae_slicing(self): @@ -459,15 
+458,14 @@ def encode_prompt( # Note (sayakpaul): Name it this way to not mess up with other functions like _encode_image() # common in imag2image pipelines. def encode_image_ip_adapter(self, image, device, num_images_per_prompt): - print(f"Inside encode_image_ip_adapter: {self.image_encoder.device}, {self.image_encoder.dtype}") dtype = next(self.image_encoder.parameters()).dtype if not isinstance(image, torch.Tensor): image = self.feature_extractor(image, return_tensors="pt").pixel_values image = image.to(device=device, dtype=dtype) - print(f"Inside encode_image_ip_adapter: {image.device}, {image.dtype}") image_embeds = self.image_encoder(image).image_embeds + image_embeds = self.image_projection(image_embeds) uncond_image_prompt_embeds = self.image_projection(torch.zeros_like(image_embeds)) # duplicate image embeddings for each generation per prompt, using mps friendly method From c56503b4a87f013f6078359d363282bf48cacc5d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 10:17:10 +0530 Subject: [PATCH 028/139] fix --- .../pipeline_stable_diffusion.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 03ce6720935b..5538b9a55bd8 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -465,17 +465,19 @@ def encode_image_ip_adapter(self, image, device, num_images_per_prompt): image = image.to(device=device, dtype=dtype) image_embeds = self.image_encoder(image).image_embeds - image_embeds = self.image_projection(image_embeds) - uncond_image_prompt_embeds = self.image_projection(torch.zeros_like(image_embeds)) + projected_image_embeds = self.image_projection(image_embeds) + uncond_projected_image_embeds = self.image_projection(torch.zeros_like(image_embeds)) # duplicate image embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = image_embeds.shape - image_embeds = image_embeds.repeat(1, num_images_per_prompt, 1) - image_embeds = image_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_images_per_prompt, 1) - uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + bs_embed, seq_len, _ = projected_image_embeds.shape + projected_image_embeds = projected_image_embeds.repeat(1, num_images_per_prompt, 1) + projected_image_embeds = projected_image_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + uncond_projected_image_embeds.repeat(1, num_images_per_prompt, 1) + uncond_projected_image_embeds = uncond_projected_image_embeds.view( + bs_embed * num_images_per_prompt, seq_len, -1 + ) - return image_embeds, uncond_image_prompt_embeds + return projected_image_embeds, uncond_projected_image_embeds def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: From 59c933ae35859f79d0ce41a3e84c2df2f16bf632 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 10:55:04 +0530 Subject: [PATCH 029/139] separate the loacder. 
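This moves the IP-Adapter plumbing out of the pipeline file and into an `IPAdapterMixin` in `loaders.py`, alongside the LoRA and textual-inversion loaders, so any pipeline that inherits the mixin gains `load_ip_adapter`, `set_ip_adapter` and `set_ip_adapter_scale`. End to end, the user-facing flow then looks roughly like the sketch below; the repository paths, weight file name and reference image URL are placeholders, not values defined in this series:

    import torch
    from transformers import CLIPVisionModelWithProjection
    from diffusers import StableDiffusionPipeline
    from diffusers.utils import load_image

    # Hypothetical locations of the CLIP image encoder and of the converted
    # {"image_proj": ..., "ip_adapter": ...} state dict expected by load_ip_adapter.
    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
        "path/to/image_encoder", torch_dtype=torch.float16
    )
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, torch_dtype=torch.float16
    ).to("cuda")

    # From IPAdapterMixin: installs the IP-Adapter attention processors on the UNet,
    # loads "image_proj" into an ImageProjection module and "ip_adapter" into the
    # new to_k_ip / to_v_ip layers.
    pipe.load_ip_adapter("path/to/ip_adapter_repo", weight_name="pytorch_ip_adapter_weights.bin")
    pipe.set_ip_adapter_scale(0.7)

    reference = load_image("https://example.com/reference.png")
    image = pipe(
        prompt="a photo in the style of the reference",
        image_prompt=reference,
        num_inference_steps=50,
    ).images[0]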
--- src/diffusers/loaders.py | 225 ++++++++++++++++++ .../pipeline_stable_diffusion.py | 186 +-------------- 2 files changed, 230 insertions(+), 181 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 2fa1c61fd809..b20761ca8c94 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -22,11 +22,21 @@ import requests import safetensors import torch +import torch.nn.functional as F from huggingface_hub import hf_hub_download, model_info from packaging import version from torch import nn from . import __version__ +from .models.attention_processor import ( + AttnProcessor, + AttnProcessor2_0, + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, + IPAdapterControlNetAttnProcessor, + IPAdapterControlNetAttnProcessor2_0, +) +from .models.embeddings import ImageProjection from .models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta from .utils import ( DIFFUSERS_CACHE, @@ -72,6 +82,9 @@ CUSTOM_DIFFUSION_WEIGHT_NAME = "pytorch_custom_diffusion_weights.bin" CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE = "pytorch_custom_diffusion_weights.safetensors" +IP_ADAPTER_WEIGHT_NAME = "pytorch_ip_adapter_weights.bin" +IP_ADAPTER_WEIGHT_NAME_SAFE = "pytorch_ip_adapter_weights.safetensors" + LORA_DEPRECATION_MESSAGE = "You are using an old version of LoRA backend. This will be deprecated in the next releases in favor of PEFT make sure to install the latest PEFT and transformers packages in the future." @@ -3329,3 +3342,215 @@ def _remove_text_encoder_monkey_patch(self): else: self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder) self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder_2) + + +class IPAdapterMixin: + """Mixin for handling IP Adapters.""" + + def set_ip_adapter(self): + unet = self.unet + attn_procs = {} + for name in unet.attn_processors.keys(): + cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim + if name.startswith("mid_block"): + hidden_size = unet.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = unet.config.block_out_channels[block_id] + if cross_attention_dim is None: + attn_processor_class = ( + AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor + ) + attn_procs[name] = attn_processor_class() + else: + attn_processor_class = ( + IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor + ) + attn_procs[name] = attn_processor_class( + hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0 + ).to(dtype=unet.dtype, device=unet.device) + + unet.set_attn_processor(attn_procs) + + if hasattr(self, "controlnet"): + attn_processor_class = ( + IPAdapterControlNetAttnProcessor2_0 + if hasattr(F, "scaled_dot_product_attention") + else IPAdapterControlNetAttnProcessor + ) + self.pipeline.controlnet.set_attn_processor(attn_processor_class()) + + def load_ip_adapter( + self, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], + **kwargs, + ): + """ + Parameters: + pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): + Can be either: + + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. 
+ - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved + with [`ModelMixin.save_pretrained`]. + - A [torch state + dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). + + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + use_auth_token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + subfolder (`str`, *optional*, defaults to `""`): + The subfolder location of a model file within a larger model repository on the Hub or locally. + """ + if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None: + raise ValueError("`image_encoder` cannot be None when using IP Adapters.") + + self.set_ip_adapter() + + # Load the main state dict first. + cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) + weight_name = kwargs.pop("weight_name", None) + # TODO (sayakpaul): incorporate safetensors + + user_agent = { + "file_type": "attn_procs_weights", + "framework": "pytorch", + } + + if not isinstance(pretrained_model_name_or_path_or_dict, dict): + model_file = _get_model_file( + pretrained_model_name_or_path_or_dict, + weights_name=weight_name, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + ) + state_dict = torch.load(model_file, map_location="cpu") + else: + state_dict = pretrained_model_name_or_path_or_dict + + keys = list(state_dict.keys()) + if keys != ["image_proj", "ip_adapter"]: + raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing from the state dict.") + + # Handle image projection layers. 
+ clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1] + cross_attention_dim = state_dict["image_proj"]["proj.weight"].shape[0] // 4 + + image_projection = ImageProjection( + cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim, num_image_text_embeds=4 + ) + image_projection.to(dtype=self.unet.dtype, device=self.unet.device) + + diffusers_state_dict = {} + + diffusers_state_dict.update( + { + "image_embeds.weight": state_dict["image_proj"]["proj.weight"], + "image_embeds.bias": state_dict["image_proj"]["proj.bias"], + "norm.weight": state_dict["image_proj"]["norm.weight"], + "norm.bias": state_dict["image_proj"]["norm.bias"], + } + ) + + image_projection.load_state_dict(diffusers_state_dict) + self.image_projection = image_projection.to(device=self.unet.device, dtype=self.unet.dtype) + + # Handle IP-Adapter cross-attention layers. + ip_layers = torch.nn.ModuleList( + [ + module if isinstance(module, nn.Module) else nn.Identity() + for module in self.unet.attn_processors.values() + ] + ) + ip_layers.load_state_dict(state_dict["ip_adapter"]) + + def set_ip_adapter_scale(self, scale): + for attn_processor in self.unet.attn_processors.values(): + if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): + attn_processor.scale = scale + + @classmethod + def save_ip_adapter( + cls, + save_directory: Union[str, os.PathLike], + image_projection: Union[nn.Module, Dict[str, torch.Tensor]] = None, + crorss_attention_modules: Dict[str, torch.Tensor] = None, + weight_name: str = None, + save_function: Callable = None, + safe_serialization: bool = True, + ): + # Save the model + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + # Create a flat dictionary. + state_dict = {} + + # Populate the dictionary. + if isinstance(image_projection, nn.Module): + state_dict.update({"image_proj": image_projection.state_dict()}) + elif isinstance(image_projection, dict): + state_dict.update(image_projection) + else: + raise ValueError( + "Invalid input provided for `image_projection`. It can either be an `nn.Module` or a state dictionary." 
+ ) + + state_dict.update({"ip_adapter": crorss_attention_modules}) + + if save_function is None: + if safe_serialization: + + def save_function(weights, filename): + return safetensors.torch.save_file(weights, filename, metadata={"format": "pt"}) + + else: + save_function = torch.save + + os.makedirs(save_directory, exist_ok=True) + + if weight_name is None: + if safe_serialization: + weight_name = IP_ADAPTER_WEIGHT_NAME_SAFE + else: + weight_name = IP_ADAPTER_WEIGHT_NAME + + save_function(state_dict, os.path.join(save_directory, weight_name)) + logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}") diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 5538b9a55bd8..d209fda2e1bc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -16,31 +16,17 @@ from typing import Any, Callable, Dict, List, Optional, Union import torch -import torch.nn.functional as F from packaging import version -from torch import nn from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...configuration_utils import FrozenDict from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel -from ...models.attention_processor import ( - AttnProcessor, - AttnProcessor2_0, - IPAdapterAttnProcessor, - IPAdapterAttnProcessor2_0, -) - -## YiYi notes: ip-adapter related imports, will move to mixin file if needed -from ...models.embeddings import ImageProjection from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( - DIFFUSERS_CACHE, - HF_HUB_OFFLINE, USE_PEFT_BACKEND, - _get_model_file, deprecate, logging, replace_example_docstring, @@ -84,7 +70,9 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): return noise_cfg -class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin): +class StableDiffusionPipeline( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin +): r""" Pipeline for text-to-image generation using Stable Diffusion. 
@@ -96,6 +84,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`]): @@ -911,168 +900,3 @@ def __call__( return (image, has_nsfw_concept) return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - # yiyi notes and todo: put here for testing for now, make it mixin later - def _set_ip_adapter(self): - unet = self.unet - attn_procs = {} - for name in unet.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = unet.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(unet.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = unet.config.block_out_channels[block_id] - if cross_attention_dim is None: - attn_processor_class = ( - AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor - ) - attn_procs[name] = attn_processor_class() - else: - attn_processor_class = ( - IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor - ) - attn_procs[name] = attn_processor_class( - hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0 - ).to(dtype=unet.dtype, device=unet.device) - - unet.set_attn_processor(attn_procs) - - # TODO: create a separate pipeline for this: `StableDiffusionControlNetIPAdapterPipeline`. - # if hasattr(self.pipeline, "controlnet"): - # attn_processor_class = ( - # CNAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else CNAttnProcessor - # ) - # self.pipeline.controlnet.set_attn_processor(attn_processor_class()) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # yiyi notes and todo: put here for testing for now, make it mixin later - def load_ip_adapter( - self, - pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], - **kwargs, - ): - """ - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. 
- resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - """ - self._set_ip_adapter() - - # Load the main state dict first/ - cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) - use_auth_token = kwargs.pop("use_auth_token", None) - revision = kwargs.pop("revision", None) - subfolder = kwargs.pop("subfolder", None) - weight_name = kwargs.pop("weight_name", None) - - user_agent = { - "file_type": "attn_procs_weights", - "framework": "pytorch", - } - - if not isinstance(pretrained_model_name_or_path_or_dict, dict): - model_file = _get_model_file( - pretrained_model_name_or_path_or_dict, - weights_name=weight_name, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - ) - state_dict = torch.load(model_file, map_location="cpu") - else: - state_dict = pretrained_model_name_or_path_or_dict - - keys = list(state_dict.keys()) - if keys != ["image_proj", "ip_adapter"]: - raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing.") - - # Handle image projection layers. - clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1] - cross_attention_dim = state_dict["image_proj"]["proj.weight"].shape[0] // 4 - - # yiyi notes: we use `ImageProjection` class in diffusers instead, and directly updated `unet.encoder_hid_proj` - image_projection = ImageProjection( - cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim, num_image_text_embeds=4 - ) - image_projection.to(dtype=self.unet.dtype, device=self.unet.device) - - diffusers_state_dict = {} - - diffusers_state_dict.update( - { - "image_embeds.weight": state_dict["image_proj"]["proj.weight"], - "image_embeds.bias": state_dict["image_proj"]["proj.bias"], - "norm.weight": state_dict["image_proj"]["norm.weight"], - "norm.bias": state_dict["image_proj"]["norm.bias"], - } - ) - - image_projection.load_state_dict(diffusers_state_dict) - # yiyi notes: this actually changes the unet config, need to refactor! 
- self.image_projection = image_projection.to(device=self.unet.device, dtype=self.unet.dtype) - - # Handle IP-Adapter cross-attention layers. - ip_layers = torch.nn.ModuleList( - [ - module if isinstance(module, nn.Module) else nn.Identity() - for module in self.unet.attn_processors.values() - ] - ) - ip_layers.load_state_dict(state_dict["ip_adapter"]) - - # yiyi notes and todo: put here for testing for now, make it mixin later - def set_scale(self, scale): - for attn_processor in self.unet.attn_processors.values(): - if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): - attn_processor.scale = scale From 7ece033a0f219ca47a22831dc5f212e499bc1c4c Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 11:02:33 +0530 Subject: [PATCH 030/139] circular import problem --- src/diffusers/loaders.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index b20761ca8c94..97eef763df78 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -28,14 +28,6 @@ from torch import nn from . import __version__ -from .models.attention_processor import ( - AttnProcessor, - AttnProcessor2_0, - IPAdapterAttnProcessor, - IPAdapterAttnProcessor2_0, - IPAdapterControlNetAttnProcessor, - IPAdapterControlNetAttnProcessor2_0, -) from .models.embeddings import ImageProjection from .models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta from .utils import ( @@ -3348,6 +3340,15 @@ class IPAdapterMixin: """Mixin for handling IP Adapters.""" def set_ip_adapter(self): + from .models.attention_processor import ( + AttnProcessor, + AttnProcessor2_0, + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, + IPAdapterControlNetAttnProcessor, + IPAdapterControlNetAttnProcessor2_0, + ) + unet = self.unet attn_procs = {} for name in unet.attn_processors.keys(): @@ -3501,6 +3502,8 @@ def load_ip_adapter( ip_layers.load_state_dict(state_dict["ip_adapter"]) def set_ip_adapter_scale(self, scale): + from .models.attention_processor import IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0 + for attn_processor in self.unet.attn_processors.values(): if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): attn_processor.scale = scale From 4cb0432a0519554a21010363d27530a15e88a770 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 11:05:28 +0530 Subject: [PATCH 031/139] circular imports. --- src/diffusers/loaders.py | 19 ++++++++----------- src/diffusers/models/attention_processor.py | 4 ++-- src/diffusers/models/lora.py | 3 ++- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 97eef763df78..b20761ca8c94 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -28,6 +28,14 @@ from torch import nn from . 
import __version__ +from .models.attention_processor import ( + AttnProcessor, + AttnProcessor2_0, + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, + IPAdapterControlNetAttnProcessor, + IPAdapterControlNetAttnProcessor2_0, +) from .models.embeddings import ImageProjection from .models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta from .utils import ( @@ -3340,15 +3348,6 @@ class IPAdapterMixin: """Mixin for handling IP Adapters.""" def set_ip_adapter(self): - from .models.attention_processor import ( - AttnProcessor, - AttnProcessor2_0, - IPAdapterAttnProcessor, - IPAdapterAttnProcessor2_0, - IPAdapterControlNetAttnProcessor, - IPAdapterControlNetAttnProcessor2_0, - ) - unet = self.unet attn_procs = {} for name in unet.attn_processors.keys(): @@ -3502,8 +3501,6 @@ def load_ip_adapter( ip_layers.load_state_dict(state_dict["ip_adapter"]) def set_ip_adapter_scale(self, scale): - from .models.attention_processor import IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0 - for attn_processor in self.unet.attn_processors.values(): if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): attn_processor.scale = scale diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 03c334437fef..73a16bcb2f05 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -2009,7 +2009,7 @@ def __call__( scale=1.0, ): if scale != 1.0: - logger.warning("`scale` of IPAttnProcessor should be set with `IPAdapterPipeline.set_scale`.") + logger.warning("`scale` of IPAttnProcessor should be set with `set_ip_adapter_scale`.") residual = hidden_states if attn.spatial_norm is not None: @@ -2123,7 +2123,7 @@ def __call__( scale=1.0, ): if scale != 1.0: - logger.warning("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") + logger.warning("`scale` of IPAttnProcessor should be set by `set_ip_adapter_scale`.") residual = hidden_states if attn.spatial_norm is not None: diff --git a/src/diffusers/models/lora.py b/src/diffusers/models/lora.py index a143c17458ad..fa4ab343a0f4 100644 --- a/src/diffusers/models/lora.py +++ b/src/diffusers/models/lora.py @@ -18,7 +18,6 @@ import torch.nn.functional as F from torch import nn -from ..loaders import PatchedLoraProjection, text_encoder_attn_modules, text_encoder_mlp_modules from ..utils import logging @@ -26,6 +25,8 @@ def adjust_lora_scale_text_encoder(text_encoder, lora_scale: float = 1.0): + from ..loaders import PatchedLoraProjection, text_encoder_attn_modules, text_encoder_mlp_modules + for _, attn_module in text_encoder_attn_modules(text_encoder): if isinstance(attn_module.q_proj, PatchedLoraProjection): attn_module.q_proj.lora_scale = lora_scale From 17223d406c4dcb7d707703b143f1e35de6d077ba Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 11:13:48 +0530 Subject: [PATCH 032/139] added_cond_kwargs not needed now. 
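With the image-encoding refactor of the previous patches, `encode_image_ip_adapter` projects the CLIP image embedding and the result is concatenated onto the text embeddings along the sequence dimension before the denoising loop, so the UNet already receives the image tokens through `encoder_hidden_states` and the IP-Adapter attention processors split them off again at the text context length. The separate `added_cond_kwargs={"image_embeds": ...}` route is therefore redundant. A small self-contained sketch of the mechanism (shapes chosen to match the 77 text tokens and 4 projected image tokens used here):

    import torch

    batch, text_len, image_tokens, dim = 2, 77, 4, 768
    prompt_embeds = torch.randn(batch, text_len, dim)      # from encode_prompt()
    image_embeds = torch.randn(batch, image_tokens, dim)   # from the ImageProjection layer

    # The pipeline appends the image tokens to the text tokens...
    encoder_hidden_states = torch.cat([prompt_embeds, image_embeds], dim=1)  # [2, 81, 768]

    # ...and the IP-Adapter attention processor later splits them back apart,
    # attending to each part with its own key/value projections.
    text_part = encoder_hidden_states[:, :text_len, :]
    ip_part = encoder_hidden_states[:, text_len:, :]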
--- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index d209fda2e1bc..9952c203d3c9 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -837,15 +837,11 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - if image_prompt is not None: - added_cond_kwargs = {"image_embeds": image_embeds} - # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - added_cond_kwargs=added_cond_kwargs, cross_attention_kwargs=self.cross_attention_kwargs, return_dict=False, )[0] From 8001d244dcc44cc7d5617bc9a040bc77bdfde5fa Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 20:07:55 +0530 Subject: [PATCH 033/139] remove save_ip_adapter. --- src/diffusers/loaders.py | 50 ---------------------------------------- 1 file changed, 50 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index b20761ca8c94..1abd52aa89c8 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -3504,53 +3504,3 @@ def set_ip_adapter_scale(self, scale): for attn_processor in self.unet.attn_processors.values(): if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): attn_processor.scale = scale - - @classmethod - def save_ip_adapter( - cls, - save_directory: Union[str, os.PathLike], - image_projection: Union[nn.Module, Dict[str, torch.Tensor]] = None, - crorss_attention_modules: Dict[str, torch.Tensor] = None, - weight_name: str = None, - save_function: Callable = None, - safe_serialization: bool = True, - ): - # Save the model - if os.path.isfile(save_directory): - logger.error(f"Provided path ({save_directory}) should be a directory, not a file") - return - - # Create a flat dictionary. - state_dict = {} - - # Populate the dictionary. - if isinstance(image_projection, nn.Module): - state_dict.update({"image_proj": image_projection.state_dict()}) - elif isinstance(image_projection, dict): - state_dict.update(image_projection) - else: - raise ValueError( - "Invalid input provided for `image_projection`. It can either be an `nn.Module` or a state dictionary." 
- ) - - state_dict.update({"ip_adapter": crorss_attention_modules}) - - if save_function is None: - if safe_serialization: - - def save_function(weights, filename): - return safetensors.torch.save_file(weights, filename, metadata={"format": "pt"}) - - else: - save_function = torch.save - - os.makedirs(save_directory, exist_ok=True) - - if weight_name is None: - if safe_serialization: - weight_name = IP_ADAPTER_WEIGHT_NAME_SAFE - else: - weight_name = IP_ADAPTER_WEIGHT_NAME - - save_function(state_dict, os.path.join(save_directory, weight_name)) - logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}") From 7887ba73b61af591851cbcf81073118c938d8b9d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 20:09:43 +0530 Subject: [PATCH 034/139] remove ip adapter pipeline from the face of the earth --- src/diffusers/pipelines/__init__.py | 2 - .../pipelines/ip_adapter/__init__.py | 2 - .../pipelines/ip_adapter/image_projection.py | 39 -- .../ip_adapter/pipeline_ip_adapter.py | 625 ------------------ 4 files changed, 668 deletions(-) delete mode 100644 src/diffusers/pipelines/ip_adapter/__init__.py delete mode 100644 src/diffusers/pipelines/ip_adapter/image_projection.py delete mode 100644 src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 9cf68f88b8eb..879bd6d98aa6 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -89,7 +89,6 @@ "IFPipeline", "IFSuperResolutionPipeline", ] - _import_structure["ip_adapter"] = ["StableDiffusionIPAdapterPipeline"] _import_structure["kandinsky"] = [ "KandinskyCombinedPipeline", "KandinskyImg2ImgCombinedPipeline", @@ -318,7 +317,6 @@ IFPipeline, IFSuperResolutionPipeline, ) - from .ip_adapter import StableDiffusionIPAdapterPipeline from .kandinsky import ( KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, diff --git a/src/diffusers/pipelines/ip_adapter/__init__.py b/src/diffusers/pipelines/ip_adapter/__init__.py deleted file mode 100644 index 389d0cb4c0c2..000000000000 --- a/src/diffusers/pipelines/ip_adapter/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .image_projection import ImageProjectionModel -from .pipeline_ip_adapter import StableDiffusionIPAdapterPipeline diff --git a/src/diffusers/pipelines/ip_adapter/image_projection.py b/src/diffusers/pipelines/ip_adapter/image_projection.py deleted file mode 100644 index e90b95214d24..000000000000 --- a/src/diffusers/pipelines/ip_adapter/image_projection.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from torch import nn - -from ...configuration_utils import ConfigMixin, register_to_config -from ...models.modeling_utils import ModelMixin - - -class ImageProjectionModel(ModelMixin, ConfigMixin): - """Image Projection Model.""" - - @register_to_config - def __init__(self, cross_attention_dim=768, clip_embeddings_dim=1024, clip_extra_context_tokens=4): - super().__init__() - - self.cross_attention_dim = cross_attention_dim - self.clip_extra_context_tokens = clip_extra_context_tokens - self.proj = nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim) - self.norm = nn.LayerNorm(cross_attention_dim) - - def forward(self, image_embeds): - embeds = image_embeds - clip_extra_context_tokens = self.proj(embeds).reshape( - -1, self.clip_extra_context_tokens, self.cross_attention_dim - ) - clip_extra_context_tokens = self.norm(clip_extra_context_tokens) - return clip_extra_context_tokens diff --git a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py b/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py deleted file mode 100644 index cce7a6c2a5a9..000000000000 --- a/src/diffusers/pipelines/ip_adapter/pipeline_ip_adapter.py +++ /dev/null @@ -1,625 +0,0 @@ -# Copyright 2023 IP Adapter Authors and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Any, Callable, Dict, List, Optional, Union - -import torch -import torch.nn.functional as F -from torch import nn -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection - -from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...models.attention_processor import ( - AttnProcessor, - AttnProcessor2_0, - IPAdapterAttnProcessor, - IPAdapterAttnProcessor2_0, -) -from ...models.lora import adjust_lora_scale_text_encoder -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import DIFFUSERS_CACHE, HF_HUB_OFFLINE, _get_model_file, logging -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline -from ..stable_diffusion import StableDiffusionPipelineOutput -from .image_projection import ImageProjectionModel - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg -def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): - """ - Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
See Section 3.4 - """ - std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) - std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) - # rescale the results from guidance (fixes overexposure) - noise_pred_rescaled = noise_cfg * (std_text / std_cfg) - # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images - noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg - return noise_cfg - - -class StableDiffusionIPAdapterPipeline(DiffusionPipeline): - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - ip_adapter_image_processor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection, - scheduler: KarrasDiffusionSchedulers, - ): - super().__init__() - - self.register_modules( - vae=vae, - unet=unet, - tokenizer=tokenizer, - text_encoder=text_encoder, - image_encoder=image_encoder, - ip_adapter_image_processor=ip_adapter_image_processor, - scheduler=scheduler, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) - - def _set_ip_adapter(self): - unet = self.unet - attn_procs = {} - for name in unet.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = unet.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(unet.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = unet.config.block_out_channels[block_id] - if cross_attention_dim is None: - attn_processor_class = ( - AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor - ) - attn_procs[name] = attn_processor_class() - else: - attn_processor_class = ( - IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor - ) - attn_procs[name] = attn_processor_class( - hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0 - ).to(dtype=unet.dtype, device=unet.device) - - unet.set_attn_processor(attn_procs) - - # TODO: create a separate pipeline for this: `StableDiffusionControlNetIPAdapterPipeline`. - # if hasattr(self.pipeline, "controlnet"): - # attn_processor_class = ( - # CNAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else CNAttnProcessor - # ) - # self.pipeline.controlnet.set_attn_processor(attn_processor_class()) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. 
When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - - def load_ip_adapter( - self, - pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], - **kwargs, - ): - """ - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. 
-        """
-        self._set_ip_adapter()
-
-        # Load the main state dict first.
-        cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
-        force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download", False)
-        proxies = kwargs.pop("proxies", None)
-        local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
-        use_auth_token = kwargs.pop("use_auth_token", None)
-        revision = kwargs.pop("revision", None)
-        subfolder = kwargs.pop("subfolder", None)
-        weight_name = kwargs.pop("weight_name", None)
-
-        user_agent = {
-            "file_type": "attn_procs_weights",
-            "framework": "pytorch",
-        }
-
-        if not isinstance(pretrained_model_name_or_path_or_dict, dict):
-            model_file = _get_model_file(
-                pretrained_model_name_or_path_or_dict,
-                weights_name=weight_name,
-                cache_dir=cache_dir,
-                force_download=force_download,
-                resume_download=resume_download,
-                proxies=proxies,
-                local_files_only=local_files_only,
-                use_auth_token=use_auth_token,
-                revision=revision,
-                subfolder=subfolder,
-                user_agent=user_agent,
-            )
-            state_dict = torch.load(model_file, map_location="cpu")
-        else:
-            state_dict = pretrained_model_name_or_path_or_dict
-
-        keys = list(state_dict.keys())
-        if keys != ["image_proj", "ip_adapter"]:
-            raise ValueError("Required keys (`image_proj` and `ip_adapter`) are missing from the state dict.")
-
-        # Handle image projection layers.
-        clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1]
-        cross_attention_dim = state_dict["image_proj"]["proj.weight"].shape[0] // 4
-        image_projection = ImageProjectionModel(
-            cross_attention_dim=cross_attention_dim, clip_embeddings_dim=clip_embeddings_dim
-        )
-        image_projection.to(dtype=self.unet.dtype, device=self.unet.device)
-        image_projection.load_state_dict(state_dict["image_proj"])
-        self.image_projection = image_projection
-
-        # Handle IP-Adapter cross-attention layers.
-        ip_layers = torch.nn.ModuleList(
-            [
-                module if isinstance(module, nn.Module) else nn.Identity()
-                for module in self.unet.attn_processors.values()
-            ]
-        )
-        ip_layers.load_state_dict(state_dict["ip_adapter"])
-
-    def set_scale(self, scale):
-        for attn_processor in self.unet.attn_processors.values():
-            if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)):
-                attn_processor.scale = scale
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
-    def prepare_extra_step_kwargs(self, generator, eta):
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt - def encode_prompt( - self, - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - lora_scale: Optional[float] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - device: (`torch.device`): - torch device - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - lora_scale (`float`, *optional*): - A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
- """ - # set lora scale so that monkey patched LoRA - # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): - self._lora_scale = lora_scale - - # dynamically adjust the LoRA scale - adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask.to(device) - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids.to(device), - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - if self.text_encoder is not None: - prompt_embeds_dtype = self.text_encoder.dtype - elif self.unet is not None: - prompt_embeds_dtype = self.unet.dtype - else: - prompt_embeds_dtype = prompt_embeds.dtype - - prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) - prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask.to(device) - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids.to(device), - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) - - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - - return prompt_embeds, negative_prompt_embeds - - def encode_image(self, image, device, num_images_per_prompt): - dtype = next(self.image_encoder.parameters()).dtype - - if not isinstance(image, torch.Tensor): - image = self.ip_adapter_image_processor(image, return_tensors="pt").pixel_values - - image = image.to(device=device, dtype=dtype) - (image_embeddings,) = self.image_encoder(image).image_embeds - image_prompt_embeds = self.image_projection(image_embeddings) - uncond_image_prompt_embeds = self.image_projection(torch.zeros_like(image_embeddings)) - - # duplicate image embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = image_prompt_embeds.shape - image_prompt_embeds = image_prompt_embeds.repeat(1, num_images_per_prompt, 1) - image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_images_per_prompt, 1) - uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - - return image_prompt_embeds, uncond_image_prompt_embeds - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
-            )
-
-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            latents = latents.to(device)
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
-        return latents
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]] = None,
-        image: PipelineImageInput = None,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        guidance_rescale: float = 0.0,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: int = 1,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        ip_adapter_scale: float = 1.0,
-        eta: float = 0.0,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
-        callback_steps: int = 1,
-    ):
-        # 0. Set IP Adapter scale
-        self.set_scale(ip_adapter_scale)
-
-        # 1. Check inputs and raise error if needed.
-        if getattr(self, "image_projection", None) is None:
-            raise ValueError(
-                "This pipeline cannot be called without having an `image_projection` module. Did you call `load_ip_adapter()` before running the pipeline?"
-            )
-        # TODO
-
-        # 1. Define call parameters
-        height = height or self.unet.config.sample_size * self.vae_scale_factor
-        width = width or self.unet.config.sample_size * self.vae_scale_factor
-
-        device = self._execution_device
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-        # here `guidance_scale` is defined analogous to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-
-        # 2. Encode input image
-        image_embeddings, uncond_image_embeddings = self.encode_image(image, device, num_images_per_prompt)
-
-        # 3. Encode prompt
-        text_encoder_lora_scale = (
-            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
-        )
-        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            lora_scale=text_encoder_lora_scale,
-        )
-        prompt_embeds = torch.cat([prompt_embeds, image_embeddings], dim=1)
-        negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_embeddings], dim=1)
-
-        # For classifier free guidance, we need to do two forward passes.
-        # Here we concatenate the unconditional and text embeddings into a single batch
-        # to avoid doing two forward passes
-        if do_classifier_free_guidance:
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
-
-        # 4. Prepare timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps = self.scheduler.timesteps
-
-        # 5.
Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, - )[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - if do_classifier_free_guidance and guidance_rescale > 0.0: - # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if not output_type == "latent": - image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] - has_nsfw_concept = None - # image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - # # Offload last model to CPU - # TODO - # if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - # self.final_offload_hook.offload() - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) From 46c668b36c5bd48907659f1ec2a1c0500734bd11 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 20:12:32 +0530 Subject: [PATCH 035/139] refactor __call__ --- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 9952c203d3c9..f130f5fae8e0 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -637,7 +637,6 @@ def num_timesteps(self): def __call__( self, prompt: Union[str, List[str]] = None, - image_prompt: PipelineImageInput = None, height: 
Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -649,6 +648,7 @@ def __call__( latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + image_prompt: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -695,6 +695,7 @@ def __call__( negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + image_prompt: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): From 6e28231df0d7d3cf42b9046cd234b41abd4ade0f Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 20:16:38 +0530 Subject: [PATCH 036/139] fix init. --- src/diffusers/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 5f95241830ed..4291e911ac74 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -253,7 +253,6 @@ "StableDiffusionInpaintPipeline", "StableDiffusionInpaintPipelineLegacy", "StableDiffusionInstructPix2PixPipeline", - "StableDiffusionIPAdapterPipeline", "StableDiffusionLatentUpscalePipeline", "StableDiffusionLDM3DPipeline", "StableDiffusionModelEditingPipeline", @@ -599,7 +598,6 @@ StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy, StableDiffusionInstructPix2PixPipeline, - StableDiffusionIPAdapterPipeline, StableDiffusionLatentUpscalePipeline, StableDiffusionLDM3DPipeline, StableDiffusionModelEditingPipeline, From ef937be47995d7048e69e8fa9d82ca3251495b97 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 20:41:08 +0530 Subject: [PATCH 037/139] remove none --- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index f130f5fae8e0..a88c47145f3a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -119,7 +119,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection = None, + image_encoder: CLIPVisionModelWithProjection, requires_safety_checker: bool = True, ): super().__init__() From 704344362e9564daec1076a32362cebc8ab469f4 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 20:41:43 +0530 Subject: [PATCH 038/139] image_encoder --- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index a88c47145f3a..271df918fc1f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ 
-196,7 +196,7 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - image_encoder=image_encoder, + # image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) From d7e390fcef8223fd2a2d0e22b1d810967f245b66 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 20:42:52 +0530 Subject: [PATCH 039/139] module registration --- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 271df918fc1f..a88c47145f3a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -196,7 +196,7 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - # image_encoder=image_encoder, + image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) From 86b0e4a1cc5485359aa78b09c3f5cf55b56a5faa Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 8 Nov 2023 20:55:27 +0530 Subject: [PATCH 040/139] does defaulting to None work for modules? --- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index a88c47145f3a..f130f5fae8e0 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -119,7 +119,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection, + image_encoder: CLIPVisionModelWithProjection = None, requires_safety_checker: bool = True, ): super().__init__() From d0cf0cc140ffd142fc01f2e84bf912af09e907db Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 8 Nov 2023 20:38:56 +0000 Subject: [PATCH 041/139] unet --- src/diffusers/loaders.py | 5 ++- src/diffusers/models/unet_2d_condition.py | 10 ++++- .../pipeline_stable_diffusion.py | 39 +++++++------------ 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 1abd52aa89c8..430719b826e9 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -3489,7 +3489,10 @@ def load_ip_adapter( ) image_projection.load_state_dict(diffusers_state_dict) - self.image_projection = image_projection.to(device=self.unet.device, dtype=self.unet.dtype) + + self.unet.encoder_hid_proj = image_projection.to(device=self.unet.device, dtype=self.unet.dtype) + self.unet.config.encoder_hid_dim_type = "image_proj" + self.unet.config.encoder_hid_dim = clip_embeddings_dim # Handle IP-Adapter cross-attention layers. 
ip_layers = torch.nn.ModuleList( diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index da1f91280a11..e192d145e74c 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -1015,13 +1015,19 @@ def forward( image_embeds = added_cond_kwargs.get("image_embeds") encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds) elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": - # Kandinsky 2.2 - style if "image_embeds" not in added_cond_kwargs: raise ValueError( f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" ) image_embeds = added_cond_kwargs.get("image_embeds") - encoder_hidden_states = self.encoder_hid_proj(image_embeds) + image_embeds = image_embeds.to(encoder_hidden_states.dtype) + image_embeds = self.encoder_hid_proj(image_embeds) + # IP-adapter + if any("to_k_ip" in k for k in self.state_dict().keys()): + encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1) + else: + # Kandinsky 2.2 - style + encoder_hidden_states = image_embeds # 2. pre-process sample = self.conv_in(sample) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index f130f5fae8e0..9b6565247dbe 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -444,9 +444,7 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds - # Note (sayakpaul): Name it this way to not mess up with other functions like _encode_image() - # common in imag2image pipelines. 
- def encode_image_ip_adapter(self, image, device, num_images_per_prompt): + def encode_image(self, image, device, num_images_per_prompt): dtype = next(self.image_encoder.parameters()).dtype if not isinstance(image, torch.Tensor): @@ -454,19 +452,10 @@ def encode_image_ip_adapter(self, image, device, num_images_per_prompt): image = image.to(device=device, dtype=dtype) image_embeds = self.image_encoder(image).image_embeds - projected_image_embeds = self.image_projection(image_embeds) - uncond_projected_image_embeds = self.image_projection(torch.zeros_like(image_embeds)) - - # duplicate image embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = projected_image_embeds.shape - projected_image_embeds = projected_image_embeds.repeat(1, num_images_per_prompt, 1) - projected_image_embeds = projected_image_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - uncond_projected_image_embeds.repeat(1, num_images_per_prompt, 1) - uncond_projected_image_embeds = uncond_projected_image_embeds.view( - bs_embed * num_images_per_prompt, seq_len, -1 - ) + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) - return projected_image_embeds, uncond_projected_image_embeds + uncond_image_embeds = torch.zeros_like(image_embeds) + return image_embeds, uncond_image_embeds def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: @@ -648,7 +637,7 @@ def __call__( latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, - image_prompt: Optional[PipelineImageInput] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -695,7 +684,7 @@ def __call__( negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - image_prompt: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -796,18 +785,16 @@ def __call__( clip_skip=self.clip_skip, ) - if image_prompt is not None: - image_embeds, negative_image_embeds = self.encode_image_ip_adapter( - image_prompt, device, num_images_per_prompt - ) - prompt_embeds = torch.cat([prompt_embeds, image_embeds], dim=1) - negative_prompt_embeds = torch.cat([negative_prompt_embeds, negative_image_embeds], dim=1) - # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) # 4. 
Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) @@ -838,12 +825,16 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + if ip_adapter_image is not None: + added_cond_kwargs = {"image_embeds": image_embeds} + # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] From f2431b3efc781144e2565207f2cb0d3b4faf7b33 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 8 Nov 2023 21:23:43 +0000 Subject: [PATCH 042/139] style --- src/diffusers/models/unet_2d_condition.py | 2 +- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index e192d145e74c..362bc7de1ffb 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -1026,7 +1026,7 @@ def forward( if any("to_k_ip" in k for k in self.state_dict().keys()): encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1) else: - # Kandinsky 2.2 - style + # Kandinsky 2.2 - style encoder_hidden_states = image_embeds # 2. pre-process diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 9b6565247dbe..c9cec1aaa0eb 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -790,7 +790,7 @@ def __call__( # to avoid doing two forward passes if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - + if ip_adapter_image is not None: image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) if self.do_classifier_free_guidance: @@ -827,6 +827,8 @@ def __call__( if ip_adapter_image is not None: added_cond_kwargs = {"image_embeds": image_embeds} + else: + added_cond_kwargs = None # predict the noise residual noise_pred = self.unet( From 7fdbf86a74b4afaa57e9acce59485b4f0daf6860 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 8 Nov 2023 21:51:10 +0000 Subject: [PATCH 043/139] fix a test --- src/diffusers/models/unet_2d_condition.py | 2 +- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 362bc7de1ffb..a77d53b98603 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -1020,10 +1020,10 @@ def forward( f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" ) image_embeds = added_cond_kwargs.get("image_embeds") - image_embeds = image_embeds.to(encoder_hidden_states.dtype) image_embeds = self.encoder_hid_proj(image_embeds) # IP-adapter if any("to_k_ip" in k for k in self.state_dict().keys()): + image_embeds = image_embeds.to(encoder_hidden_states.dtype) encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1) else: # Kandinsky 2.2 - style diff --git 
a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index c9cec1aaa0eb..e4bcc3a66bcc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -119,7 +119,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection = None, + image_encoder: CLIPVisionModelWithProjection, requires_safety_checker: bool = True, ): super().__init__() From ba43e033e434cf7c77e1b855365aa91a8ed042b7 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 8 Nov 2023 22:54:17 +0000 Subject: [PATCH 044/139] attemp to fix image_encoder none test --- tests/lora/test_lora_layers_peft.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/lora/test_lora_layers_peft.py b/tests/lora/test_lora_layers_peft.py index 6217d1cd28cd..b5d855bbd602 100644 --- a/tests/lora/test_lora_layers_peft.py +++ b/tests/lora/test_lora_layers_peft.py @@ -149,6 +149,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } lora_components = { "unet_lora_layers": unet_lora_layers, From 1d2b58b7d2e86f3f04a183eba1d2722f913bd085 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 8 Nov 2023 23:36:35 +0000 Subject: [PATCH 045/139] sdxl --- .../pipeline_stable_diffusion_xl.py | 39 +++++++++++++++++-- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 6143d2210c3c..998c29574683 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -16,11 +16,12 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch -from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer +from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection -from ...image_processor import VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import ( FromSingleFileMixin, + IPAdapterMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin, ) @@ -94,7 +95,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): class StableDiffusionXLPipeline( - DiffusionPipeline, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin + DiffusionPipeline, + FromSingleFileMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, ): r""" Pipeline for text-to-image generation using Stable Diffusion XL. @@ -141,7 +146,7 @@ class StableDiffusionXLPipeline( watermarker will be used. 
""" model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" - _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"] + _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2", "image_encoder"] _callback_tensor_inputs = [ "latents", "prompt_embeds", @@ -161,6 +166,7 @@ def __init__( tokenizer_2: CLIPTokenizer, unet: UNet2DConditionModel, scheduler: KarrasDiffusionSchedulers, + image_encoder: CLIPVisionModelWithProjection, force_zeros_for_empty_prompt: bool = True, add_watermarker: Optional[bool] = None, ): @@ -174,6 +180,7 @@ def __init__( tokenizer_2=tokenizer_2, unet=unet, scheduler=scheduler, + image_encoder=image_encoder, ) self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) @@ -455,6 +462,20 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.encode_image + def encode_image(self, image, device, num_images_per_prompt): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + uncond_image_embeds = torch.zeros_like(image_embeds) + return image_embeds, uncond_image_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature @@ -688,6 +709,7 @@ def __call__( negative_prompt_embeds: Optional[torch.FloatTensor] = None, pooled_prompt_embeds: Optional[torch.FloatTensor] = None, negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -772,6 +794,7 @@ def __call__( Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. @@ -970,6 +993,12 @@ def __call__( add_text_embeds = add_text_embeds.to(device) add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + image_embeds = image_embeds.to(device) + # 8. 
Denoising loop num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) @@ -999,6 +1028,8 @@ def __call__( # predict the noise residual added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + if ip_adapter_image is not None: + added_cond_kwargs["image_embeds"] = image_embeds noise_pred = self.unet( latent_model_input, t, From 6c0106b7327ebb98e415039686304e0eeacd1f1f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Nov 2023 00:11:55 +0000 Subject: [PATCH 046/139] sd img2img --- .../pipeline_stable_diffusion_img2img.py | 38 +++++++++++++++++-- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 583e6046b2e1..5d2f5a2840e5 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -19,11 +19,11 @@ import PIL.Image import torch from packaging import version -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...configuration_utils import FrozenDict from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -106,7 +106,7 @@ def preprocess(image): class StableDiffusionImg2ImgPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin ): r""" Pipeline for text-guided image-to-image generation using Stable Diffusion. @@ -119,6 +119,7 @@ class StableDiffusionImg2ImgPipeline( - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`]): @@ -140,7 +141,7 @@ class StableDiffusionImg2ImgPipeline( A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
""" model_cpu_offload_seq = "text_encoder->unet->vae" - _optional_components = ["safety_checker", "feature_extractor"] + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] @@ -153,6 +154,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection, requires_safety_checker: bool = True, ): super().__init__() @@ -229,6 +231,7 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, + image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) @@ -449,6 +452,20 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + uncond_image_embeds = torch.zeros_like(image_embeds) + return image_embeds, uncond_image_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: @@ -678,6 +695,7 @@ def __call__( generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -727,6 +745,7 @@ def __call__( negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -819,6 +838,11 @@ def __call__( if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + # 4. 
Preprocess image image = self.image_processor.preprocess(image) @@ -850,12 +874,18 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + if ip_adapter_image is not None: + added_cond_kwargs = {"image_embeds": image_embeds} + else: + added_cond_kwargs = None + # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] From 84bcbd6785eed53e0238f21f910ef31d09fd3457 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Nov 2023 00:48:57 +0000 Subject: [PATCH 047/139] inpaint --- .../pipeline_stable_diffusion_inpaint.py | 38 +++++++++++++++++-- tests/lora/test_lora_layers_peft.py | 1 + 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index ca7d62fd5077..04f886e55bec 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -19,11 +19,11 @@ import PIL.Image import torch from packaging import version -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...configuration_utils import FrozenDict from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AsymmetricAutoencoderKL, AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -170,7 +170,7 @@ def retrieve_latents(encoder_output, generator): class StableDiffusionInpaintPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin ): r""" Pipeline for text-guided image inpainting using Stable Diffusion. @@ -182,6 +182,7 @@ class StableDiffusionInpaintPipeline( - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`, `AsymmetricAutoencoderKL`]): @@ -203,7 +204,7 @@ class StableDiffusionInpaintPipeline( A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
""" model_cpu_offload_seq = "text_encoder->unet->vae" - _optional_components = ["safety_checker", "feature_extractor"] + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "mask", "masked_image_latents"] @@ -216,6 +217,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection, requires_safety_checker: bool = True, ): super().__init__() @@ -297,6 +299,7 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, + image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) @@ -520,6 +523,20 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + uncond_image_embeds = torch.zeros_like(image_embeds) + return image_embeds, uncond_image_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: @@ -807,6 +824,7 @@ def __call__( latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -872,6 +890,7 @@ def __call__( negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -999,6 +1018,11 @@ def __call__( if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + # 4. 
set timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps, num_inference_steps = self.get_timesteps( @@ -1101,12 +1125,18 @@ def __call__( if num_channels_unet == 9: latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) + if ip_adapter_image is not None: + added_cond_kwargs = {"image_embeds": image_embeds} + else: + added_cond_kwargs = None + # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] diff --git a/tests/lora/test_lora_layers_peft.py b/tests/lora/test_lora_layers_peft.py index b5d855bbd602..e874b5ebcaa6 100644 --- a/tests/lora/test_lora_layers_peft.py +++ b/tests/lora/test_lora_layers_peft.py @@ -139,6 +139,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "text_encoder_2": text_encoder_2, "tokenizer_2": tokenizer_2, + "image_encoder": None, } else: pipeline_components = { From 9b8b11ab546bb8918041330a8aa4a6199b2d5da6 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Nov 2023 01:06:04 +0000 Subject: [PATCH 048/139] another attemp to fix lora test --- tests/lora/test_lora_layers_old_backend.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/lora/test_lora_layers_old_backend.py b/tests/lora/test_lora_layers_old_backend.py index 047cdddfa95a..594d88803591 100644 --- a/tests/lora/test_lora_layers_old_backend.py +++ b/tests/lora/test_lora_layers_old_backend.py @@ -245,6 +245,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } lora_components = { "unet_lora_layers": unet_lora_layers, From d662f6ce3effe1eb2b6f0075332f4a1ab3d8b0f0 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Nov 2023 01:24:04 +0000 Subject: [PATCH 049/139] fix more tests --- tests/lora/test_lora_layers_old_backend.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/lora/test_lora_layers_old_backend.py b/tests/lora/test_lora_layers_old_backend.py index 594d88803591..8271f365d20e 100644 --- a/tests/lora/test_lora_layers_old_backend.py +++ b/tests/lora/test_lora_layers_old_backend.py @@ -757,6 +757,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components @@ -866,6 +867,7 @@ def get_dummy_components(self): "text_encoder_2": text_encoder_2, "tokenizer": tokenizer, "tokenizer_2": tokenizer_2, + "image_encoder": None, } lora_components = { "unet_lora_layers": unet_lora_layers, From a77b1e5a285f3923df8e9a8e6c5e024ab1deb16c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Nov 2023 01:41:59 +0000 Subject: [PATCH 050/139] more image_encoder: none --- tests/pipelines/test_pipelines_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index b9fe4d190f23..aff91a9589a6 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -1043,6 +1043,7 @@ def get_pipeline_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components From ecb2a5fd05a0ac21bf1ba9bb1f99fbb8a4e8f9b7 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 08:14:23 +0530 Subject: [PATCH 051/139] add to sdxl inpainting --- .../pipeline_stable_diffusion_xl_inpaint.py | 34 ++++++++++++++++--- 1 
file changed, 30 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 7890774c7539..58c6032212bd 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -18,10 +18,10 @@ import numpy as np import PIL.Image import torch -from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer +from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin, IPAdapterMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, @@ -249,7 +249,7 @@ def retrieve_latents(encoder_output, generator): class StableDiffusionXLInpaintPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin, IPAdapterMixin, ): r""" Pipeline for text-to-image generation using Stable Diffusion XL. @@ -300,7 +300,7 @@ class StableDiffusionXLInpaintPipeline( """ model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" - _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"] + _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2", "image_encoder"] _callback_tensor_inputs = [ "latents", "prompt_embeds", @@ -321,6 +321,7 @@ def __init__( tokenizer: CLIPTokenizer, tokenizer_2: CLIPTokenizer, unet: UNet2DConditionModel, + image_encoder: CLIPVisionModelWithProjection, scheduler: KarrasDiffusionSchedulers, requires_aesthetics_score: bool = False, force_zeros_for_empty_prompt: bool = True, @@ -335,6 +336,7 @@ def __init__( tokenizer=tokenizer, tokenizer_2=tokenizer_2, unet=unet, + image_encoder=image_encoder, scheduler=scheduler, ) self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) @@ -385,6 +387,20 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.encode_image + def encode_image(self, image, device, num_images_per_prompt): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + uncond_image_embeds = torch.zeros_like(image_embeds) + return image_embeds, uncond_image_embeds + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, @@ -1043,6 +1059,7 @@ def __call__( negative_prompt_embeds: Optional[torch.FloatTensor] = None, pooled_prompt_embeds: Optional[torch.FloatTensor] = None, negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = 
None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1141,6 +1158,7 @@ def __call__( Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. eta (`float`, *optional*, defaults to 0.0): @@ -1440,6 +1458,12 @@ def denoising_value_valid(dnv): add_text_embeds = add_text_embeds.to(device) add_time_ids = add_time_ids.to(device) + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + image_embeds = image_embeds.to(device) + # 11. Denoising loop num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) @@ -1478,6 +1502,8 @@ def denoising_value_valid(dnv): # predict the noise residual added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + if ip_adapter_image is not None: + added_cond_kwargs["image_embeds"] = image_embeds noise_pred = self.unet( latent_model_input, t, From c0042c11bb0a3213560ca88d7ad8f1b388e32704 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 08:25:57 +0530 Subject: [PATCH 052/139] add to sdxl image-to-image --- .../pipeline_stable_diffusion_xl_img2img.py | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 02a220fa851b..9fc253c75e91 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -17,10 +17,10 @@ import PIL.Image import torch -from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer +from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin, IPAdapterMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, @@ -104,7 +104,7 @@ def retrieve_latents(encoder_output, generator): class StableDiffusionXLImg2ImgPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin + DiffusionPipeline, TextualInversionLoaderMixin, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, IPAdapterMixin ): r""" Pipeline for text-to-image generation using Stable Diffusion XL. @@ -154,7 +154,7 @@ class StableDiffusionXLImg2ImgPipeline( watermarker will be used. 
""" model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" - _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"] + _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2", "image_encoder"] _callback_tensor_inputs = [ "latents", "prompt_embeds", @@ -173,6 +173,7 @@ def __init__( tokenizer: CLIPTokenizer, tokenizer_2: CLIPTokenizer, unet: UNet2DConditionModel, + image_encoder: CLIPVisionModelWithProjection, scheduler: KarrasDiffusionSchedulers, requires_aesthetics_score: bool = False, force_zeros_for_empty_prompt: bool = True, @@ -187,6 +188,7 @@ def __init__( tokenizer=tokenizer, tokenizer_2=tokenizer_2, unet=unet, + image_encoder=image_encoder, scheduler=scheduler, ) self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) @@ -663,6 +665,20 @@ def prepare_latents( latents = init_latents return latents + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.encode_image + def encode_image(self, image, device, num_images_per_prompt): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + uncond_image_embeds = torch.zeros_like(image_embeds) + return image_embeds, uncond_image_embeds def _get_add_time_ids( self, @@ -820,6 +836,7 @@ def __call__( negative_prompt_embeds: Optional[torch.FloatTensor] = None, pooled_prompt_embeds: Optional[torch.FloatTensor] = None, negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -913,6 +930,7 @@ def __call__( Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. @@ -1132,6 +1150,12 @@ def denoising_value_valid(dnv): add_text_embeds = add_text_embeds.to(device) add_time_ids = add_time_ids.to(device) + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + image_embeds = image_embeds.to(device) + # 9. 
Denoising loop num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) @@ -1166,6 +1190,8 @@ def denoising_value_valid(dnv): # predict the noise residual added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + if ip_adapter_image is not None: + added_cond_kwargs["image_embeds"] = image_embeds noise_pred = self.unet( latent_model_input, t, From 44eb0344eae808f118cf0628d14cdd8454e75b9e Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 08:27:01 +0530 Subject: [PATCH 053/139] stylw --- .../pipeline_stable_diffusion_xl_img2img.py | 17 +++++++++++++---- .../pipeline_stable_diffusion_xl_inpaint.py | 13 +++++++++++-- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 9fc253c75e91..2eeb8f13491c 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -20,7 +20,12 @@ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin, IPAdapterMixin +from ...loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, +) from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, @@ -104,7 +109,11 @@ def retrieve_latents(encoder_output, generator): class StableDiffusionXLImg2ImgPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, IPAdapterMixin + DiffusionPipeline, + TextualInversionLoaderMixin, + FromSingleFileMixin, + StableDiffusionXLLoraLoaderMixin, + IPAdapterMixin, ): r""" Pipeline for text-to-image generation using Stable Diffusion XL. 
@@ -665,7 +674,7 @@ def prepare_latents( latents = init_latents return latents - + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.encode_image def encode_image(self, image, device, num_images_per_prompt): dtype = next(self.image_encoder.parameters()).dtype @@ -678,7 +687,7 @@ def encode_image(self, image, device, num_images_per_prompt): image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) uncond_image_embeds = torch.zeros_like(image_embeds) - return image_embeds, uncond_image_embeds + return image_embeds, uncond_image_embeds def _get_add_time_ids( self, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 58c6032212bd..8093aee858ae 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -21,7 +21,12 @@ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin, IPAdapterMixin +from ...loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, +) from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, @@ -249,7 +254,11 @@ def retrieve_latents(encoder_output, generator): class StableDiffusionXLInpaintPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin, IPAdapterMixin, + DiffusionPipeline, + TextualInversionLoaderMixin, + StableDiffusionXLLoraLoaderMixin, + FromSingleFileMixin, + IPAdapterMixin, ): r""" Pipeline for text-to-image generation using Stable Diffusion XL. From 0f1e364e0cf576098713330124185dfb6df67aae Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 08:44:37 +0530 Subject: [PATCH 054/139] style --- .../pipeline_stable_diffusion_xl.py | 19 +++++++++++++++++-- .../pipeline_stable_diffusion_xl_img2img.py | 10 +++++++++- .../pipeline_stable_diffusion_xl_inpaint.py | 10 +++++++++- 3 files changed, 35 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 998c29574683..524b7b3d8ffb 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -16,7 +16,13 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch -from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import ( @@ -146,7 +152,14 @@ class StableDiffusionXLPipeline( watermarker will be used. 
""" model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" - _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2", "image_encoder"] + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "image_encoder", + "feature_extractor", + ] _callback_tensor_inputs = [ "latents", "prompt_embeds", @@ -167,6 +180,7 @@ def __init__( unet: UNet2DConditionModel, scheduler: KarrasDiffusionSchedulers, image_encoder: CLIPVisionModelWithProjection, + feature_extractor: CLIPImageProcessor, force_zeros_for_empty_prompt: bool = True, add_watermarker: Optional[bool] = None, ): @@ -181,6 +195,7 @@ def __init__( unet=unet, scheduler=scheduler, image_encoder=image_encoder, + feature_extractor=feature_extractor, ) self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 2eeb8f13491c..3e7add32cde3 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -17,7 +17,13 @@ import PIL.Image import torch -from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import ( @@ -183,6 +189,7 @@ def __init__( tokenizer_2: CLIPTokenizer, unet: UNet2DConditionModel, image_encoder: CLIPVisionModelWithProjection, + feature_extractor: CLIPImageProcessor, scheduler: KarrasDiffusionSchedulers, requires_aesthetics_score: bool = False, force_zeros_for_empty_prompt: bool = True, @@ -198,6 +205,7 @@ def __init__( tokenizer_2=tokenizer_2, unet=unet, image_encoder=image_encoder, + feature_extractor=feature_extractor, scheduler=scheduler, ) self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 8093aee858ae..86e751d1cbfb 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -18,7 +18,13 @@ import numpy as np import PIL.Image import torch -from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import ( @@ -331,6 +337,7 @@ def __init__( tokenizer_2: CLIPTokenizer, unet: UNet2DConditionModel, image_encoder: CLIPVisionModelWithProjection, + feature_extractor: CLIPImageProcessor, scheduler: KarrasDiffusionSchedulers, requires_aesthetics_score: bool = False, force_zeros_for_empty_prompt: bool = True, @@ -346,6 +353,7 @@ def __init__( tokenizer_2=tokenizer_2, unet=unet, image_encoder=image_encoder, + 
feature_extractor=feature_extractor, scheduler=scheduler, ) self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) From b2f7af0f63c15530d76900e7e9de554da8c7f72d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 08:47:35 +0530 Subject: [PATCH 055/139] feat: safetensors loading. --- src/diffusers/loaders.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 430719b826e9..79a15729cc6a 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -82,9 +82,6 @@ CUSTOM_DIFFUSION_WEIGHT_NAME = "pytorch_custom_diffusion_weights.bin" CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE = "pytorch_custom_diffusion_weights.safetensors" -IP_ADAPTER_WEIGHT_NAME = "pytorch_ip_adapter_weights.bin" -IP_ADAPTER_WEIGHT_NAME_SAFE = "pytorch_ip_adapter_weights.safetensors" - LORA_DEPRECATION_MESSAGE = "You are using an old version of LoRA backend. This will be deprecated in the next releases in favor of PEFT make sure to install the latest PEFT and transformers packages in the future." @@ -3439,7 +3436,6 @@ def load_ip_adapter( revision = kwargs.pop("revision", None) subfolder = kwargs.pop("subfolder", None) weight_name = kwargs.pop("weight_name", None) - # TODO (sayakpaul): incorporate safetensors user_agent = { "file_type": "attn_procs_weights", @@ -3460,7 +3456,10 @@ def load_ip_adapter( subfolder=subfolder, user_agent=user_agent, ) - state_dict = torch.load(model_file, map_location="cpu") + if weight_name.endswith(".safetensors"): + state_dict = safetensors.torch.load_file(model_file, device="cpu") + else: + state_dict = torch.load(model_file, map_location="cpu") else: state_dict = pretrained_model_name_or_path_or_dict From 6af211239edc8474953711a5b58e98f4234db70f Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 09:47:53 +0530 Subject: [PATCH 056/139] fix: tests. 
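
This and the neighbouring test-fix commits all add the same key. The pipelines updated earlier in this series now declare `image_encoder` (and `feature_extractor`) among their `_optional_components` and accept them in `__init__`, so the dummy component dictionaries the tests build are expected to cover every declared component, even ones left as `None`. A minimal sketch of that relationship, using a toy class rather than the real pipelines (illustrative, not patch code):

```py
# Toy stand-in for the pipelines touched above; the class itself is illustrative.
class ToyPipeline:
    _optional_components = ["image_encoder", "feature_extractor"]

    def __init__(self, unet, image_encoder=None, feature_extractor=None):
        # Optional components may be None; they are only used when the
        # corresponding feature (here, IP-Adapter image prompts) is requested.
        self.unet = unet
        self.image_encoder = image_encoder
        self.feature_extractor = feature_extractor


components = {"unet": object(), "image_encoder": None, "feature_extractor": None}
pipe = ToyPipeline(**components)
print(pipe.image_encoder)  # None until a CLIP vision encoder is actually supplied
```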
--- tests/lora/test_lora_layers_old_backend.py | 1 + tests/lora/test_lora_layers_peft.py | 1 + tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py | 2 ++ .../stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py | 2 ++ 4 files changed, 6 insertions(+) diff --git a/tests/lora/test_lora_layers_old_backend.py b/tests/lora/test_lora_layers_old_backend.py index 8271f365d20e..6a432718e39c 100644 --- a/tests/lora/test_lora_layers_old_backend.py +++ b/tests/lora/test_lora_layers_old_backend.py @@ -868,6 +868,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "tokenizer_2": tokenizer_2, "image_encoder": None, + "feature_extractor": None, } lora_components = { "unet_lora_layers": unet_lora_layers, diff --git a/tests/lora/test_lora_layers_peft.py b/tests/lora/test_lora_layers_peft.py index e874b5ebcaa6..ddb11389dbb1 100644 --- a/tests/lora/test_lora_layers_peft.py +++ b/tests/lora/test_lora_layers_peft.py @@ -140,6 +140,7 @@ def get_dummy_components(self): "text_encoder_2": text_encoder_2, "tokenizer_2": tokenizer_2, "image_encoder": None, + "feature_extractor": None, } else: pipeline_components = { diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py index d2d00d9c0110..16a659fac6a8 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py @@ -122,6 +122,8 @@ def get_dummy_components(self): "tokenizer": tokenizer, "text_encoder_2": text_encoder_2, "tokenizer_2": tokenizer_2, + "image_encoder": None, + "feature_extractor": None, } return components diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py index aa607c23ffda..b12d8b2e9b08 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py @@ -126,6 +126,8 @@ def get_dummy_components(self, skip_first_text_encoder=False): "tokenizer": tokenizer if not skip_first_text_encoder else None, "text_encoder_2": text_encoder_2, "tokenizer_2": tokenizer_2, + "image_encoder": None, + "feature_extractor": None, "requires_aesthetics_score": True, } return components From 88efe676ed9fb32ec0d001688e322c48e5e05336 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 10:23:37 +0530 Subject: [PATCH 057/139] fix more tests --- .../stable_diffusion/test_stable_diffusion_img2img.py | 1 + .../stable_diffusion/test_stable_diffusion_model_editing.py | 1 + tests/pipelines/stable_diffusion_2/test_stable_diffusion.py | 1 + .../stable_diffusion_xl/test_stable_diffusion_xl_img2img.py | 4 ++++ 4 files changed, 7 insertions(+) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 9e365e860f0e..a7498c06c2b7 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -148,6 +148,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py index 27c6a65b6395..ad017f2241b2 100644 --- 
a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py @@ -92,6 +92,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py index 4414d1ec5075..ed295f792f99 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py @@ -123,6 +123,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py index c3fb397956fa..699b60646549 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py @@ -123,6 +123,8 @@ def get_dummy_components(self, skip_first_text_encoder=False): "text_encoder_2": text_encoder_2, "tokenizer_2": tokenizer_2, "requires_aesthetics_score": True, + "image_encoder": None, + "feature_extractor": None, } return components @@ -438,6 +440,8 @@ def get_dummy_components(self): "text_encoder_2": text_encoder_2, "tokenizer_2": tokenizer_2, "requires_aesthetics_score": True, + "image_encoder": None, + "feature_extractor": None, } return components From 5baa910aeaa20df96319dd2eeb61d48663e9da6c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Nov 2023 04:56:35 +0000 Subject: [PATCH 058/139] doc --- .../en/using-diffusers/loading_adapters.md | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index 8f6bf85da318..0d4ed7d72f66 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -307,3 +307,74 @@ prompt = "a house by william eggleston, sunrays, beautiful, sunlight, sunrays, b image = pipeline(prompt=prompt).images[0] image ``` + +### IP-Adapter + +[IP-Adapter](https://ip-adapter.github.io/) is an effective and lightweight adapter to achieve image prompt capability for the pre-trained text-to-image diffusion models. It is now available to use with most of our Stable Diffusion and Stable Diffusion XL pipelines. You can also use the IP-Adapter with other custom models fine-tuned from the same base model, as well as ControlNet and T2I adapters. Moreover, the image prompt can also work well with the text prompt to accomplish multimodal image generation. + +Let's look at an example where we use IP-Adapter with the Stable Diffusion text-to-image pipeline. + +``` py +from diffusers import AutoPipelineForText2Image +import torch +from diffusers.utils import load_image + +pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda") +``` + +Now you can load the IP-Adapter with [`~loaders.IPAdapterMixin.load_ip_adapter`] method. + +```py +pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") +``` + +IP-Adapter allows you to use both image and text to condition the image generation process. 
In this example, let's take the cute bear eating pizza that we generated with Textual Inversion, and create a new bear that is similarly cute but wears sunglasses. We can pass the bear image as `ip_adapter_image`, along with a text prompt that mentions "sunglasses".  + +```py +pipeline.set_ip_adapter_scale(0.6) +image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png") +generator = torch.Generator(device="cpu").manual_seed(33) +images = pipeline( +    prompt='best quality, high quality, wearing sunglasses', +    ip_adapter_image=image, +    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", +    num_inference_steps=50, +    generator=generator, +).images +images[0] +``` + +
+    <!-- output image: the generated bear, now wearing sunglasses -->
+ + + +You can use the `pipeline.set_ip_adapter_scale()` method to adjust the ratio of text prompt and image prompt condition.  If you only use the image prompt, you should set the scale to be `1.0`. You can lower the scale to get more diversity in the generation, at the cost of less prompt alignment. +`scale=0.5` can achieve good results in most cases when you use both text and image prompts. + + +IP-Adapter also works great with Image-to-Image and Inpainting pipelines. Here is an example of how you can use it with Image-to-Image. + +```py +from diffusers import AutoPipelineForImage2Image +import torch +from diffusers.utils import load_image + +pipeline = AutoPipelineForImage2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda") + +image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/vermeer.jpg") +ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/river.png") + +pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") +generator = torch.Generator(device="cpu").manual_seed(33) +images = pipeline( +    prompt='best quality, high quality', +    image = image, +    ip_adapter_image=ip_image, +    num_inference_steps=50, +    generator=generator, +    strength=0.6, +).images +images[0] +``` \ No newline at end of file From abc137297fe78278a9e75942044193cf83308323 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Nov 2023 06:01:36 +0000 Subject: [PATCH 059/139] fix sdxl img2img + inpaint tests --- .../pipeline_stable_diffusion_xl_img2img.py | 2 +- .../pipeline_stable_diffusion_xl_inpaint.py | 2 +- .../test_stable_diffusion_xl_img2img.py | 32 +++++++++++++++++-- .../test_stable_diffusion_xl_inpaint.py | 31 ++++++++++++++++-- 4 files changed, 59 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 3e7add32cde3..faa37c7828f4 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -169,7 +169,7 @@ class StableDiffusionXLImg2ImgPipeline( watermarker will be used. 
""" model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" - _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2", "image_encoder"] + _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2", "image_encoder", "feature_extractor"] _callback_tensor_inputs = [ "latents", "prompt_embeds", diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 86e751d1cbfb..e00cf6a4d125 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -315,7 +315,7 @@ class StableDiffusionXLInpaintPipeline( """ model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" - _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2", "image_encoder"] + _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2", "image_encoder", "feature_extractor"] _callback_tensor_inputs = [ "latents", "prompt_embeds", diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py index 699b60646549..ea5ae1c0483e 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py @@ -18,7 +18,7 @@ import numpy as np import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionConfig, CLIPVisionModelWithProjection, CLIPImageProcessor from diffusers import ( AutoencoderKL, @@ -114,6 +114,32 @@ def get_dummy_components(self, skip_first_text_encoder=False): text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config) tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + torch.manual_seed(0) + image_encoder_config = CLIPVisionConfig( + hidden_size=32, + image_size=224, + projection_dim=32, + intermediate_size=37, + num_attention_heads=4, + num_channels=3, + num_hidden_layers=5, + patch_size=14, + ) + + image_encoder = CLIPVisionModelWithProjection(image_encoder_config) + + feature_extractor = CLIPImageProcessor( + crop_size=224, + do_center_crop=True, + do_normalize=True, + do_resize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + resample=3, + size=224, + ) + + components = { "unet": unet, "scheduler": scheduler, @@ -123,8 +149,8 @@ def get_dummy_components(self, skip_first_text_encoder=False): "text_encoder_2": text_encoder_2, "tokenizer_2": tokenizer_2, "requires_aesthetics_score": True, - "image_encoder": None, - "feature_extractor": None, + "image_encoder": image_encoder, + "feature_extractor": feature_extractor, } return components diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py index b12d8b2e9b08..6f8154c3f16c 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py @@ -20,7 +20,7 @@ import numpy as np import torch from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, 
CLIPTextModelWithProjection, CLIPTokenizer +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionConfig, CLIPVisionModelWithProjection, CLIPImageProcessor from diffusers import ( AutoencoderKL, @@ -117,6 +117,31 @@ def get_dummy_components(self, skip_first_text_encoder=False): text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config) tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + torch.manual_seed(0) + image_encoder_config = CLIPVisionConfig( + hidden_size=32, + image_size=224, + projection_dim=32, + intermediate_size=37, + num_attention_heads=4, + num_channels=3, + num_hidden_layers=5, + patch_size=14, + ) + + image_encoder = CLIPVisionModelWithProjection(image_encoder_config) + + feature_extractor = CLIPImageProcessor( + crop_size=224, + do_center_crop=True, + do_normalize=True, + do_resize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + resample=3, + size=224, + ) components = { "unet": unet, @@ -126,8 +151,8 @@ def get_dummy_components(self, skip_first_text_encoder=False): "tokenizer": tokenizer if not skip_first_text_encoder else None, "text_encoder_2": text_encoder_2, "tokenizer_2": tokenizer_2, - "image_encoder": None, - "feature_extractor": None, + "image_encoder": image_encoder, + "feature_extractor": feature_extractor, "requires_aesthetics_score": True, } return components From 5e60de565e90e2f44b17f003504baef25045e09d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 11:36:58 +0530 Subject: [PATCH 060/139] add: integration test --- tests/models/test_ip_adapters.py | 86 ++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 tests/models/test_ip_adapters.py diff --git a/tests/models/test_ip_adapters.py b/tests/models/test_ip_adapters.py new file mode 100644 index 000000000000..8523c670e162 --- /dev/null +++ b/tests/models/test_ip_adapters.py @@ -0,0 +1,86 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import unittest + +import numpy as np +import torch +from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection + +from diffusers import StableDiffusionPipeline +from diffusers.utils import load_image +from diffusers.utils.testing_utils import ( + enable_full_determinism, + nightly, + require_torch_gpu, + torch_device, +) + + +enable_full_determinism() + + +@nightly +@require_torch_gpu +class IPAdapterSDIntegrationTests(unittest.TestCase): + dtype = torch.float16 + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_image_encoder(self, repo_id, subfolder): + image_encoder = CLIPVisionModelWithProjection.from_pretrained( + repo_id, subfolder=subfolder, torch_dtype=self.dtype + ).to(torch_device) + return image_encoder + + def get_image_processor(self, repo_id): + image_processor = CLIPImageProcessor.from_pretrained(repo_id) + return image_processor + + def get_dummy_inputs(self): + image = load_image( + "https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png" + ) + input_kwargs = { + "prompt": "best quality, high quality", + "negative_prompt": "monochrome, lowres, bad anatomy, worst quality, low quality", + "num_inference_steps": 2, + "generator": torch.Generator(device="cpu").manual_seed(33), + "ip_adapter_image": image, + "output_type": "np", + } + return input_kwargs + + def text_to_image(self): + image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", sunfolder="models/image_encoder") + pipeline = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, torch_dtype=self.dtype + ) + pipeline.to(torch_device) + pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") + + inputs = self.get_dummy_inputs() + images = pipeline(**inputs).images + image_slice = images[0, :3, :3, -1].flatten() + slice = image_slice.tolist() + print(", ".join([str(round(x, 4)) for x in slice])) + + expected_slice = np.array([list(range(9))]).astype("float32") + + assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) From 95797b5623f11bb6851cf9ec4238d30ba8abbc2e Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 11:46:07 +0530 Subject: [PATCH 061/139] add: test_ prefix. 
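
The prefix matters because `unittest`'s default loader (which pytest also honours for `TestCase` subclasses) only discovers methods whose names start with `test`; without it the integration case added above is silently skipped. A minimal illustration, separate from the patch:

```py
import unittest


class ExampleTests(unittest.TestCase):
    def text_to_image(self):
        # No `test` prefix: the default loader never runs this method.
        self.assertTrue(True)

    def test_text_to_image(self):
        # Discovered and executed by unittest / pytest.
        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()  # reports exactly one test
```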
--- tests/models/test_ip_adapters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_ip_adapters.py b/tests/models/test_ip_adapters.py index 8523c670e162..3c349b92408f 100644 --- a/tests/models/test_ip_adapters.py +++ b/tests/models/test_ip_adapters.py @@ -67,7 +67,7 @@ def get_dummy_inputs(self): } return input_kwargs - def text_to_image(self): + def test_text_to_image(self): image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", sunfolder="models/image_encoder") pipeline = StableDiffusionPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, torch_dtype=self.dtype From b04cdcfd2dd8e3ee3ded3e10a585891d3e30a3e2 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 11:46:44 +0530 Subject: [PATCH 062/139] subfolder --- tests/models/test_ip_adapters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_ip_adapters.py b/tests/models/test_ip_adapters.py index 3c349b92408f..515a6b2a1fa5 100644 --- a/tests/models/test_ip_adapters.py +++ b/tests/models/test_ip_adapters.py @@ -68,7 +68,7 @@ def get_dummy_inputs(self): return input_kwargs def test_text_to_image(self): - image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", sunfolder="models/image_encoder") + image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") pipeline = StableDiffusionPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, torch_dtype=self.dtype ) From be731674ec960fc44006a6e5fbdd5ca13c83fd04 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 11:51:18 +0530 Subject: [PATCH 063/139] tests --- .../pipeline_stable_diffusion_xl_img2img.py | 9 ++++++++- .../pipeline_stable_diffusion_xl_inpaint.py | 9 ++++++++- tests/models/test_ip_adapters.py | 4 ++-- .../test_stable_diffusion_xl_img2img.py | 13 ++++++++++--- .../test_stable_diffusion_xl_inpaint.py | 14 +++++++++++--- 5 files changed, 39 insertions(+), 10 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index faa37c7828f4..a50bbd14d8d7 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -169,7 +169,14 @@ class StableDiffusionXLImg2ImgPipeline( watermarker will be used. 
""" model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" - _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2", "image_encoder", "feature_extractor"] + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "image_encoder", + "feature_extractor", + ] _callback_tensor_inputs = [ "latents", "prompt_embeds", diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index e00cf6a4d125..5db74cb089b0 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -315,7 +315,14 @@ class StableDiffusionXLInpaintPipeline( """ model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" - _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2", "image_encoder", "feature_extractor"] + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "image_encoder", + "feature_extractor", + ] _callback_tensor_inputs = [ "latents", "prompt_embeds", diff --git a/tests/models/test_ip_adapters.py b/tests/models/test_ip_adapters.py index 515a6b2a1fa5..c7e737718127 100644 --- a/tests/models/test_ip_adapters.py +++ b/tests/models/test_ip_adapters.py @@ -60,7 +60,7 @@ def get_dummy_inputs(self): input_kwargs = { "prompt": "best quality, high quality", "negative_prompt": "monochrome, lowres, bad anatomy, worst quality, low quality", - "num_inference_steps": 2, + "num_inference_steps": 5, "generator": torch.Generator(device="cpu").manual_seed(33), "ip_adapter_image": image, "output_type": "np", @@ -70,7 +70,7 @@ def get_dummy_inputs(self): def test_text_to_image(self): image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") pipeline = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, torch_dtype=self.dtype + "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, safety_checker=None, torch_dtype=self.dtype ) pipeline.to(torch_device) pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py index ea5ae1c0483e..e4bbb00738cc 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py @@ -18,7 +18,15 @@ import numpy as np import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionConfig, CLIPVisionModelWithProjection, CLIPImageProcessor +from transformers import ( + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) from diffusers import ( AutoencoderKL, @@ -127,7 +135,7 @@ def get_dummy_components(self, skip_first_text_encoder=False): ) image_encoder = CLIPVisionModelWithProjection(image_encoder_config) - + feature_extractor = CLIPImageProcessor( crop_size=224, do_center_crop=True, @@ -139,7 +147,6 @@ def get_dummy_components(self, skip_first_text_encoder=False): size=224, ) - components = { "unet": unet, "scheduler": scheduler, diff --git 
a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py index 6f8154c3f16c..431338db1209 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py @@ -20,7 +20,15 @@ import numpy as np import torch from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionConfig, CLIPVisionModelWithProjection, CLIPImageProcessor +from transformers import ( + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) from diffusers import ( AutoencoderKL, @@ -117,7 +125,7 @@ def get_dummy_components(self, skip_first_text_encoder=False): text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config) tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - + torch.manual_seed(0) image_encoder_config = CLIPVisionConfig( hidden_size=32, @@ -131,7 +139,7 @@ def get_dummy_components(self, skip_first_text_encoder=False): ) image_encoder = CLIPVisionModelWithProjection(image_encoder_config) - + feature_extractor = CLIPImageProcessor( crop_size=224, do_center_crop=True, From fb401d45d7d04c17ff9cc8ad4d175db1a74ae64c Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 11:59:48 +0530 Subject: [PATCH 064/139] add: image-to-image --- tests/models/test_ip_adapters.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/tests/models/test_ip_adapters.py b/tests/models/test_ip_adapters.py index c7e737718127..7f6b9a970e68 100644 --- a/tests/models/test_ip_adapters.py +++ b/tests/models/test_ip_adapters.py @@ -20,7 +20,7 @@ import torch from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection -from diffusers import StableDiffusionPipeline +from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline from diffusers.utils import load_image from diffusers.utils.testing_utils import ( enable_full_determinism, @@ -53,7 +53,7 @@ def get_image_processor(self, repo_id): image_processor = CLIPImageProcessor.from_pretrained(repo_id) return image_processor - def get_dummy_inputs(self): + def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False): image = load_image( "https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png" ) @@ -65,6 +65,11 @@ def get_dummy_inputs(self): "ip_adapter_image": image, "output_type": "np", } + if for_image_to_image: + image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/vermeer.jpg") + ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/river.png") + input_kwargs.update({"image": image, "ip_adapter_image": ip_image}) + return input_kwargs def test_text_to_image(self): @@ -78,9 +83,26 @@ def test_text_to_image(self): inputs = self.get_dummy_inputs() images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() + + expected_slice = np.array([0.8047, 0.8774, 0.9248, 0.9155, 0.9814, 1.0, 0.9678, 1.0, 1.0]) + + assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) + + def test_image_to_image(self): + StableDiffusionImg2ImgPipeline + image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") + pipeline = 
StableDiffusionImg2ImgPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, safety_checker=None, torch_dtype=self.dtype + ) + pipeline.to(torch_device) + pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") + + inputs = self.get_dummy_inputs(for_image_to_image=True) + images = pipeline(**inputs).images + image_slice = images[0, :3, :3, -1].flatten() slice = image_slice.tolist() print(", ".join([str(round(x, 4)) for x in slice])) - expected_slice = np.array([list(range(9))]).astype("float32") + expected_slice = np.array([0.8047, 0.8774, 0.9248, 0.9155, 0.9814, 1.0, 0.9678, 1.0, 1.0]) assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) From 86b4f09ca0caba7c39cd879c019ffd1bcd21b115 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Nov 2023 06:31:19 +0000 Subject: [PATCH 065/139] fix bunch tests --- .../pipeline_stable_diffusion_xl_img2img.py | 9 ++++++++- .../pipeline_stable_diffusion_xl_inpaint.py | 9 ++++++++- .../stable_diffusion/test_stable_diffusion.py | 1 + .../test_stable_diffusion_inpaint.py | 2 ++ .../test_stable_diffusion_model_editing.py | 1 - .../test_stable_diffusion_v_pred.py | 3 +++ .../test_stable_diffusion_xl_img2img.py | 13 ++++++++++--- .../test_stable_diffusion_xl_inpaint.py | 14 +++++++++++--- tests/pipelines/test_pipelines.py | 13 +++++++++++-- 9 files changed, 54 insertions(+), 11 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index faa37c7828f4..a50bbd14d8d7 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -169,7 +169,14 @@ class StableDiffusionXLImg2ImgPipeline( watermarker will be used. 
""" model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" - _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2", "image_encoder", "feature_extractor"] + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "image_encoder", + "feature_extractor", + ] _callback_tensor_inputs = [ "latents", "prompt_embeds", diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index e00cf6a4d125..5db74cb089b0 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -315,7 +315,14 @@ class StableDiffusionXLInpaintPipeline( """ model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" - _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2", "image_encoder", "feature_extractor"] + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "image_encoder", + "feature_extractor", + ] _callback_tensor_inputs = [ "latents", "prompt_embeds", diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index ad77cc3e2b22..133f468b8d13 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -160,6 +160,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index 59c21ed38b51..8072311568d7 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -151,6 +151,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components @@ -333,6 +334,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py index ad017f2241b2..27c6a65b6395 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py @@ -92,7 +92,6 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, - "image_encoder": None, } return components diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py index e2d476dec502..09034789c61c 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py @@ -127,6 +127,7 @@ def test_stable_diffusion_v_pred_ddim(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=None, + image_encoder=None, requires_safety_checker=False, ) sd_pipe = sd_pipe.to(device) @@ -176,6 +177,7 @@ def test_stable_diffusion_v_pred_k_euler(self): 
tokenizer=tokenizer, safety_checker=None, feature_extractor=None, + image_encoder=None, requires_safety_checker=False, ) sd_pipe = sd_pipe.to(device) @@ -236,6 +238,7 @@ def test_stable_diffusion_v_pred_fp16(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=None, + image_encoder=None, requires_safety_checker=False, ) sd_pipe = sd_pipe.to(torch_device) diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py index ea5ae1c0483e..e4bbb00738cc 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py @@ -18,7 +18,15 @@ import numpy as np import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionConfig, CLIPVisionModelWithProjection, CLIPImageProcessor +from transformers import ( + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) from diffusers import ( AutoencoderKL, @@ -127,7 +135,7 @@ def get_dummy_components(self, skip_first_text_encoder=False): ) image_encoder = CLIPVisionModelWithProjection(image_encoder_config) - + feature_extractor = CLIPImageProcessor( crop_size=224, do_center_crop=True, @@ -139,7 +147,6 @@ def get_dummy_components(self, skip_first_text_encoder=False): size=224, ) - components = { "unet": unet, "scheduler": scheduler, diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py index 6f8154c3f16c..431338db1209 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py @@ -20,7 +20,15 @@ import numpy as np import torch from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionConfig, CLIPVisionModelWithProjection, CLIPImageProcessor +from transformers import ( + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) from diffusers import ( AutoencoderKL, @@ -117,7 +125,7 @@ def get_dummy_components(self, skip_first_text_encoder=False): text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config) tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - + torch.manual_seed(0) image_encoder_config = CLIPVisionConfig( hidden_size=32, @@ -131,7 +139,7 @@ def get_dummy_components(self, skip_first_text_encoder=False): ) image_encoder = CLIPVisionModelWithProjection(image_encoder_config) - + feature_extractor = CLIPImageProcessor( crop_size=224, do_center_crop=True, diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 875fd787c8b0..82f2f6f03643 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -1137,8 +1137,8 @@ def test_stable_diffusion_components(self): safety_checker=None, feature_extractor=self.dummy_extractor, ).to(torch_device) - img2img = StableDiffusionImg2ImgPipeline(**inpaint.components).to(torch_device) - text2img = StableDiffusionPipeline(**inpaint.components).to(torch_device) + img2img = StableDiffusionImg2ImgPipeline(**inpaint.components, image_encoder=None).to(torch_device) + text2img = 
StableDiffusionPipeline(**inpaint.components, image_encoder=None).to(torch_device) prompt = "A painting of a squirrel eating a burger" @@ -1185,6 +1185,7 @@ def test_pipe_false_offload_warn(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, + image_encoder=None, ) sd.enable_model_cpu_offload() @@ -1203,6 +1204,7 @@ def test_pipe_false_offload_warn(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, + image_encoder=None, ) def test_set_scheduler(self): @@ -1220,6 +1222,7 @@ def test_set_scheduler(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, + image_encoder=None, ) sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config) @@ -1252,6 +1255,7 @@ def test_set_component_to_none(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, + image_encoder=None, ) generator = torch.Generator(device="cpu").manual_seed(0) @@ -1293,6 +1297,7 @@ def test_set_scheduler_consistency(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, + image_encoder=None, ) pndm_config = sd.scheduler.config @@ -1311,6 +1316,7 @@ def test_set_scheduler_consistency(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, + image_encoder=None, ) ddim_config = sd.scheduler.config @@ -1404,6 +1410,7 @@ def test_optional_components(self): tokenizer=tokenizer, safety_checker=unet, feature_extractor=self.dummy_extractor, + image_encoder=None, ) sd = orig_sd @@ -1543,6 +1550,7 @@ def test_pipe_to(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, + image_encoder=None, ) device_type = torch.device(torch_device).type @@ -1604,6 +1612,7 @@ def test_pipe_same_device_id_offload(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, + image_encoder=None, ) sd.enable_model_cpu_offload(gpu_id=5) From 66f7023514629e71b9bf2b06d3c4e7b34c393378 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 12:01:56 +0530 Subject: [PATCH 066/139] fix: assertion values. 
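
The updated reference values follow the recipe the temporary `print` calls above hint at: run the pipeline once, take a small corner slice of the first generated image, and paste the rounded numbers into `expected_slice`. A small sketch of that pattern (the helper names are illustrative, not part of the patch):

```py
import numpy as np


def corner_slice(images: np.ndarray) -> np.ndarray:
    # Same indexing as the tests: last channel of the top-left 3x3 patch
    # of the first generated image, flattened to nine values.
    return images[0, :3, :3, -1].flatten()


def format_expected_slice(images: np.ndarray) -> str:
    # Print this once, then paste the output into `expected_slice = np.array([...])`.
    return ", ".join(str(round(float(x), 4)) for x in corner_slice(images))
```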
--- tests/models/test_ip_adapters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_ip_adapters.py b/tests/models/test_ip_adapters.py index 7f6b9a970e68..133ba1423fba 100644 --- a/tests/models/test_ip_adapters.py +++ b/tests/models/test_ip_adapters.py @@ -103,6 +103,6 @@ def test_image_to_image(self): slice = image_slice.tolist() print(", ".join([str(round(x, 4)) for x in slice])) - expected_slice = np.array([0.8047, 0.8774, 0.9248, 0.9155, 0.9814, 1.0, 0.9678, 1.0, 1.0]) + expected_slice = np.array([0.2307, 0.2341, 0.2305, 0.24, 0.2268, 0.25, 0.2322, 0.2588, 0.2935]) assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) From c904c630095effac564bfd1dd23243179aeb86a9 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 12:06:01 +0530 Subject: [PATCH 067/139] add: inpainting --- tests/models/test_ip_adapters.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tests/models/test_ip_adapters.py b/tests/models/test_ip_adapters.py index 133ba1423fba..971ffb1b74bf 100644 --- a/tests/models/test_ip_adapters.py +++ b/tests/models/test_ip_adapters.py @@ -20,7 +20,7 @@ import torch from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection -from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline +from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionPipeline from diffusers.utils import load_image from diffusers.utils.testing_utils import ( enable_full_determinism, @@ -69,6 +69,11 @@ def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False): image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/vermeer.jpg") ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/river.png") input_kwargs.update({"image": image, "ip_adapter_image": ip_image}) + elif for_inpainting: + image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/inpaint_image.png") + mask = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/mask.png") + ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/girl.png") + input_kwargs.update({"image": image, "mask_image": mask, "ip_adapter_image": ip_image}) return input_kwargs @@ -106,3 +111,22 @@ def test_image_to_image(self): expected_slice = np.array([0.2307, 0.2341, 0.2305, 0.24, 0.2268, 0.25, 0.2322, 0.2588, 0.2935]) assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) + + def test_inpainting(self): + StableDiffusionImg2ImgPipeline + image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") + pipeline = StableDiffusionImg2ImgPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, safety_checker=None, torch_dtype=self.dtype + ) + pipeline.to(torch_device) + pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") + + inputs = self.get_dummy_inputs(for_inpainting=True) + images = pipeline(**inputs).images + image_slice = images[0, :3, :3, -1].flatten() + slice = image_slice.tolist() + print(", ".join([str(round(x, 4)) for x in slice])) + + expected_slice = np.array([0.2307, 0.2341, 0.2305, 0.24, 0.2268, 0.25, 0.2322, 0.2588, 0.2935]) + + assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) From 908579750e28ed0c09df01852c60c85847af756b Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 12:07:37 +0530 
Subject: [PATCH 068/139] fix: assertion values for inpainting --- tests/models/test_ip_adapters.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/models/test_ip_adapters.py b/tests/models/test_ip_adapters.py index 971ffb1b74bf..ef4d4617a30d 100644 --- a/tests/models/test_ip_adapters.py +++ b/tests/models/test_ip_adapters.py @@ -105,8 +105,6 @@ def test_image_to_image(self): inputs = self.get_dummy_inputs(for_image_to_image=True) images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - slice = image_slice.tolist() - print(", ".join([str(round(x, 4)) for x in slice])) expected_slice = np.array([0.2307, 0.2341, 0.2305, 0.24, 0.2268, 0.25, 0.2322, 0.2588, 0.2935]) @@ -124,9 +122,7 @@ def test_inpainting(self): inputs = self.get_dummy_inputs(for_inpainting=True) images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - slice = image_slice.tolist() - print(", ".join([str(round(x, 4)) for x in slice])) - expected_slice = np.array([0.2307, 0.2341, 0.2305, 0.24, 0.2268, 0.25, 0.2322, 0.2588, 0.2935]) + expected_slice = np.array([0.3618, 0.3313, 0.2983, 0.3708, 0.345, 0.311, 0.3608, 0.343, 0.3335]) assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) From ab060c41fb69b29afc70ace786e920bbf10556e4 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 12:11:00 +0530 Subject: [PATCH 069/139] fix: inpainting tests --- tests/models/test_ip_adapters.py | 102 ++++++++++++++++++++++++++++++- 1 file changed, 99 insertions(+), 3 deletions(-) diff --git a/tests/models/test_ip_adapters.py b/tests/models/test_ip_adapters.py index ef4d4617a30d..1c3df8a834a0 100644 --- a/tests/models/test_ip_adapters.py +++ b/tests/models/test_ip_adapters.py @@ -20,7 +20,7 @@ import torch from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection -from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionPipeline +from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionPipeline, StableDiffusionInpaintPipeline from diffusers.utils import load_image from diffusers.utils.testing_utils import ( enable_full_determinism, @@ -111,9 +111,8 @@ def test_image_to_image(self): assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) def test_inpainting(self): - StableDiffusionImg2ImgPipeline image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") - pipeline = StableDiffusionImg2ImgPipeline.from_pretrained( + pipeline = StableDiffusionInpaintPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, safety_checker=None, torch_dtype=self.dtype ) pipeline.to(torch_device) @@ -122,7 +121,104 @@ def test_inpainting(self): inputs = self.get_dummy_inputs(for_inpainting=True) images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() + slice = image_slice.tolist() + print(", ".join([str(round(x, 4)) for x in slice])) expected_slice = np.array([0.3618, 0.3313, 0.2983, 0.3708, 0.345, 0.311, 0.3608, 0.343, 0.3335]) assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) + + +# @nightly +# @require_torch_gpu +# class IPAdapterSDXLIntegrationTests(unittest.TestCase): +# dtype = torch.float16 + +# def tearDown(self): +# super().tearDown() +# gc.collect() +# torch.cuda.empty_cache() + +# def get_image_encoder(self, repo_id, subfolder): +# image_encoder = CLIPVisionModelWithProjection.from_pretrained( +# repo_id, subfolder=subfolder, torch_dtype=self.dtype +# ).to(torch_device) +# return 
image_encoder + +# def get_image_processor(self, repo_id): +# image_processor = CLIPImageProcessor.from_pretrained(repo_id) +# return image_processor + +# def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False): +# image = load_image( +# "https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png" +# ) +# input_kwargs = { +# "prompt": "best quality, high quality", +# "negative_prompt": "monochrome, lowres, bad anatomy, worst quality, low quality", +# "num_inference_steps": 5, +# "generator": torch.Generator(device="cpu").manual_seed(33), +# "ip_adapter_image": image, +# "output_type": "np", +# } +# if for_image_to_image: +# image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/vermeer.jpg") +# ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/river.png") +# input_kwargs.update({"image": image, "ip_adapter_image": ip_image}) +# elif for_inpainting: +# image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/inpaint_image.png") +# mask = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/mask.png") +# ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/girl.png") +# input_kwargs.update({"image": image, "mask_image": mask, "ip_adapter_image": ip_image}) + +# return input_kwargs + +# def test_text_to_image_sdxl(self): +# image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") +# pipeline = StableDiffusionPipeline.from_pretrained( +# "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, safety_checker=None, torch_dtype=self.dtype +# ) +# pipeline.to(torch_device) +# pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") + +# inputs = self.get_dummy_inputs() +# images = pipeline(**inputs).images +# image_slice = images[0, :3, :3, -1].flatten() + +# expected_slice = np.array([0.8047, 0.8774, 0.9248, 0.9155, 0.9814, 1.0, 0.9678, 1.0, 1.0]) + +# assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) + +# def test_image_to_image_sdxl(self): +# StableDiffusionImg2ImgPipeline +# image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") +# pipeline = StableDiffusionImg2ImgPipeline.from_pretrained( +# "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, safety_checker=None, torch_dtype=self.dtype +# ) +# pipeline.to(torch_device) +# pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") + +# inputs = self.get_dummy_inputs(for_image_to_image=True) +# images = pipeline(**inputs).images +# image_slice = images[0, :3, :3, -1].flatten() + +# expected_slice = np.array([0.2307, 0.2341, 0.2305, 0.24, 0.2268, 0.25, 0.2322, 0.2588, 0.2935]) + +# assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) + +# def test_inpainting_sdxl(self): +# StableDiffusionImg2ImgPipeline +# image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") +# pipeline = StableDiffusionImg2ImgPipeline.from_pretrained( +# "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, safety_checker=None, torch_dtype=self.dtype +# ) +# pipeline.to(torch_device) +# pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") + +# inputs = self.get_dummy_inputs(for_inpainting=True) +# images = pipeline(**inputs).images +# 
image_slice = images[0, :3, :3, -1].flatten() + +# expected_slice = np.array([0.3618, 0.3313, 0.2983, 0.3708, 0.345, 0.311, 0.3608, 0.343, 0.3335]) + +# assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) From 0d7ef92ad66304b06e03c2a6767d8ad74417d54c Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 12:12:27 +0530 Subject: [PATCH 070/139] fix: more --- tests/models/test_ip_adapters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_ip_adapters.py b/tests/models/test_ip_adapters.py index 1c3df8a834a0..c18bf7690f7a 100644 --- a/tests/models/test_ip_adapters.py +++ b/tests/models/test_ip_adapters.py @@ -124,7 +124,7 @@ def test_inpainting(self): slice = image_slice.tolist() print(", ".join([str(round(x, 4)) for x in slice])) - expected_slice = np.array([0.3618, 0.3313, 0.2983, 0.3708, 0.345, 0.311, 0.3608, 0.343, 0.3335]) + expected_slice = np.array([0.2705, 0.2395, 0.2209, 0.2312, 0.2102, 0.2104, 0.2178, 0.2065, 0.1997]) assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) From b132f5007bab392564eafd9ff67f26ff25567881 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Nov 2023 06:45:00 +0000 Subject: [PATCH 071/139] fix auto test --- tests/pipelines/test_pipelines_auto.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/pipelines/test_pipelines_auto.py b/tests/pipelines/test_pipelines_auto.py index 1cd29565b8de..4bac90f2091e 100644 --- a/tests/pipelines/test_pipelines_auto.py +++ b/tests/pipelines/test_pipelines_auto.py @@ -116,7 +116,7 @@ def test_from_pipe_controlnet_text2img(self): assert pipe.__class__.__name__ == "StableDiffusionControlNetPipeline" assert "controlnet" in pipe.components - pipe = AutoPipelineForText2Image.from_pipe(pipe, controlnet=None) + pipe = AutoPipelineForText2Image.from_pipe(pipe, controlnet=None, image_encoder=None) assert pipe.__class__.__name__ == "StableDiffusionPipeline" assert "controlnet" not in pipe.components @@ -128,7 +128,7 @@ def test_from_pipe_controlnet_img2img(self): assert pipe.__class__.__name__ == "StableDiffusionControlNetImg2ImgPipeline" assert "controlnet" in pipe.components - pipe = AutoPipelineForImage2Image.from_pipe(pipe, controlnet=None) + pipe = AutoPipelineForImage2Image.from_pipe(pipe, controlnet=None, image_encoder=None) assert pipe.__class__.__name__ == "StableDiffusionImg2ImgPipeline" assert "controlnet" not in pipe.components @@ -140,7 +140,7 @@ def test_from_pipe_controlnet_inpaint(self): assert pipe.__class__.__name__ == "StableDiffusionControlNetInpaintPipeline" assert "controlnet" in pipe.components - pipe = AutoPipelineForInpainting.from_pipe(pipe, controlnet=None) + pipe = AutoPipelineForInpainting.from_pipe(pipe, controlnet=None, image_encoder=None) assert pipe.__class__.__name__ == "StableDiffusionInpaintPipeline" assert "controlnet" not in pipe.components @@ -152,7 +152,7 @@ def test_from_pipe_controlnet_new_task(self): assert pipe_control_img2img.__class__.__name__ == "StableDiffusionControlNetImg2ImgPipeline" assert "controlnet" in pipe_control_img2img.components - pipe_inpaint = AutoPipelineForInpainting.from_pipe(pipe_control_img2img, controlnet=None) + pipe_inpaint = AutoPipelineForInpainting.from_pipe(pipe_control_img2img, controlnet=None, image_encoder=None) assert pipe_inpaint.__class__.__name__ == "StableDiffusionInpaintPipeline" assert "controlnet" not in pipe_inpaint.components From 188f1d704ae161ecde3803df7cb6fb80752670da Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 12:23:45 
+0530 Subject: [PATCH 072/139] add: sdxl integration tests --- tests/models/test_ip_adapters.py | 198 +++++++++++++++---------------- 1 file changed, 99 insertions(+), 99 deletions(-) diff --git a/tests/models/test_ip_adapters.py b/tests/models/test_ip_adapters.py index c18bf7690f7a..d0dd167f113e 100644 --- a/tests/models/test_ip_adapters.py +++ b/tests/models/test_ip_adapters.py @@ -20,7 +20,14 @@ import torch from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection -from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionPipeline, StableDiffusionInpaintPipeline +from diffusers import ( + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, + StableDiffusionPipeline, + StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLInpaintPipeline, + StableDiffusionXLPipeline, +) from diffusers.utils import load_image from diffusers.utils.testing_utils import ( enable_full_determinism, @@ -33,9 +40,7 @@ enable_full_determinism() -@nightly -@require_torch_gpu -class IPAdapterSDIntegrationTests(unittest.TestCase): +class IPAdapterTestsMixin(unittest.TestCase): dtype = torch.float16 def tearDown(self): @@ -53,10 +58,13 @@ def get_image_processor(self, repo_id): image_processor = CLIPImageProcessor.from_pretrained(repo_id) return image_processor - def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False): + def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False, for_sdxl=False): image = load_image( "https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png" ) + if for_sdxl: + image = image.resize((1024, 1024)) + input_kwargs = { "prompt": "best quality, high quality", "negative_prompt": "monochrome, lowres, bad anatomy, worst quality, low quality", @@ -68,15 +76,31 @@ def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False): if for_image_to_image: image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/vermeer.jpg") ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/river.png") + + if for_sdxl: + image = image.resize((1024, 1024)) + ip_image = ip_image.resize((1024, 1024)) + input_kwargs.update({"image": image, "ip_adapter_image": ip_image}) + elif for_inpainting: image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/inpaint_image.png") mask = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/mask.png") ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/girl.png") + + if for_sdxl: + image = image.resize((1024, 1024)) + mask = mask.resize((1024, 1024)) + ip_image = ip_image.resize((1024, 1024)) + input_kwargs.update({"image": image, "mask_image": mask, "ip_adapter_image": ip_image}) return input_kwargs + +@nightly +@require_torch_gpu +class IPAdapterSDIntegrationTests(IPAdapterTestsMixin): def test_text_to_image(self): image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") pipeline = StableDiffusionPipeline.from_pretrained( @@ -121,104 +145,80 @@ def test_inpainting(self): inputs = self.get_dummy_inputs(for_inpainting=True) images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() + + expected_slice = np.array([0.2705, 0.2395, 0.2209, 0.2312, 0.2102, 0.2104, 0.2178, 0.2065, 0.1997]) + + assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) + + +@nightly +@require_torch_gpu +class 
IPAdapterSDXLIntegrationTests(IPAdapterTestsMixin): + def test_text_to_image_sdxl(self): + image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="sdxl_models/image_encoder") + feature_extractor = self.get_image_processor("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k") + + pipeline = StableDiffusionXLPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + image_encoder=image_encoder, + feature_extractor=feature_extractor, + torch_dtype=self.dtype, + ) + pipeline.to(torch_device) + pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin") + + inputs = self.get_dummy_inputs() + images = pipeline(**inputs).images + image_slice = images[0, :3, :3, -1].flatten() slice = image_slice.tolist() print(", ".join([str(round(x, 4)) for x in slice])) - expected_slice = np.array([0.2705, 0.2395, 0.2209, 0.2312, 0.2102, 0.2104, 0.2178, 0.2065, 0.1997]) + expected_slice = np.array([0.8047, 0.8774, 0.9248, 0.9155, 0.9814, 1.0, 0.9678, 1.0, 1.0]) + + assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) + + def test_image_to_image_sdxl(self): + image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="sdxl_models/image_encoder") + feature_extractor = self.get_image_processor("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k") + + pipeline = StableDiffusionXLImg2ImgPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + image_encoder=image_encoder, + feature_extractor=feature_extractor, + torch_dtype=self.dtype, + ) + pipeline.to(torch_device) + pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin") + + inputs = self.get_dummy_inputs(for_image_to_image=True) + images = pipeline(**inputs).images + image_slice = images[0, :3, :3, -1].flatten() + slice = image_slice.tolist() + print(", ".join([str(round(x, 4)) for x in slice])) + + expected_slice = np.array([0.2307, 0.2341, 0.2305, 0.24, 0.2268, 0.25, 0.2322, 0.2588, 0.2935]) assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) + def test_inpainting_sdxl(self): + image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="sdxl_models/image_encoder") + feature_extractor = self.get_image_processor("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k") -# @nightly -# @require_torch_gpu -# class IPAdapterSDXLIntegrationTests(unittest.TestCase): -# dtype = torch.float16 - -# def tearDown(self): -# super().tearDown() -# gc.collect() -# torch.cuda.empty_cache() - -# def get_image_encoder(self, repo_id, subfolder): -# image_encoder = CLIPVisionModelWithProjection.from_pretrained( -# repo_id, subfolder=subfolder, torch_dtype=self.dtype -# ).to(torch_device) -# return image_encoder - -# def get_image_processor(self, repo_id): -# image_processor = CLIPImageProcessor.from_pretrained(repo_id) -# return image_processor - -# def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False): -# image = load_image( -# "https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png" -# ) -# input_kwargs = { -# "prompt": "best quality, high quality", -# "negative_prompt": "monochrome, lowres, bad anatomy, worst quality, low quality", -# "num_inference_steps": 5, -# "generator": torch.Generator(device="cpu").manual_seed(33), -# "ip_adapter_image": image, -# "output_type": "np", -# } -# if for_image_to_image: -# image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/vermeer.jpg") -# ip_image = 
load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/river.png") -# input_kwargs.update({"image": image, "ip_adapter_image": ip_image}) -# elif for_inpainting: -# image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/inpaint_image.png") -# mask = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/mask.png") -# ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/girl.png") -# input_kwargs.update({"image": image, "mask_image": mask, "ip_adapter_image": ip_image}) - -# return input_kwargs - -# def test_text_to_image_sdxl(self): -# image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") -# pipeline = StableDiffusionPipeline.from_pretrained( -# "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, safety_checker=None, torch_dtype=self.dtype -# ) -# pipeline.to(torch_device) -# pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") - -# inputs = self.get_dummy_inputs() -# images = pipeline(**inputs).images -# image_slice = images[0, :3, :3, -1].flatten() - -# expected_slice = np.array([0.8047, 0.8774, 0.9248, 0.9155, 0.9814, 1.0, 0.9678, 1.0, 1.0]) - -# assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) - -# def test_image_to_image_sdxl(self): -# StableDiffusionImg2ImgPipeline -# image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") -# pipeline = StableDiffusionImg2ImgPipeline.from_pretrained( -# "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, safety_checker=None, torch_dtype=self.dtype -# ) -# pipeline.to(torch_device) -# pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") - -# inputs = self.get_dummy_inputs(for_image_to_image=True) -# images = pipeline(**inputs).images -# image_slice = images[0, :3, :3, -1].flatten() - -# expected_slice = np.array([0.2307, 0.2341, 0.2305, 0.24, 0.2268, 0.25, 0.2322, 0.2588, 0.2935]) - -# assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) - -# def test_inpainting_sdxl(self): -# StableDiffusionImg2ImgPipeline -# image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") -# pipeline = StableDiffusionImg2ImgPipeline.from_pretrained( -# "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, safety_checker=None, torch_dtype=self.dtype -# ) -# pipeline.to(torch_device) -# pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") - -# inputs = self.get_dummy_inputs(for_inpainting=True) -# images = pipeline(**inputs).images -# image_slice = images[0, :3, :3, -1].flatten() - -# expected_slice = np.array([0.3618, 0.3313, 0.2983, 0.3708, 0.345, 0.311, 0.3608, 0.343, 0.3335]) - -# assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) + pipeline = StableDiffusionXLInpaintPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + image_encoder=image_encoder, + feature_extractor=feature_extractor, + torch_dtype=self.dtype, + ) + pipeline.to(torch_device) + pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin") + + inputs = self.get_dummy_inputs(for_inpainting=True) + images = pipeline(**inputs).images + image_slice = images[0, :3, :3, -1].flatten() + slice = image_slice.tolist() + print(", ".join([str(round(x, 4)) for x in slice])) + + expected_slice = 
np.array([0.3618, 0.3313, 0.2983, 0.3708, 0.345, 0.311, 0.3608, 0.343, 0.3335]) + + assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) From 36e79035b0b1fbce33ae5509ba9fc6b38e552472 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 12:41:08 +0530 Subject: [PATCH 073/139] fix: assertion values for sdxl. --- tests/models/test_ip_adapters.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/models/test_ip_adapters.py b/tests/models/test_ip_adapters.py index d0dd167f113e..6a79a332f772 100644 --- a/tests/models/test_ip_adapters.py +++ b/tests/models/test_ip_adapters.py @@ -170,10 +170,8 @@ def test_text_to_image_sdxl(self): inputs = self.get_dummy_inputs() images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - slice = image_slice.tolist() - print(", ".join([str(round(x, 4)) for x in slice])) - expected_slice = np.array([0.8047, 0.8774, 0.9248, 0.9155, 0.9814, 1.0, 0.9678, 1.0, 1.0]) + expected_slice = np.array([0.0968, 0.0959, 0.0852, 0.0912, 0.0948, 0.093, 0.0893, 0.0932, 0.0923]) assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) @@ -193,10 +191,8 @@ def test_image_to_image_sdxl(self): inputs = self.get_dummy_inputs(for_image_to_image=True) images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - slice = image_slice.tolist() - print(", ".join([str(round(x, 4)) for x in slice])) - expected_slice = np.array([0.2307, 0.2341, 0.2305, 0.24, 0.2268, 0.25, 0.2322, 0.2588, 0.2935]) + expected_slice = np.array([0.0653, 0.0704, 0.0725, 0.0741, 0.0702, 0.0647, 0.0782, 0.0799, 0.0752]) assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) From 4f34e08673765e4860afac704d7ba24ef581ef45 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Nov 2023 07:31:29 +0000 Subject: [PATCH 074/139] fix last one --- .../stable_diffusion_2/test_stable_diffusion_inpaint.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py index 92e8857610ea..41b9f83914a6 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py @@ -108,6 +108,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components From 756534b3d3c6cec9872b360baf73af36f214d826 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 13:07:21 +0530 Subject: [PATCH 075/139] fix: assertion for inpainting --- tests/models/test_ip_adapters.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/models/test_ip_adapters.py b/tests/models/test_ip_adapters.py index 6a79a332f772..701b01120d14 100644 --- a/tests/models/test_ip_adapters.py +++ b/tests/models/test_ip_adapters.py @@ -212,9 +212,8 @@ def test_inpainting_sdxl(self): inputs = self.get_dummy_inputs(for_inpainting=True) images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - slice = image_slice.tolist() - print(", ".join([str(round(x, 4)) for x in slice])) + image_slice.tolist() - expected_slice = np.array([0.3618, 0.3313, 0.2983, 0.3708, 0.345, 0.311, 0.3608, 0.343, 0.3335]) + expected_slice = np.array([0.1418, 0.1493, 0.1428, 0.146, 0.1491, 0.1501, 0.1473, 0.1501, 0.1516]) assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) From a17655b18cf2a6e02967d31d3b2df780cbbd4a3c Mon 
Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Nov 2023 08:16:18 +0000 Subject: [PATCH 076/139] fix tiny encoder --- .../test_stable_diffusion_xl_img2img.py | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py index e4bbb00738cc..16b82a1238bc 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py @@ -101,27 +101,6 @@ def get_dummy_components(self, skip_first_text_encoder=False): latent_channels=4, sample_size=128, ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=32, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config) - tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - torch.manual_seed(0) image_encoder_config = CLIPVisionConfig( hidden_size=32, @@ -147,6 +126,27 @@ def get_dummy_components(self, skip_first_text_encoder=False): size=224, ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + # SD2-specific config below + hidden_act="gelu", + projection_dim=32, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config) + tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + components = { "unet": unet, "scheduler": scheduler, From 4d089303708a560b06f0d22669f5c1a59c13531a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Nov 2023 08:39:51 +0000 Subject: [PATCH 077/139] make quality --- .../pipeline_alt_diffusion_img2img.py | 38 ++++++++++++++++--- .../versatile_diffusion/modeling_text_unet.py | 11 +++++- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index ea4a3128dee3..d1cca2a5c582 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -19,11 +19,11 @@ import PIL.Image import torch from packaging import version -from transformers import CLIPImageProcessor, XLMRobertaTokenizer +from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection, XLMRobertaTokenizer from ...configuration_utils import FrozenDict from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from 
...schedulers import KarrasDiffusionSchedulers @@ -111,7 +111,7 @@ def preprocess(image): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker class AltDiffusionImg2ImgPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin ): r""" Pipeline for text-guided image-to-image generation using Alt Diffusion. @@ -124,6 +124,7 @@ class AltDiffusionImg2ImgPipeline( - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`]): @@ -144,9 +145,8 @@ class AltDiffusionImg2ImgPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ - model_cpu_offload_seq = "text_encoder->unet->vae" - _optional_components = ["safety_checker", "feature_extractor"] + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] @@ -159,6 +159,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection, requires_safety_checker: bool = True, ): super().__init__() @@ -235,6 +236,7 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, + image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) @@ -456,6 +458,19 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds + def encode_image(self, image, device, num_images_per_prompt): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + uncond_image_embeds = torch.zeros_like(image_embeds) + return image_embeds, uncond_image_embeds + def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: has_nsfw_concept = None @@ -684,6 +699,7 @@ def __call__( generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -733,6 +749,7 @@ def __call__( negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). 
If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -827,6 +844,11 @@ def __call__( if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + # 4. Preprocess image image = self.image_processor.preprocess(image) @@ -858,12 +880,18 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + if ip_adapter_image is not None: + added_cond_kwargs = {"image_embeds": image_embeds} + else: + added_cond_kwargs = None + # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 60ea3d814b3a..cab924de4c38 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -1231,14 +1231,21 @@ def forward( image_embeds = added_cond_kwargs.get("image_embeds") encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds) elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": - # Kandinsky 2.2 - style if "image_embeds" not in added_cond_kwargs: raise ValueError( f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires" " the keyword argument `image_embeds` to be passed in `added_conditions`" ) image_embeds = added_cond_kwargs.get("image_embeds") - encoder_hidden_states = self.encoder_hid_proj(image_embeds) + image_embeds = self.encoder_hid_proj(image_embeds) + # IP-adapter + if any("to_k_ip" in k for k in self.state_dict().keys()): + image_embeds = image_embeds.to(encoder_hidden_states.dtype) + encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1) + else: + # Kandinsky 2.2 - style + encoder_hidden_states = image_embeds + # 2. 
pre-process sample = self.conv_in(sample) From cb451b0421e1e571666d7b8c62f5d7b29b562df2 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Nov 2023 08:56:52 +0000 Subject: [PATCH 078/139] fix --- .../en/using-diffusers/loading_adapters.md | 20 +++++++++++++++---- .../test_alt_diffusion_img2img.py | 2 ++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index 0d4ed7d72f66..d6966ae84713 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -312,14 +312,20 @@ image [IP-Adapter](https://ip-adapter.github.io/) is an effective and lightweight adapter to achieve image prompt capability for the pre-trained text-to-image diffusion models. It is now available to use with most of our Stable Diffusion and Stable Diffusion XL pipelines. You can also use the IP-Adapter with other custom models fine-tuned from the same base model, as well as ControlNet and T2I adapters. Moreover, the image prompt can also work well with the text prompt to accomplish multimodal image generation. -Let's look at an example where we use IP-Adapter with the Stable Diffusion text-to-image pipeline. +Let's look at an example where we use IP-Adapter with the Stable Diffusion text-to-image pipeline. ``` py -from diffusers import AutoPipelineForText2Image +from diffusers import AutoPipelineForText2Image, CLIPVisionModelWithProjection import torch from diffusers.utils import load_image -pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda") +image_encoder = CLIPVisionModelWithProjection.from_pretrained( + "h94/IP-Adapter", + subfolder="models/image_encoder", + torch_dtype=torch.float16, +).to("cuda") + +pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, torch_dtype=torch.float16).to("cuda") ``` Now you can load the IP-Adapter with [`~loaders.IPAdapterMixin.load_ip_adapter`] method. 
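For reference, the text-to-image flow that this section builds up to can be completed along the lines of the sketch below. It is assembled from the SD 1.5 integration tests elsewhere in this patch series, so the checkpoint name, subfolder, weight file, image URL, and prompt strings come from those tests rather than from this diff; treat it as a sketch that continues the `pipeline` created above, not as part of the patch.

```py
# Sketch only: load the adapter weights, then pass an image prompt alongside the text prompt.
# `pipeline`, `load_image`, and `torch` come from the example above.
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")

ip_image = load_image(
    "https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png"
)
image = pipeline(
    prompt="best quality, high quality",
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    ip_adapter_image=ip_image,
    generator=torch.Generator(device="cpu").manual_seed(33),
).images[0]
```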
@@ -361,7 +367,13 @@ from diffusers import AutoPipelineForImage2Image import torch from diffusers.utils import load_image -pipeline = AutoPipelineForImage2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda") +image_encoder = CLIPVisionModelWithProjection.from_pretrained( + "h94/IP-Adapter", + subfolder="models/image_encoder", + torch_dtype=torch.float16, +).to("cuda") + +pipeline = AutoPipelineForImage2Image.from_pretrained("runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, torch_dtype=torch.float16).to("cuda") image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/vermeer.jpg") ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/river.png") diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py index 57001f7bea52..3fd1a90172ca 100644 --- a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py +++ b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py @@ -141,6 +141,7 @@ def test_stable_diffusion_img2img_default_case(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, + image_encoder=None, ) alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=True) alt_pipe = alt_pipe.to(device) @@ -205,6 +206,7 @@ def test_stable_diffusion_img2img_fp16(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, + image_encoder=None, ) alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False) alt_pipe = alt_pipe.to(torch_device) From eed9900e449bb0306376c6f4c835f1726ec67df1 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 15:34:25 +0530 Subject: [PATCH 079/139] add: fast test. 
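The fast test added in this commit builds an IP-Adapter checkpoint in memory and feeds it to `load_ip_adapter`. For orientation, the dictionary layout it assembles is sketched below; the key names are inferred from the test's helper functions further down, and the tensors are placeholders rather than real weights.

```py
# Illustrative layout only; the real tensors come from ImageProjection and the
# per-layer IP-Adapter attention processors constructed in the test below.
ip_adapter_state_dict = {
    "image_proj": {
        # ImageProjection weights, with "image_embeds.*" renamed to "proj.*"
        "proj.weight": ...,
        "proj.bias": ...,
        "norm.weight": ...,
        "norm.bias": ...,
    },
    "ip_adapter": {
        # to_k_ip / to_v_ip weights of each cross-attention processor,
        # keyed by odd integer ids (1, 3, 5, ...)
        "1.to_k_ip.weight": ...,
        "1.to_v_ip.weight": ...,
    },
}
```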
--- tests/models/test_ip_adapters.py | 204 ++++++++++++++++++++++++++++++- 1 file changed, 200 insertions(+), 4 deletions(-) diff --git a/tests/models/test_ip_adapters.py b/tests/models/test_ip_adapters.py index 701b01120d14..d40a4653f307 100644 --- a/tests/models/test_ip_adapters.py +++ b/tests/models/test_ip_adapters.py @@ -18,16 +18,34 @@ import numpy as np import torch -from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection +import torch.nn.functional as F +from transformers import ( + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) from diffusers import ( + AutoencoderKL, + DDIMScheduler, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, StableDiffusionPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline, StableDiffusionXLPipeline, + UNet2DConditionModel, +) +from diffusers.models.attention_processor import ( + AttnProcessor, + AttnProcessor2_0, + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, ) +from diffusers.models.embeddings import ImageProjection from diffusers.utils import load_image from diffusers.utils.testing_utils import ( enable_full_determinism, @@ -40,7 +58,185 @@ enable_full_determinism() -class IPAdapterTestsMixin(unittest.TestCase): +class IPAdapterFastTests(unittest.TestCase): + hidden_dim = 32 + num_image_text_embeds = 4 + + def get_dummy_components(self): + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=(4, 8), + layers_per_block=1, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=self.hidden_dim, + norm_num_groups=2, + ) + scheduler = DDIMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) + + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[4, 8], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + norm_num_groups=2, + ) + + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=self.hidden_dim, + intermediate_size=64, + layer_norm_eps=1e-05, + num_attention_heads=8, + num_hidden_layers=3, + pad_token_id=1, + vocab_size=1000, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + torch.manual_seed(0) + image_encoder_config = CLIPVisionConfig( + hidden_size=self.hidden_dim, + projection_dim=self.hidden_dim, + num_hidden_layers=5, + num_attention_heads=4, + image_size=32, + intermediate_size=37, + patch_size=1, + ) + image_encoder = CLIPVisionModelWithProjection(image_encoder_config) + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "safety_checker": None, + "feature_extractor": None, + "image_encoder": image_encoder, + } + return components + + def get_dummy_inputs(self, device, seed=0, with_image=False): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 6.0, + "output_type": 
"np", + } + if with_image: + inputs.update({"ip_adapter_image": torch.randn(1, 3, 32, 32, generator=generator)}) + return inputs + + def get_attn_procs_for_ip_adapter(self, unet): + # Cross-attention modules. + attn_procs = {} + for name in unet.attn_processors.keys(): + cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim + if name.startswith("mid_block"): + hidden_size = unet.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = unet.config.block_out_channels[block_id] + if cross_attention_dim is None: + attn_processor_class = ( + AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor + ) + attn_procs[name] = attn_processor_class() + else: + attn_processor_class = ( + IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor + ) + attn_procs[name] = attn_processor_class( + hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0 + ).to(dtype=unet.dtype, device=unet.device) + return attn_procs + + def get_ip_adapter_state_dict(self, unet): + # Image projection module. + image_projection = ImageProjection( + cross_attention_dim=self.hidden_dim, image_embed_dim=self.hidden_dim, num_image_text_embeds=4 + ) + + # Attention modules. + attn_procs = self.get_attn_procs_for_ip_adapter(unet) + + # Rename the keys. + cross_attention_params = {} + key_id = 1 + for key, value in attn_procs.items(): + if isinstance(attn_procs[key], torch.nn.Module): + current_sd = attn_procs[key].state_dict() + current_sd = {f"{key_id}.{k}": v for k, v in current_sd.items()} + cross_attention_params.update(current_sd) + key_id += 2 + + # Make it compatible. + image_projection_sd = image_projection.state_dict() + new_image_projection_sd = {} + for k in image_projection_sd: + if "image_embeds" in k: + new_k = k.replace("image_embeds", "proj") + else: + new_k = k + new_image_projection_sd.update({new_k: image_projection_sd[k]}) + + # Final. 
+ final_state_dict = {} + final_state_dict.update({"image_proj": new_image_projection_sd, "ip_adapter": cross_attention_params}) + return final_state_dict + + def test_inference_fast(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs) + image = output.images + + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + + ip_adapter_state_dict = self.get_ip_adapter_state_dict(components["unet"]) + sd_pipe.load_ip_adapter(ip_adapter_state_dict) + inputs = self.get_dummy_inputs(device, with_image=True) + output_ip_adapter = sd_pipe(**inputs).images + + assert output_ip_adapter.shape == (1, 64, 64, 3) + + assert not np.allclose(image_slice, output_ip_adapter[0, -3:, -3:, -1], atol=1e-4, rtol=1e-4) + + +class IPAdapterNightlyTestsMixin(unittest.TestCase): dtype = torch.float16 def tearDown(self): @@ -100,7 +296,7 @@ def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False, for_s @nightly @require_torch_gpu -class IPAdapterSDIntegrationTests(IPAdapterTestsMixin): +class IPAdapterSDIntegrationTests(IPAdapterNightlyTestsMixin): def test_text_to_image(self): image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") pipeline = StableDiffusionPipeline.from_pretrained( @@ -153,7 +349,7 @@ def test_inpainting(self): @nightly @require_torch_gpu -class IPAdapterSDXLIntegrationTests(IPAdapterTestsMixin): +class IPAdapterSDXLIntegrationTests(IPAdapterNightlyTestsMixin): def test_text_to_image_sdxl(self): image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="sdxl_models/image_encoder") feature_extractor = self.get_image_processor("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k") From 82a7e4de5fbd02b4a153f0f8e23d7287abde25db Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 16:08:23 +0530 Subject: [PATCH 080/139] add sdxl docs --- .../en/using-diffusers/loading_adapters.md | 54 ++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index d6966ae84713..ac3f841b9246 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -312,6 +312,8 @@ image [IP-Adapter](https://ip-adapter.github.io/) is an effective and lightweight adapter to achieve image prompt capability for the pre-trained text-to-image diffusion models. It is now available to use with most of our Stable Diffusion and Stable Diffusion XL pipelines. You can also use the IP-Adapter with other custom models fine-tuned from the same base model, as well as ControlNet and T2I adapters. Moreover, the image prompt can also work well with the text prompt to accomplish multimodal image generation. +You can find the officially available IP-Adapter checkpoints in [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter). + Let's look at an example where we use IP-Adapter with the Stable Diffusion text-to-image pipeline. 
``` py @@ -389,4 +391,54 @@ images = pipeline(     strength=0.6, ).images images[0] -``` \ No newline at end of file +``` + +IP-Adapters can be used with [Stable Diffusion XL](../api/pipelines/stable_diffusion/stable_diffusion_xl.md) (SDXL) for text-to-image, image-to-image, and inpainting pipelines. Below is an example for SDXL text-to-image. + +```python +from diffusers import AutoPipelineForText2Image +from diffusers.utils import load_image +from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor +import torch + +image_encoder = CLIPVisionModelWithProjection.from_pretrained( + "h94/IP-Adapter", + subfolder="sdxl_models/image_encoder", + torch_dtype=torch.float16, +).to("cuda") +feature_extractor = CLIPImageProcessor.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k") + +pipeline = AutoPipelineForText2Image.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + image_encoder=image_encoder, + feature_extractor=feature_extractor, + torch_dtype=torch.float16 +).to("cuda") + +image = load_image("https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/watercolor_painting.jpeg") + +pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin") + +generator = torch.Generator(device="cpu").manual_seed(33) +image = pipeline( + prompt="best quality, high quality", + ip_adapter_image=image, + negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", + num_inference_steps=25, + generator=generator, +).images[0] +image.save("sdxl_t2i.png") +``` + +
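The same SDXL adapter checkpoint also drives the image-to-image and inpainting variants. The sketch below mirrors the SDXL integration tests added earlier in this series; it reuses `image_encoder`, `feature_extractor`, `load_image`, and `torch` from the example above, and the input images (resized to 1024x1024, as in the tests) are illustrative only.

```python
from diffusers import AutoPipelineForImage2Image

# Sketch: reuse the SDXL IP-Adapter for image-to-image (assumptions noted above).
pipeline = AutoPipelineForImage2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    image_encoder=image_encoder,
    feature_extractor=feature_extractor,
    torch_dtype=torch.float16,
).to("cuda")
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")

init_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/vermeer.jpg").resize((1024, 1024))
ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/river.png").resize((1024, 1024))

image = pipeline(
    prompt="best quality, high quality",
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    image=init_image,
    ip_adapter_image=ip_image,
).images[0]
```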
+[Side-by-side comparison table followed here in the original docs, with columns "Input Image" and "Adapted Image"; the table/image markup was not preserved.]
\ No newline at end of file From 5c179b95a7ffa6f936c5b9f82f02b508f252cce1 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 9 Nov 2023 12:04:53 +0000 Subject: [PATCH 081/139] uP --- .../stable_diffusion/pipeline_stable_diffusion.py | 8 +++----- .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 8 +++----- .../stable_diffusion/pipeline_stable_diffusion_inpaint.py | 8 +++----- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index e4bcc3a66bcc..fac882d1836f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -816,6 +816,9 @@ def __call__( # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 6.5 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + # 7. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order self._num_timesteps = len(timesteps) @@ -825,11 +828,6 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - if ip_adapter_image is not None: - added_cond_kwargs = {"image_embeds": image_embeds} - else: - added_cond_kwargs = None - # predict the noise residual noise_pred = self.unet( latent_model_input, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 5d2f5a2840e5..6e600442e82a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -865,6 +865,9 @@ def __call__( # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 6.5 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + # 8. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order self._num_timesteps = len(timesteps) @@ -874,11 +877,6 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - if ip_adapter_image is not None: - added_cond_kwargs = {"image_embeds": image_embeds} - else: - added_cond_kwargs = None - # predict the noise residual noise_pred = self.unet( latent_model_input, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 04f886e55bec..5ba36951736d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -1111,6 +1111,9 @@ def __call__( # 9. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 9.5 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + # 10. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order self._num_timesteps = len(timesteps) @@ -1125,11 +1128,6 @@ def __call__( if num_channels_unet == 9: latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) - if ip_adapter_image is not None: - added_cond_kwargs = {"image_embeds": image_embeds} - else: - added_cond_kwargs = None - # predict the noise residual noise_pred = self.unet( latent_model_input, From eda593bf4424c6a3f74b5ec7de63ad318f5bdccd Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 9 Nov 2023 13:09:39 +0100 Subject: [PATCH 082/139] lcm add tests --- .../alt_diffusion/pipeline_alt_diffusion_img2img.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index d1cca2a5c582..039b5c3f1dd7 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -145,6 +145,7 @@ class AltDiffusionImg2ImgPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] @@ -871,6 +872,9 @@ def __call__( # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 6.5 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + # 8. 
Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order self._num_timesteps = len(timesteps) @@ -880,11 +884,6 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - if ip_adapter_image is not None: - added_cond_kwargs = {"image_embeds": image_embeds} - else: - added_cond_kwargs = None - # predict the noise residual noise_pred = self.unet( latent_model_input, From 5c838e428b3c5c60becaf05b0482e16932b419bc Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 9 Nov 2023 13:10:54 +0100 Subject: [PATCH 083/139] Add co-author Co-authored-by: Okotaku From 7183b1514588282cf6c7b2f59af4ef65257e2879 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 9 Nov 2023 13:26:17 +0100 Subject: [PATCH 084/139] lcm add tests --- .../alt_diffusion/pipeline_alt_diffusion.py | 39 +++++++++++++++++-- .../pipeline_stable_diffusion_xl.py | 2 +- .../pipeline_stable_diffusion_xl_img2img.py | 2 +- .../pipeline_stable_diffusion_xl_inpaint.py | 2 +- 4 files changed, 38 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index a73dc22a146c..d55bb853a390 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -17,16 +17,17 @@ import torch from packaging import version -from transformers import CLIPImageProcessor, XLMRobertaTokenizer +from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection, XLMRobertaTokenizer from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( USE_PEFT_BACKEND, + PipelineImageInput, deprecate, logging, replace_example_docstring, @@ -74,7 +75,9 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker -class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class AltDiffusionPipeline( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin +): r""" Pipeline for text-to-image generation using Alt Diffusion. 
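In this commit the Alt Diffusion pipelines gain the same IP-Adapter plumbing as the Stable Diffusion pipelines patched earlier: encode the image prompt, concatenate negative/positive embeds under classifier-free guidance, and hand the result to the UNet through `added_cond_kwargs`. The hunks below inline that logic directly in `__call__`; the hypothetical helper here only condenses it for readability and is not part of the patch.

```py
import torch


def build_ip_adapter_cond_kwargs(pipe, ip_adapter_image, device, num_images_per_prompt=1):
    """Condensed restatement of the IP-Adapter conditioning added to __call__ below.

    Hypothetical helper for illustration; the patch inlines this logic rather than
    factoring it out.
    """
    if ip_adapter_image is None:
        return None
    image_embeds, negative_image_embeds = pipe.encode_image(ip_adapter_image, device, num_images_per_prompt)
    if pipe.do_classifier_free_guidance:
        # Unconditional (zero) image embeds first, so CFG can split the batch as usual.
        image_embeds = torch.cat([negative_image_embeds, image_embeds])
    # Later consumed as: pipe.unet(..., added_cond_kwargs={"image_embeds": image_embeds})
    return {"image_embeds": image_embeds}
```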
@@ -86,6 +89,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`]): @@ -108,7 +112,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL """ model_cpu_offload_seq = "text_encoder->unet->vae" - _optional_components = ["safety_checker", "feature_extractor"] + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] @@ -121,6 +125,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection, requires_safety_checker: bool = True, ): super().__init__() @@ -197,6 +202,7 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, + image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) @@ -447,6 +453,19 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds + def encode_image(self, image, device, num_images_per_prompt): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + uncond_image_embeds = torch.zeros_like(image_embeds) + return image_embeds, uncond_image_embeds + def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: has_nsfw_concept = None @@ -631,6 +650,7 @@ def __call__( latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -677,6 +697,7 @@ def __call__( negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -778,12 +799,18 @@ def __call__( lora_scale=lora_scale, clip_skip=self.clip_skip, ) + # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps = self.scheduler.timesteps @@ -804,6 +831,9 @@ def __call__( # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 6.5 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + # 7. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order self._num_timesteps = len(timesteps) @@ -819,6 +849,7 @@ def __call__( t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 524b7b3d8ffb..4d903a47ab5a 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -477,7 +477,7 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.encode_image + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image def encode_image(self, image, device, num_images_per_prompt): dtype = next(self.image_encoder.parameters()).dtype diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index a50bbd14d8d7..c426e2c16baf 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -690,7 +690,7 @@ def prepare_latents( return latents - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.encode_image + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image def encode_image(self, image, device, num_images_per_prompt): dtype = next(self.image_encoder.parameters()).dtype diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 5db74cb089b0..43e04e6d87e5 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -411,7 +411,7 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.encode_image + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image 
def encode_image(self, image, device, num_images_per_prompt): dtype = next(self.image_encoder.parameters()).dtype From 0b15eb16d53f624aad6ef165652e2560180369aa Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 9 Nov 2023 12:29:34 +0000 Subject: [PATCH 085/139] uP --- docs/source/en/using-diffusers/loading_adapters.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index ac3f841b9246..c94badf22e01 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -314,6 +314,8 @@ image You can find the officially available IP-Adapter checkpoints in [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter). +The pipeline was contributed by [okotaku](https://github.com/okotaku). + Let's look at an example where we use IP-Adapter with the Stable Diffusion text-to-image pipeline. ``` py @@ -441,4 +443,4 @@ image.save("sdxl_t2i.png") Adapted Image - \ No newline at end of file + From eec02db603e2a0c288605683be7e434e1fd4ac67 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 18:09:13 +0530 Subject: [PATCH 086/139] fix --- .../pipelines/alt_diffusion/pipeline_alt_diffusion.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index bad132845ca1..1ed6f48d109b 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -20,14 +20,13 @@ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection, XLMRobertaTokenizer from ...configuration_utils import FrozenDict -from ...image_processor import VaeImageProcessor +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( USE_PEFT_BACKEND, - PipelineImageInput, deprecate, logging, replace_example_docstring, From 819ed61a90f46362bca42fc5594e9c9471e4675c Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 18:10:49 +0530 Subject: [PATCH 087/139] be explicit about @okotaku --- docs/source/en/using-diffusers/loading_adapters.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index c94badf22e01..807be8b0dc10 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -18,6 +18,8 @@ There are several [training](../training/overview) techniques for personalizing This guide will show you how to load DreamBooth, textual inversion, and LoRA weights. +Thanks to [okotaku](https://github.com/okotaku) who contributed this features with some guidance from [Yiyi](https://github.com/yiyixuxu) and [Sayak](https://github.com/sayakpaul). 
+ Feel free to browse the [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer), [LoRA the Explorer](https://huggingface.co/spaces/multimodalart/LoraTheExplorer), and the [Diffusers Models Gallery](https://huggingface.co/spaces/huggingface-projects/diffusers-gallery) for checkpoints and embeddings to use. From 6e52db736d39b1b1efe785c83469521b05e8a9a1 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 9 Nov 2023 14:21:12 +0100 Subject: [PATCH 088/139] Apply suggestions from code review --- .../pipelines/alt_diffusion/pipeline_alt_diffusion.py | 2 +- .../pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py | 2 +- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +- .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 2 +- .../stable_diffusion/pipeline_stable_diffusion_inpaint.py | 2 +- .../stable_diffusion_xl/pipeline_stable_diffusion_xl.py | 4 ++-- .../pipeline_stable_diffusion_xl_img2img.py | 4 ++-- .../pipeline_stable_diffusion_xl_inpaint.py | 4 ++-- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index 1ed6f48d109b..a16096afc39f 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -124,7 +124,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection, + image_encoder: CLIPVisionModelWithProjection = None, requires_safety_checker: bool = True, ): super().__init__() diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 38d2a06ee1bb..d870568ec151 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -160,7 +160,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection, + image_encoder: CLIPVisionModelWithProjection = None, requires_safety_checker: bool = True, ): super().__init__() diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 12d769c4288e..75fc1b1b3f0e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -119,7 +119,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection, + image_encoder: CLIPVisionModelWithProjection = None, requires_safety_checker: bool = True, ): super().__init__() diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 0a5eaa9647da..81f28538058d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -154,7 +154,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, 
safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection, + image_encoder: CLIPVisionModelWithProjection = None, requires_safety_checker: bool = True, ): super().__init__() diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 79bfe8d552d4..9f844d96b542 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -217,7 +217,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection, + image_encoder: CLIPVisionModelWithProjection = None, requires_safety_checker: bool = True, ): super().__init__() diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 4d903a47ab5a..2e4d827f7e7f 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -179,8 +179,8 @@ def __init__( tokenizer_2: CLIPTokenizer, unet: UNet2DConditionModel, scheduler: KarrasDiffusionSchedulers, - image_encoder: CLIPVisionModelWithProjection, - feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None force_zeros_for_empty_prompt: bool = True, add_watermarker: Optional[bool] = None, ): diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index c426e2c16baf..f6cfa357b984 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -195,8 +195,8 @@ def __init__( tokenizer: CLIPTokenizer, tokenizer_2: CLIPTokenizer, unet: UNet2DConditionModel, - image_encoder: CLIPVisionModelWithProjection, - feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None scheduler: KarrasDiffusionSchedulers, requires_aesthetics_score: bool = False, force_zeros_for_empty_prompt: bool = True, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 43e04e6d87e5..7e10611344a6 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -343,8 +343,8 @@ def __init__( tokenizer: CLIPTokenizer, tokenizer_2: CLIPTokenizer, unet: UNet2DConditionModel, - image_encoder: CLIPVisionModelWithProjection, - feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None scheduler: KarrasDiffusionSchedulers, requires_aesthetics_score: bool = False, force_zeros_for_empty_prompt: bool = True, From d9d767211564c938d251c0a70c5ae05d336bea1b Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Nov 2023 18:54:22 +0530 Subject: [PATCH 089/139] remove mention --- 
docs/source/en/using-diffusers/loading_adapters.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index 807be8b0dc10..476515146b0c 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -18,8 +18,6 @@ There are several [training](../training/overview) techniques for personalizing This guide will show you how to load DreamBooth, textual inversion, and LoRA weights. -Thanks to [okotaku](https://github.com/okotaku) who contributed this features with some guidance from [Yiyi](https://github.com/yiyixuxu) and [Sayak](https://github.com/sayakpaul). - Feel free to browse the [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer), [LoRA the Explorer](https://huggingface.co/spaces/multimodalart/LoraTheExplorer), and the [Diffusers Models Gallery](https://huggingface.co/spaces/huggingface-projects/diffusers-gallery) for checkpoints and embeddings to use. @@ -316,7 +314,7 @@ image You can find the officially available IP-Adapter checkpoints in [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter). -The pipeline was contributed by [okotaku](https://github.com/okotaku). +IP-Adapter was contributed by [okotaku](https://github.com/okotaku). Let's look at an example where we use IP-Adapter with the Stable Diffusion text-to-image pipeline. From f35ce5be5618fca933a0ae242f6669ab37d17e5c Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 9 Nov 2023 18:58:42 +0530 Subject: [PATCH 090/139] Apply suggestions from code review --- docs/source/en/using-diffusers/loading_adapters.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index 476515146b0c..0a84f9dc48a9 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -318,7 +318,7 @@ IP-Adapter was contributed by [okotaku](https://github.com/okotaku). Let's look at an example where we use IP-Adapter with the Stable Diffusion text-to-image pipeline. 
-``` py +```py from diffusers import AutoPipelineForText2Image, CLIPVisionModelWithProjection import torch from diffusers.utils import load_image From 49234b16dda45bc8ccaff8427a5e5610c250f346 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 9 Nov 2023 13:53:20 +0000 Subject: [PATCH 091/139] uP --- .../stable_diffusion_xl/pipeline_stable_diffusion_xl.py | 2 +- .../pipeline_stable_diffusion_xl_img2img.py | 4 ++-- .../pipeline_stable_diffusion_xl_inpaint.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 2e4d827f7e7f..0e26e135e911 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -180,7 +180,7 @@ def __init__( unet: UNet2DConditionModel, scheduler: KarrasDiffusionSchedulers, image_encoder: CLIPVisionModelWithProjection = None, - feature_extractor: CLIPImageProcessor = None + feature_extractor: CLIPImageProcessor = None, force_zeros_for_empty_prompt: bool = True, add_watermarker: Optional[bool] = None, ): diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index f6cfa357b984..51bc6f31cf02 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -195,9 +195,9 @@ def __init__( tokenizer: CLIPTokenizer, tokenizer_2: CLIPTokenizer, unet: UNet2DConditionModel, - image_encoder: CLIPVisionModelWithProjection = None, - feature_extractor: CLIPImageProcessor = None scheduler: KarrasDiffusionSchedulers, + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None, requires_aesthetics_score: bool = False, force_zeros_for_empty_prompt: bool = True, add_watermarker: Optional[bool] = None, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 7e10611344a6..37c02a279ff1 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -343,9 +343,9 @@ def __init__( tokenizer: CLIPTokenizer, tokenizer_2: CLIPTokenizer, unet: UNet2DConditionModel, - image_encoder: CLIPVisionModelWithProjection = None, - feature_extractor: CLIPImageProcessor = None scheduler: KarrasDiffusionSchedulers, + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None, requires_aesthetics_score: bool = False, force_zeros_for_empty_prompt: bool = True, add_watermarker: Optional[bool] = None, From e9cdb69835b26689fb83464e2f8e896885c13852 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 9 Nov 2023 14:34:26 +0000 Subject: [PATCH 092/139] uP --- src/diffusers/pipelines/pipeline_utils.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 6437732d0315..ce5bba1323a8 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -831,7 +831,9 @@ def 
module_is_offloaded(module): f"It seems like you have activated model offloading by calling `enable_model_cpu_offload`, but are now manually moving the pipeline to GPU. It is strongly recommended against doing so as memory gains from offloading are likely to be lost. Offloading automatically takes care of moving the individual components {', '.join(self.components.keys())} to GPU when needed. To make sure offloading works as expected, you should consider moving the pipeline back to CPU: `pipeline.to('cpu')` or removing the move altogether if you use offloading." ) - module_names, _ = self._get_signature_keys(self) + expected_modules, optional_parameters = self._get_signature_keys(self) + module_names = list(expected_modules) + list(optional_parameters) + modules = [getattr(self, n, None) for n in module_names] modules = [m for m in modules if isinstance(m, torch.nn.Module)] @@ -872,7 +874,9 @@ def device(self) -> torch.device: Returns: `torch.device`: The torch device on which the pipeline is located. """ - module_names, _ = self._get_signature_keys(self) + expected_modules, optional_parameters = self._get_signature_keys(self) + module_names = list(expected_modules) + list(optional_parameters) + modules = [getattr(self, n, None) for n in module_names] modules = [m for m in modules if isinstance(m, torch.nn.Module)] @@ -887,7 +891,9 @@ def dtype(self) -> torch.dtype: Returns: `torch.dtype`: The torch dtype on which the pipeline is located. """ - module_names, _ = self._get_signature_keys(self) + expected_modules, optional_parameters = self._get_signature_keys(self) + module_names = list(expected_modules) + list(optional_parameters) + modules = [getattr(self, n, None) for n in module_names] modules = [m for m in modules if isinstance(m, torch.nn.Module)] @@ -2017,7 +2023,8 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): for child in module.children(): fn_recursive_set_mem_eff(child) - module_names, _ = self._get_signature_keys(self) + expected_modules, optional_parameters = self._get_signature_keys(self) + module_names = list(expected_modules) + list(optional_parameters) modules = [getattr(self, n, None) for n in module_names] modules = [m for m in modules if isinstance(m, torch.nn.Module)] @@ -2073,7 +2080,9 @@ def disable_attention_slicing(self): self.enable_attention_slicing(None) def set_attention_slice(self, slice_size: Optional[int]): - module_names, _ = self._get_signature_keys(self) + expected_modules, optional_parameters = self._get_signature_keys(self) + module_names = list(expected_modules) + list(optional_parameters) + modules = [getattr(self, n, None) for n in module_names] modules = [m for m in modules if isinstance(m, torch.nn.Module) and hasattr(m, "set_attention_slice")] From 2ecbc447f478f8a0a16d346e99187670d253222d Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 9 Nov 2023 14:50:52 +0000 Subject: [PATCH 093/139] uP --- src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py | 1 - .../pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index c973bbb9bcbf..743cdcb6bee9 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -858,7 +858,6 @@ def __call__( # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 6.1 Add image embeds for IP-Adapter added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index e9ae89c28143..df6c887b1a89 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -902,7 +902,7 @@ def __call__( # 7.1 Add image embeds for IP-Adapter added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None - + # 7.2 Optionally get Guidance Scale Embedding timestep_cond = None if self.unet.config.time_cond_proj_dim is not None: From 584138cb2d6bdf618b156f894d1d8d8be9bfe0e2 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 9 Nov 2023 15:35:43 +0000 Subject: [PATCH 094/139] uP --- src/diffusers/pipelines/pipeline_utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index ce5bba1323a8..95fac8b055b8 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -542,7 +542,7 @@ def register_modules(self, **kwargs): for name, module in kwargs.items(): # retrieve library - if module is None: + if module is None or isinstance(module, (tuple, list)) and module[0] is None: register_dict = {name: (None, None)} else: # register the config from the original module, not the dynamo compiled one @@ -551,7 +551,12 @@ def register_modules(self, **kwargs): else: not_compiled_module = module - library = not_compiled_module.__module__.split(".")[0] + try: + library = not_compiled_module.__module__.split(".")[0] + except: + import ipdb + + ipdb.set_trace() # check if the module is a pipeline module module_path_items = not_compiled_module.__module__.split(".") From 82f0cc9acf2a003e97f5b9ba5056905af27ea912 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 9 Nov 2023 15:36:26 +0000 Subject: [PATCH 095/139] uP --- src/diffusers/pipelines/pipeline_utils.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 95fac8b055b8..e7d0aaa450e9 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -551,12 +551,7 @@ def register_modules(self, **kwargs): else: not_compiled_module = module - try: - library = not_compiled_module.__module__.split(".")[0] - except: - import ipdb - - ipdb.set_trace() + library = not_compiled_module.__module__.split(".")[0] # check if the module is a pipeline module module_path_items = not_compiled_module.__module__.split(".") From 9471dd9baa67ce56acf7581ae5ecda98bf8cc7bf Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 9 Nov 2023 17:53:02 +0000 Subject: [PATCH 096/139] more debug --- src/diffusers/pipelines/pipeline_utils.py | 15 ++++++++++++--- tests/pipelines/test_pipelines_common.py | 1 + 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index e7d0aaa450e9..b6ccd3d793a8 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -1932,9 
+1932,18 @@ def components(self) -> Dict[str, Any]: ``` """ expected_modules, optional_parameters = self._get_signature_keys(self) - components = { - k: getattr(self, k) for k in self.config.keys() if not k.startswith("_") and k not in optional_parameters - } + for name in optional_parameters: + if name in self._optional_components: + expected_modules.add(name) + + def is_component(name): + if name.startswith("_"): + return False + if name in optional_parameters and name not in self._optional_components: + return False + return True + + components = {k: getattr(self, k) for k in self.config.keys() if is_component(k)} if set(components.keys()) != expected_modules: raise ValueError( diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index aff91a9589a6..d85d20d22844 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -331,6 +331,7 @@ def test_save_load_local(self, expected_max_difference=5e-4): with tempfile.TemporaryDirectory() as tmpdir: pipe.save_pretrained(tmpdir, safe_serialization=False) + import ipdb; ipdb.set_trace() with CaptureLogger(logger) as cap_logger: pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) From 7ecfcfe42e6b83d53e2ff132b272fbfa584c68eb Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 9 Nov 2023 12:10:22 -1000 Subject: [PATCH 097/139] Apply suggestions from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- .../en/using-diffusers/loading_adapters.md | 42 ++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index 0a84f9dc48a9..6372d0f568d9 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -308,15 +308,21 @@ image = pipeline(prompt=prompt).images[0] image ``` -### IP-Adapter +## IP-Adapter -[IP-Adapter](https://ip-adapter.github.io/) is an effective and lightweight adapter to achieve image prompt capability for the pre-trained text-to-image diffusion models. It is now available to use with most of our Stable Diffusion and Stable Diffusion XL pipelines. You can also use the IP-Adapter with other custom models fine-tuned from the same base model, as well as ControlNet and T2I adapters. Moreover, the image prompt can also work well with the text prompt to accomplish multimodal image generation. +[IP-Adapter](https://ip-adapter.github.io/) is an effective and lightweight adapter that adds image prompting capabilities to a diffusion model. This adapter works by decoupling the cross-attention layers of the image and text features. All the other model components are frozen and only the embedded image features in the UNet are trained. As a result, IP-Adapter files are typically only ~100MBs. -You can find the officially available IP-Adapter checkpoints in [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter). +IP-Adapter works with most of our Stable Diffusion, Stable Diffusion XL (SDXL), ControlNet, T2I-Adapter, and any custom models finetuned from the same base models. + + + +You can find official IP-Adapter checkpoints in [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter). IP-Adapter was contributed by [okotaku](https://github.com/okotaku). -Let's look at an example where we use IP-Adapter with the Stable Diffusion text-to-image pipeline. 
+ + +IP-Adapter relies on an image encoder to generate the image features, so let's load a [`~transformers.CLIPVisionModelWithProjection`] model and then pass it to a Stable Diffusion pipeline. ```py from diffusers import AutoPipelineForText2Image, CLIPVisionModelWithProjection @@ -332,13 +338,13 @@ image_encoder = CLIPVisionModelWithProjection.from_pretrained( pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, torch_dtype=torch.float16).to("cuda") ``` -Now you can load the IP-Adapter with [`~loaders.IPAdapterMixin.load_ip_adapter`] method. +Now load the [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter) weights with the [`~loaders.IPAdapterMixin.load_ip_adapter`] method. ```py pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") ``` -IP-Adapter allows you to use both image and text to condition the image generation process. In this example, let's take the cute bear eating pizza that we generated with Textual Inversion, and create a new bear that is similarly cute but wears sunglasses. We can pass the bear image as `ip_adapter_image`, along with a text prompt that mentions "sunglasses".  +IP-Adapter allows you to use both image and text to condition the image generation process. For example, let's use the bear image from the [Textual Inversion](#textual-inversion) section as the image prompt (`ip_adapter_image`) along with a text prompt to add "sunglasses". 😎 ```py pipeline.set_ip_adapter_scale(0.6) @@ -360,7 +366,7 @@ images[0] -You can use the `pipeline.set_ip_adapter_scale()` method to adjust the ratio of text prompt and image prompt condition.  If you only use the image prompt, you should set the scale to be `1.0`. You can lower the scale to get more diversity in the generation, at the cost of less prompt alignment. +You can use the [`~loaders.IPAdapterMixin.set_ip_adapter_scale`] method to adjust the text prompt and image prompt condition ratio.  If you're only using the image prompt, you should set the scale to `1.0`. You can lower the scale to get more generation diversity, but it'll be less aligned with the prompt. `scale=0.5` can achieve good results in most cases when you use both text and image prompts. @@ -395,7 +401,7 @@ images = pipeline( images[0] ``` -IP-Adapters can be used with [Stable Diffusion XL](../api/pipelines/stable_diffusion/stable_diffusion_xl.md) (SDXL) for text-to-image, image-to-image, and inpainting pipelines. Below is an example for SDXL text-to-image. +IP-Adapters can also be used with [SDXL](../api/pipelines/stable_diffusion/stable_diffusion_xl.md), but you'll also need to load a [`~transformers.CLIPImageProcessor`] as your feature extractor and pass it to the pipeline. ```python from diffusers import AutoPipelineForText2Image @@ -432,15 +438,13 @@ image = pipeline( image.save("sdxl_t2i.png") ``` -
-    [removed: an HTML comparison table with "Input Image" and "Adapted Image" cells; the markup and image URLs did not survive extraction]
+    [added: side-by-side image figures captioned "input image" and "adapted image"; the markup and image URLs did not survive extraction]
From 774f0dd14f1b3ca984cef28d4924d131252d8297 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 10 Nov 2023 07:48:42 +0530 Subject: [PATCH 098/139] style --- tests/pipelines/test_pipelines_common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index d85d20d22844..1d81bf5927bb 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -331,7 +331,9 @@ def test_save_load_local(self, expected_max_difference=5e-4): with tempfile.TemporaryDirectory() as tmpdir: pipe.save_pretrained(tmpdir, safe_serialization=False) - import ipdb; ipdb.set_trace() + import ipdb + + ipdb.set_trace() with CaptureLogger(logger) as cap_logger: pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) From 3ab40491f3432d01d506886ef21c7b8a353cc21e Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 10 Nov 2023 17:24:16 +0530 Subject: [PATCH 099/139] remove ipdb --- tests/pipelines/test_pipelines_common.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 1d81bf5927bb..aff91a9589a6 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -331,9 +331,6 @@ def test_save_load_local(self, expected_max_difference=5e-4): with tempfile.TemporaryDirectory() as tmpdir: pipe.save_pretrained(tmpdir, safe_serialization=False) - import ipdb - - ipdb.set_trace() with CaptureLogger(logger) as cap_logger: pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) From a106e833af379661a0550dac4604f866b927f0ab Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 10 Nov 2023 17:26:56 +0530 Subject: [PATCH 100/139] style --- examples/text_to_image/train_text_to_image_flax.py | 5 +---- examples/text_to_image/train_text_to_image_lora_sdxl.py | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py index 9ebe34555310..e62d03c730b1 100644 --- a/examples/text_to_image/train_text_to_image_flax.py +++ b/examples/text_to_image/train_text_to_image_flax.py @@ -272,10 +272,7 @@ def main(): if args.dataset_name is not None: # Downloading and loading a dataset from the hub. dataset = load_dataset( - args.dataset_name, - args.dataset_config_name, - cache_dir=args.cache_dir, - data_dir=args.train_data_dir + args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, data_dir=args.train_data_dir ) else: data_files = {} diff --git a/examples/text_to_image/train_text_to_image_lora_sdxl.py b/examples/text_to_image/train_text_to_image_lora_sdxl.py index 1a6ef0c856db..b69940603128 100644 --- a/examples/text_to_image/train_text_to_image_lora_sdxl.py +++ b/examples/text_to_image/train_text_to_image_lora_sdxl.py @@ -765,10 +765,7 @@ def load_model_hook(models, input_dir): if args.dataset_name is not None: # Downloading and loading a dataset from the hub. 
dataset = load_dataset( - args.dataset_name, - args.dataset_config_name, - cache_dir=args.cache_dir, - data_dir=args.train_data_dir + args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, data_dir=args.train_data_dir ) else: data_files = {} From 90f9a5847d11db8739275a15df320ac3b21d6118 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 10 Nov 2023 18:07:11 +0530 Subject: [PATCH 101/139] debug --- tests/pipelines/test_pipelines_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index aff91a9589a6..a5553690d260 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -332,6 +332,7 @@ def test_save_load_local(self, expected_max_difference=5e-4): pipe.save_pretrained(tmpdir, safe_serialization=False) with CaptureLogger(logger) as cap_logger: + print(os.listdir(tmpdir)) pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) for name in pipe_loaded.components.keys(): From 679bcf310c4582b34a142af198b94ef1cb584f56 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 10 Nov 2023 18:11:54 +0530 Subject: [PATCH 102/139] debug --- src/diffusers/pipelines/pipeline_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index b6ccd3d793a8..287e26c211e9 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -502,6 +502,7 @@ def load_sub_model( loading_kwargs["low_cpu_mem_usage"] = False # check if the module is in a subdirectory + print(f"From loading module: cached_folder: {cached_folder} name: {name}") if os.path.isdir(os.path.join(cached_folder, name)): loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs) else: From d43f075d34088582a85200e2d078dc3946d69b0a Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 10 Nov 2023 18:13:46 +0530 Subject: [PATCH 103/139] debug --- tests/pipelines/test_pipelines_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index a5553690d260..f9e4e81d5911 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -333,6 +333,7 @@ def test_save_load_local(self, expected_max_difference=5e-4): with CaptureLogger(logger) as cap_logger: print(os.listdir(tmpdir)) + print(os.listdir(f"{tmpdir}/image_encoder")) pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) for name in pipe_loaded.components.keys(): From be3d3e81eda01c83bc9116219f30c0fb64a07632 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 10 Nov 2023 18:16:18 +0530 Subject: [PATCH 104/139] debug --- tests/pipelines/test_pipelines_common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index f9e4e81d5911..e1519810c8ed 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -333,7 +333,8 @@ def test_save_load_local(self, expected_max_difference=5e-4): with CaptureLogger(logger) as cap_logger: print(os.listdir(tmpdir)) - print(os.listdir(f"{tmpdir}/image_encoder")) + if "image_encoder" in os.listdir(tmpdir): + print(os.listdir(f"{tmpdir}/image_encoder")) pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) for name in pipe_loaded.components.keys(): From 105bd35b56ffd65c94f043ccd8cb867a64ae6719 Mon Sep 
17 00:00:00 2001 From: sayakpaul Date: Fri, 10 Nov 2023 18:21:30 +0530 Subject: [PATCH 105/139] debug --- tests/pipelines/test_pipelines_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index e1519810c8ed..822c7a761169 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -329,6 +329,7 @@ def test_save_load_local(self, expected_max_difference=5e-4): logger.setLevel(diffusers.logging.INFO) with tempfile.TemporaryDirectory() as tmpdir: + print(pipe.components.keys()) pipe.save_pretrained(tmpdir, safe_serialization=False) with CaptureLogger(logger) as cap_logger: From 1a28c329c6d01305d5453c79df1a3f0564245d0d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 10 Nov 2023 18:24:12 +0530 Subject: [PATCH 106/139] debug --- src/diffusers/pipelines/pipeline_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 287e26c211e9..59df66ed452e 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -647,6 +647,7 @@ def is_saveable_module(name, value): model_index_dict = {k: v for k, v in model_index_dict.items() if is_saveable_module(k, v)} for pipeline_component_name in model_index_dict.keys(): + print(f"From save_pretrained: {pipeline_component_name}") sub_model = getattr(self, pipeline_component_name) model_cls = sub_model.__class__ @@ -682,6 +683,7 @@ def is_saveable_module(name, value): continue save_method = getattr(sub_model, save_method_name) + print(f"save_method: {save_method}") # Call the save method with the argument safe_serialization only if it's supported save_method_signature = inspect.signature(save_method) From dc7681681ba1e5626033368601494e4edcf49211 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 10 Nov 2023 18:27:06 +0530 Subject: [PATCH 107/139] debug --- src/diffusers/pipelines/pipeline_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 59df66ed452e..02fa9c2d6a0c 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -683,8 +683,7 @@ def is_saveable_module(name, value): continue save_method = getattr(sub_model, save_method_name) - print(f"save_method: {save_method}") - + # Call the save method with the argument safe_serialization only if it's supported save_method_signature = inspect.signature(save_method) save_method_accept_safe = "safe_serialization" in save_method_signature.parameters From f06ba21d3eaf4a25d24b75438e859e482144ba68 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 10 Nov 2023 18:28:29 +0530 Subject: [PATCH 108/139] debug --- src/diffusers/pipelines/pipeline_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 02fa9c2d6a0c..4f2bfa51a9b8 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -646,6 +646,7 @@ def is_saveable_module(name, value): return True model_index_dict = {k: v for k, v in model_index_dict.items() if is_saveable_module(k, v)} + print(f"From save_pretrained: {model_index_dict.keys()}") for pipeline_component_name in model_index_dict.keys(): print(f"From save_pretrained: {pipeline_component_name}") sub_model = getattr(self, 
pipeline_component_name) @@ -683,7 +684,7 @@ def is_saveable_module(name, value): continue save_method = getattr(sub_model, save_method_name) - + # Call the save method with the argument safe_serialization only if it's supported save_method_signature = inspect.signature(save_method) save_method_accept_safe = "safe_serialization" in save_method_signature.parameters From af88728c6af75177bf64d7ceb86b55e0afd64160 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 10 Nov 2023 13:36:03 +0000 Subject: [PATCH 109/139] more debug --- .../pipelines/pipeline_flax_utils.py | 11 ++++- src/diffusers/pipelines/pipeline_utils.py | 45 +++++++------------ tests/pipelines/test_pipelines_common.py | 1 + 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_flax_utils.py b/src/diffusers/pipelines/pipeline_flax_utils.py index 7b067405cace..87e6fc6f7c67 100644 --- a/src/diffusers/pipelines/pipeline_flax_utils.py +++ b/src/diffusers/pipelines/pipeline_flax_utils.py @@ -537,12 +537,19 @@ def load_module(name, value): model = pipeline_class(**init_kwargs, dtype=dtype) return model, params - @staticmethod - def _get_signature_keys(obj): + @classmethod + def _get_signature_keys(cls, obj): parameters = inspect.signature(obj.__init__).parameters required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty} optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty}) expected_modules = set(required_parameters.keys()) - {"self"} + + optional_names = list(optional_parameters) + for name in optional_names: + if name in cls._optional_components: + expected_modules.add(name) + optional_parameters.remove(name) + return expected_modules, optional_parameters @property diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index b6ccd3d793a8..f992b95392e2 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -831,9 +831,7 @@ def module_is_offloaded(module): f"It seems like you have activated model offloading by calling `enable_model_cpu_offload`, but are now manually moving the pipeline to GPU. It is strongly recommended against doing so as memory gains from offloading are likely to be lost. Offloading automatically takes care of moving the individual components {', '.join(self.components.keys())} to GPU when needed. To make sure offloading works as expected, you should consider moving the pipeline back to CPU: `pipeline.to('cpu')` or removing the move altogether if you use offloading." ) - expected_modules, optional_parameters = self._get_signature_keys(self) - module_names = list(expected_modules) + list(optional_parameters) - + module_names, _ = self._get_signature_keys(self) modules = [getattr(self, n, None) for n in module_names] modules = [m for m in modules if isinstance(m, torch.nn.Module)] @@ -874,9 +872,7 @@ def device(self) -> torch.device: Returns: `torch.device`: The torch device on which the pipeline is located. """ - expected_modules, optional_parameters = self._get_signature_keys(self) - module_names = list(expected_modules) + list(optional_parameters) - + module_names, _ = self._get_signature_keys(self) modules = [getattr(self, n, None) for n in module_names] modules = [m for m in modules if isinstance(m, torch.nn.Module)] @@ -891,9 +887,7 @@ def dtype(self) -> torch.dtype: Returns: `torch.dtype`: The torch dtype on which the pipeline is located. 
""" - expected_modules, optional_parameters = self._get_signature_keys(self) - module_names = list(expected_modules) + list(optional_parameters) - + module_names, _ = self._get_signature_keys(self) modules = [getattr(self, n, None) for n in module_names] modules = [m for m in modules if isinstance(m, torch.nn.Module)] @@ -1900,12 +1894,19 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: " above." ) from model_info_call_error - @staticmethod - def _get_signature_keys(obj): + @classmethod + def _get_signature_keys(cls, obj): parameters = inspect.signature(obj.__init__).parameters required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty} optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty}) expected_modules = set(required_parameters.keys()) - {"self"} + + optional_names = list(optional_parameters) + for name in optional_names: + if name in cls._optional_components: + expected_modules.add(name) + optional_parameters.remove(name) + return expected_modules, optional_parameters @property @@ -1932,18 +1933,9 @@ def components(self) -> Dict[str, Any]: ``` """ expected_modules, optional_parameters = self._get_signature_keys(self) - for name in optional_parameters: - if name in self._optional_components: - expected_modules.add(name) - - def is_component(name): - if name.startswith("_"): - return False - if name in optional_parameters and name not in self._optional_components: - return False - return True - - components = {k: getattr(self, k) for k in self.config.keys() if is_component(k)} + components = { + k: getattr(self, k) for k in self.config.keys() if not k.startswith("_") and k not in optional_parameters + } if set(components.keys()) != expected_modules: raise ValueError( @@ -2032,8 +2024,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): for child in module.children(): fn_recursive_set_mem_eff(child) - expected_modules, optional_parameters = self._get_signature_keys(self) - module_names = list(expected_modules) + list(optional_parameters) + module_names, _ = self._get_signature_keys(self) modules = [getattr(self, n, None) for n in module_names] modules = [m for m in modules if isinstance(m, torch.nn.Module)] @@ -2089,9 +2080,7 @@ def disable_attention_slicing(self): self.enable_attention_slicing(None) def set_attention_slice(self, slice_size: Optional[int]): - expected_modules, optional_parameters = self._get_signature_keys(self) - module_names = list(expected_modules) + list(optional_parameters) - + module_names, _ = self._get_signature_keys(self) modules = [getattr(self, n, None) for n in module_names] modules = [m for m in modules if isinstance(m, torch.nn.Module) and hasattr(m, "set_attention_slice")] diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index d85d20d22844..4c5d5118f9c2 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -521,6 +521,7 @@ def test_components_function(self): pipe = self.pipeline_class(**init_components) self.assertTrue(hasattr(pipe, "components")) + import ipdb; ipdb.set_trace() self.assertTrue(set(pipe.components.keys()) == set(init_components.keys())) @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") From 087417ce2a250ad2b578a1eca70dc85cc4edd643 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 10 Nov 2023 13:44:04 +0000 Subject: [PATCH 110/139] more debug --- tests/pipelines/altdiffusion/test_alt_diffusion.py | 1 + 
tests/pipelines/test_pipelines_common.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py index 5befe60cf6d9..728f745bd753 100644 --- a/tests/pipelines/altdiffusion/test_alt_diffusion.py +++ b/tests/pipelines/altdiffusion/test_alt_diffusion.py @@ -117,6 +117,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None } return components diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 1cefa8cb193e..822c7a761169 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -524,7 +524,6 @@ def test_components_function(self): pipe = self.pipeline_class(**init_components) self.assertTrue(hasattr(pipe, "components")) - import ipdb; ipdb.set_trace() self.assertTrue(set(pipe.components.keys()) == set(init_components.keys())) @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") From f4a04c0c454d96a74d32d4f4940cb33790333897 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 10 Nov 2023 13:44:14 +0000 Subject: [PATCH 111/139] more debug --- tests/pipelines/altdiffusion/test_alt_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py index 728f745bd753..b4a2847bb84d 100644 --- a/tests/pipelines/altdiffusion/test_alt_diffusion.py +++ b/tests/pipelines/altdiffusion/test_alt_diffusion.py @@ -117,7 +117,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, - "image_encoder": None + "image_encoder": None, } return components From 5e4b53d09d20031296244ec085a2d004b1bb7d0e Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 10 Nov 2023 16:49:26 +0100 Subject: [PATCH 112/139] Apply suggestions from code review --- tests/pipelines/test_pipelines_auto.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/pipelines/test_pipelines_auto.py b/tests/pipelines/test_pipelines_auto.py index 4bac90f2091e..1cd29565b8de 100644 --- a/tests/pipelines/test_pipelines_auto.py +++ b/tests/pipelines/test_pipelines_auto.py @@ -116,7 +116,7 @@ def test_from_pipe_controlnet_text2img(self): assert pipe.__class__.__name__ == "StableDiffusionControlNetPipeline" assert "controlnet" in pipe.components - pipe = AutoPipelineForText2Image.from_pipe(pipe, controlnet=None, image_encoder=None) + pipe = AutoPipelineForText2Image.from_pipe(pipe, controlnet=None) assert pipe.__class__.__name__ == "StableDiffusionPipeline" assert "controlnet" not in pipe.components @@ -128,7 +128,7 @@ def test_from_pipe_controlnet_img2img(self): assert pipe.__class__.__name__ == "StableDiffusionControlNetImg2ImgPipeline" assert "controlnet" in pipe.components - pipe = AutoPipelineForImage2Image.from_pipe(pipe, controlnet=None, image_encoder=None) + pipe = AutoPipelineForImage2Image.from_pipe(pipe, controlnet=None) assert pipe.__class__.__name__ == "StableDiffusionImg2ImgPipeline" assert "controlnet" not in pipe.components @@ -140,7 +140,7 @@ def test_from_pipe_controlnet_inpaint(self): assert pipe.__class__.__name__ == "StableDiffusionControlNetInpaintPipeline" assert "controlnet" in pipe.components - pipe = AutoPipelineForInpainting.from_pipe(pipe, controlnet=None, image_encoder=None) + pipe = 
AutoPipelineForInpainting.from_pipe(pipe, controlnet=None) assert pipe.__class__.__name__ == "StableDiffusionInpaintPipeline" assert "controlnet" not in pipe.components @@ -152,7 +152,7 @@ def test_from_pipe_controlnet_new_task(self): assert pipe_control_img2img.__class__.__name__ == "StableDiffusionControlNetImg2ImgPipeline" assert "controlnet" in pipe_control_img2img.components - pipe_inpaint = AutoPipelineForInpainting.from_pipe(pipe_control_img2img, controlnet=None, image_encoder=None) + pipe_inpaint = AutoPipelineForInpainting.from_pipe(pipe_control_img2img, controlnet=None) assert pipe_inpaint.__class__.__name__ == "StableDiffusionInpaintPipeline" assert "controlnet" not in pipe_inpaint.components From 286cb1a95418282c1447b248dce40e7b2772edb7 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 10 Nov 2023 18:19:50 +0100 Subject: [PATCH 113/139] add tests --- .../pipelines/pipeline_flax_utils.py | 6 ----- tests/pipelines/test_pipelines.py | 24 ++++++++++++++++++- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_flax_utils.py b/src/diffusers/pipelines/pipeline_flax_utils.py index 87e6fc6f7c67..87d388eae3da 100644 --- a/src/diffusers/pipelines/pipeline_flax_utils.py +++ b/src/diffusers/pipelines/pipeline_flax_utils.py @@ -544,12 +544,6 @@ def _get_signature_keys(cls, obj): optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty}) expected_modules = set(required_parameters.keys()) - {"self"} - optional_names = list(optional_parameters) - for name in optional_names: - if name in cls._optional_components: - expected_modules.add(name) - optional_parameters.remove(name) - return expected_modules, optional_parameters @property diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 2fd83f86052a..13b45e95479d 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -1254,7 +1254,6 @@ def test_set_component_to_none(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, - image_encoder=None, ) generator = torch.Generator(device="cpu").manual_seed(0) @@ -1280,6 +1279,29 @@ def test_set_component_to_none(self): assert out_image.shape == (1, 64, 64, 3) assert np.abs(out_image - out_image_2).max() < 1e-3 + def test_optional_components_is_none(self): + unet = self.dummy_cond_unet() + scheduler = PNDMScheduler(skip_prk_steps=True) + vae = self.dummy_vae + bert = self.dummy_text_encoder + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + items = { + "feature_extractor": self.dummy_extractor, + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": bert, + "tokenizer": tokenizer, + "safety_checker": None, + # we don't add an image encoder + } + + pipeline = StableDiffusionPipeline(**items) + + assert sorted(list(pipeline.components.keys())) == sorted(["image_encoder"] + list(items.keys())) + assert pipeline.image_encoder is None + def test_set_scheduler_consistency(self): unet = self.dummy_cond_unet() pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler") From 9ff5f6b50e0b59555cef15634329962d792735bd Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Fri, 10 Nov 2023 11:11:26 -1000 Subject: [PATCH 114/139] Apply suggestions from code review Co-authored-by: Patrick von Platen --- tests/models/test_ip_adapters.py | 5 ++--- tests/pipelines/test_pipelines.py | 8 -------- tests/pipelines/test_pipelines_common.py | 5 ----- 3 
files changed, 2 insertions(+), 16 deletions(-) diff --git a/tests/models/test_ip_adapters.py b/tests/models/test_ip_adapters.py index d40a4653f307..0ffd8af0374e 100644 --- a/tests/models/test_ip_adapters.py +++ b/tests/models/test_ip_adapters.py @@ -294,7 +294,7 @@ def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False, for_s return input_kwargs -@nightly +@slow @require_torch_gpu class IPAdapterSDIntegrationTests(IPAdapterNightlyTestsMixin): def test_text_to_image(self): @@ -314,7 +314,6 @@ def test_text_to_image(self): assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) def test_image_to_image(self): - StableDiffusionImg2ImgPipeline image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") pipeline = StableDiffusionImg2ImgPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, safety_checker=None, torch_dtype=self.dtype @@ -347,7 +346,7 @@ def test_inpainting(self): assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) -@nightly +@slow @require_torch_gpu class IPAdapterSDXLIntegrationTests(IPAdapterNightlyTestsMixin): def test_text_to_image_sdxl(self): diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 13b45e95479d..f77e78674eeb 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -1184,7 +1184,6 @@ def test_pipe_false_offload_warn(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, - image_encoder=None, ) sd.enable_model_cpu_offload() @@ -1203,7 +1202,6 @@ def test_pipe_false_offload_warn(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, - image_encoder=None, ) def test_set_scheduler(self): @@ -1221,7 +1219,6 @@ def test_set_scheduler(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, - image_encoder=None, ) sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config) @@ -1318,7 +1315,6 @@ def test_set_scheduler_consistency(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, - image_encoder=None, ) pndm_config = sd.scheduler.config @@ -1337,7 +1333,6 @@ def test_set_scheduler_consistency(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, - image_encoder=None, ) ddim_config = sd.scheduler.config @@ -1431,7 +1426,6 @@ def test_optional_components(self): tokenizer=tokenizer, safety_checker=unet, feature_extractor=self.dummy_extractor, - image_encoder=None, ) sd = orig_sd @@ -1558,7 +1552,6 @@ def test_pipe_to(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, - image_encoder=None, ) device_type = torch.device(torch_device).type @@ -1620,7 +1613,6 @@ def test_pipe_same_device_id_offload(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=self.dummy_extractor, - image_encoder=None, ) sd.enable_model_cpu_offload(gpu_id=5) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 822c7a761169..b9fe4d190f23 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -329,13 +329,9 @@ def test_save_load_local(self, expected_max_difference=5e-4): logger.setLevel(diffusers.logging.INFO) with tempfile.TemporaryDirectory() as tmpdir: - print(pipe.components.keys()) pipe.save_pretrained(tmpdir, safe_serialization=False) with CaptureLogger(logger) as cap_logger: - 
print(os.listdir(tmpdir)) - if "image_encoder" in os.listdir(tmpdir): - print(os.listdir(f"{tmpdir}/image_encoder")) pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) for name in pipe_loaded.components.keys(): @@ -1047,7 +1043,6 @@ def get_pipeline_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, - "image_encoder": None, } return components From e8f6a85a57c54cad73582051974b53b6877b162f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 11 Nov 2023 00:41:35 +0000 Subject: [PATCH 115/139] refactor load_ip_adapter: add capabiliity to load clip image encoder + processor --- src/diffusers/loaders.py | 30 +++++++++++++++---- src/diffusers/models/unet_2d_condition.py | 18 ++++++----- .../versatile_diffusion/modeling_text_unet.py | 19 +++++++----- tests/models/test_ip_adapters.py | 4 +-- 4 files changed, 47 insertions(+), 24 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index cfa582d4cc48..6ce40540227f 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -62,7 +62,13 @@ if is_transformers_available(): - from transformers import CLIPTextModel, CLIPTextModelWithProjection, PreTrainedModel + from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPVisionModelWithProjection, + PreTrainedModel, + ) if is_accelerate_available(): from accelerate import init_empty_weights @@ -3426,9 +3432,6 @@ def load_ip_adapter( subfolder (`str`, *optional*, defaults to `""`): The subfolder location of a model file within a larger model repository on the Hub or locally. """ - if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None: - raise ValueError("`image_encoder` cannot be None when using IP Adapters.") - self.set_ip_adapter() # Load the main state dict first. @@ -3472,6 +3475,22 @@ def load_ip_adapter( if keys != ["image_proj", "ip_adapter"]: raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing from the state dict.") + # load CLIP image encoer here if it has not been registered to the pipeline yet + if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None: + if not isinstance(pretrained_model_name_or_path_or_dict, dict): + logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}") + image_encoder = CLIPVisionModelWithProjection.from_pretrained( + pretrained_model_name_or_path_or_dict, + subfolder=os.path.join(subfolder, "image_encoder"), + ).to(self.device, dtype=self.dtype) + self.image_encoder = image_encoder + else: + raise ValueError("`image_encoder` cannot be None when using IP Adapters.") + + # create feature extractor if it has not been registered to the pipeline yet + if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None: + self.feature_extractor = CLIPImageProcessor() + # Handle image projection layers. clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1] cross_attention_dim = state_dict["image_proj"]["proj.weight"].shape[0] // 4 @@ -3495,8 +3514,7 @@ def load_ip_adapter( image_projection.load_state_dict(diffusers_state_dict) self.unet.encoder_hid_proj = image_projection.to(device=self.unet.device, dtype=self.unet.dtype) - self.unet.config.encoder_hid_dim_type = "image_proj" - self.unet.config.encoder_hid_dim = clip_embeddings_dim + self.unet.config.encoder_hid_dim_type = "ip_image_proj" # Handle IP-Adapter cross-attention layers. 
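        # note: the checkpoint's "ip_adapter" state dict is keyed by processor position
        # (keys like "1.to_k_ip.weight"), so the processors are wrapped in a ModuleList below,
        # with nn.Identity standing in for the weight-less self-attention processors, and the
        # whole dict is loaded in a single load_state_dict call.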
ip_layers = torch.nn.ModuleList( diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index a77d53b98603..dd91d8007229 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -1015,19 +1015,21 @@ def forward( image_embeds = added_cond_kwargs.get("image_embeds") encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds) elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 - style if "image_embeds" not in added_cond_kwargs: raise ValueError( f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" ) image_embeds = added_cond_kwargs.get("image_embeds") - image_embeds = self.encoder_hid_proj(image_embeds) - # IP-adapter - if any("to_k_ip" in k for k in self.state_dict().keys()): - image_embeds = image_embeds.to(encoder_hidden_states.dtype) - encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1) - else: - # Kandinsky 2.2 - style - encoder_hidden_states = image_embeds + encoder_hidden_states = self.encoder_hid_proj(image_embeds) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj": + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + image_embeds = self.encoder_hid_proj(image_embeds).to(encoder_hidden_states.dtype) + encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1) # 2. 
pre-process sample = self.conv_in(sample) diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index cab924de4c38..64bec5d2dd63 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -1231,20 +1231,23 @@ def forward( image_embeds = added_cond_kwargs.get("image_embeds") encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds) elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 - style if "image_embeds" not in added_cond_kwargs: raise ValueError( f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires" " the keyword argument `image_embeds` to be passed in `added_conditions`" ) image_embeds = added_cond_kwargs.get("image_embeds") - image_embeds = self.encoder_hid_proj(image_embeds) - # IP-adapter - if any("to_k_ip" in k for k in self.state_dict().keys()): - image_embeds = image_embeds.to(encoder_hidden_states.dtype) - encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1) - else: - # Kandinsky 2.2 - style - encoder_hidden_states = image_embeds + encoder_hidden_states = self.encoder_hid_proj(image_embeds) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj": + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which" + " requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + image_embeds = self.encoder_hid_proj(image_embeds).to(encoder_hidden_states.dtype) + encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1) # 2. 
pre-process sample = self.conv_in(sample) diff --git a/tests/models/test_ip_adapters.py b/tests/models/test_ip_adapters.py index 0ffd8af0374e..e06bf6e8350c 100644 --- a/tests/models/test_ip_adapters.py +++ b/tests/models/test_ip_adapters.py @@ -49,8 +49,8 @@ from diffusers.utils import load_image from diffusers.utils.testing_utils import ( enable_full_determinism, - nightly, require_torch_gpu, + slow, torch_device, ) @@ -294,7 +294,7 @@ def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False, for_s return input_kwargs -@slow +@slow @require_torch_gpu class IPAdapterSDIntegrationTests(IPAdapterNightlyTestsMixin): def test_text_to_image(self): From d50a19f048de2ac9f176f43e5dad2149542718e7 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 11 Nov 2023 02:31:56 +0000 Subject: [PATCH 116/139] style --- tests/pipelines/test_pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index f77e78674eeb..d812ce0ccb95 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -1296,7 +1296,7 @@ def test_optional_components_is_none(self): pipeline = StableDiffusionPipeline(**items) - assert sorted(list(pipeline.components.keys())) == sorted(["image_encoder"] + list(items.keys())) + assert sorted(pipeline.components.keys()) == sorted(["image_encoder"] + list(items.keys())) assert pipeline.image_encoder is None def test_set_scheduler_consistency(self): From 7e7f1dc27ab827c4d7e268de829a8ef6ff10f936 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 11 Nov 2023 05:27:30 +0000 Subject: [PATCH 117/139] refacotr 2: unet._load_ip_adapter_weights --- src/diffusers/loaders.py | 133 ++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 72 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 6ce40540227f..05dd31e9319d 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -33,8 +33,6 @@ AttnProcessor2_0, IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0, - IPAdapterControlNetAttnProcessor, - IPAdapterControlNetAttnProcessor2_0, ) from .models.embeddings import ImageProjection from .models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta @@ -768,6 +766,65 @@ def enable_lora(self): raise ValueError("PEFT backend is required for this method.") set_adapter_layers(self, enabled=True) + def _load_ip_adapter_weights(self, state_dict): + # set ip-adapter cross-attention processors + attn_procs = {} + for name in self.attn_processors.keys(): + cross_attention_dim = None if name.endswith("attn1.processor") else self.config.cross_attention_dim + if name.startswith("mid_block"): + hidden_size = self.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(self.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = self.config.block_out_channels[block_id] + if cross_attention_dim is None: + attn_processor_class = ( + AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor + ) + attn_procs[name] = attn_processor_class() + else: + attn_processor_class = ( + IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor + ) + attn_procs[name] = attn_processor_class( + hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0 + ).to(dtype=self.dtype, 
device=self.device) + + self.set_attn_processor(attn_procs) + + # load ip-adapter cross-attention weights + ip_attn_layers = torch.nn.ModuleList( + [module if isinstance(module, nn.Module) else nn.Identity() for module in self.attn_processors.values()] + ) + ip_attn_layers.load_state_dict(state_dict["ip_adapter"]) + + # create image projection layers. + clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1] + cross_attention_dim = state_dict["image_proj"]["proj.weight"].shape[0] // 4 + + image_projection = ImageProjection( + cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim, num_image_text_embeds=4 + ) + image_projection.to(dtype=self.dtype, device=self.device) + + # load image projection layer weights + image_proj_state_dict = {} + image_proj_state_dict.update( + { + "image_embeds.weight": state_dict["image_proj"]["proj.weight"], + "image_embeds.bias": state_dict["image_proj"]["proj.bias"], + "norm.weight": state_dict["image_proj"]["norm.weight"], + "norm.bias": state_dict["image_proj"]["norm.bias"], + } + ) + + image_projection.load_state_dict(image_proj_state_dict) + + self.encoder_hid_proj = image_projection.to(device=self.device, dtype=self.dtype) + self.config.encoder_hid_dim_type = "ip_image_proj" + def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs): cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) @@ -3355,42 +3412,6 @@ def _remove_text_encoder_monkey_patch(self): class IPAdapterMixin: """Mixin for handling IP Adapters.""" - def set_ip_adapter(self): - unet = self.unet - attn_procs = {} - for name in unet.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = unet.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(unet.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = unet.config.block_out_channels[block_id] - if cross_attention_dim is None: - attn_processor_class = ( - AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor - ) - attn_procs[name] = attn_processor_class() - else: - attn_processor_class = ( - IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor - ) - attn_procs[name] = attn_processor_class( - hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0 - ).to(dtype=unet.dtype, device=unet.device) - - unet.set_attn_processor(attn_procs) - - if hasattr(self, "controlnet"): - attn_processor_class = ( - IPAdapterControlNetAttnProcessor2_0 - if hasattr(F, "scaled_dot_product_attention") - else IPAdapterControlNetAttnProcessor - ) - self.pipeline.controlnet.set_attn_processor(attn_processor_class()) - def load_ip_adapter( self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], @@ -3432,7 +3453,6 @@ def load_ip_adapter( subfolder (`str`, *optional*, defaults to `""`): The subfolder location of a model file within a larger model repository on the Hub or locally. """ - self.set_ip_adapter() # Load the main state dict first. cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) @@ -3491,39 +3511,8 @@ def load_ip_adapter( if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None: self.feature_extractor = CLIPImageProcessor() - # Handle image projection layers. 
- clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1] - cross_attention_dim = state_dict["image_proj"]["proj.weight"].shape[0] // 4 - - image_projection = ImageProjection( - cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim, num_image_text_embeds=4 - ) - image_projection.to(dtype=self.unet.dtype, device=self.unet.device) - - diffusers_state_dict = {} - - diffusers_state_dict.update( - { - "image_embeds.weight": state_dict["image_proj"]["proj.weight"], - "image_embeds.bias": state_dict["image_proj"]["proj.bias"], - "norm.weight": state_dict["image_proj"]["norm.weight"], - "norm.bias": state_dict["image_proj"]["norm.bias"], - } - ) - - image_projection.load_state_dict(diffusers_state_dict) - - self.unet.encoder_hid_proj = image_projection.to(device=self.unet.device, dtype=self.unet.dtype) - self.unet.config.encoder_hid_dim_type = "ip_image_proj" - - # Handle IP-Adapter cross-attention layers. - ip_layers = torch.nn.ModuleList( - [ - module if isinstance(module, nn.Module) else nn.Identity() - for module in self.unet.attn_processors.values() - ] - ) - ip_layers.load_state_dict(state_dict["ip_adapter"]) + # load ip-adapter into unet + self.unet._load_ip_adapter_weights(state_dict) def set_ip_adapter_scale(self, scale): for attn_processor in self.unet.attn_processors.values(): From 10b79b5b2cc4d34edf8a37d6161cbc6096ef7f90 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 11 Nov 2023 06:01:26 +0000 Subject: [PATCH 118/139] update doc --- .../en/using-diffusers/loading_adapters.md | 48 +++++++++---------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index 6372d0f568d9..eecb9d360f43 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -322,20 +322,15 @@ IP-Adapter was contributed by [okotaku](https://github.com/okotaku). -IP-Adapter relies on an image encoder to generate the image features, so let's load a [`~transformers.CLIPVisionModelWithProjection`] model and then pass it to a Stable Diffusion pipeline. +Let's first create a Stable Diffusion Pipeline. ```py -from diffusers import AutoPipelineForText2Image, CLIPVisionModelWithProjection +from diffusers import AutoPipelineForText2Image import torch from diffusers.utils import load_image -image_encoder = CLIPVisionModelWithProjection.from_pretrained( - "h94/IP-Adapter", - subfolder="models/image_encoder", - torch_dtype=torch.float16, -).to("cuda") -pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, torch_dtype=torch.float16).to("cuda") +pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda") ``` Now load the [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter) weights with the [`~loaders.IPAdapterMixin.load_ip_adapter`] method. @@ -344,6 +339,23 @@ Now load the [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter) weights wit pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") ``` + +IP-Adapter relies on an image encoder to generate the image features, if your IP-Adapter weights folder contains a "image_encoder" subfolder, the image encoder will be automatically loaded and registered to the pipeline. 
Otherwise you can so load a [`~transformers.CLIPVisionModelWithProjection`] model and pass it to a Stable Diffusion pipeline when you create it. + +```py +from diffusers import AutoPipelineForText2Image, CLIPVisionModelWithProjection +import torch + +image_encoder = CLIPVisionModelWithProjection.from_pretrained( + "h94/IP-Adapter", + subfolder="models/image_encoder", + torch_dtype=torch.float16, +).to("cuda") + +pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, torch_dtype=torch.float16).to("cuda") +``` + + IP-Adapter allows you to use both image and text to condition the image generation process. For example, let's use the bear image from the [Textual Inversion](#textual-inversion) section as the image prompt (`ip_adapter_image`) along with a text prompt to add "sunglasses". 😎 ```py @@ -377,13 +389,7 @@ from diffusers import AutoPipelineForImage2Image import torch from diffusers.utils import load_image -image_encoder = CLIPVisionModelWithProjection.from_pretrained( - "h94/IP-Adapter", - subfolder="models/image_encoder", - torch_dtype=torch.float16, -).to("cuda") - -pipeline = AutoPipelineForImage2Image.from_pretrained("runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, torch_dtype=torch.float16).to("cuda") +pipeline = AutoPipelineForImage2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda") image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/vermeer.jpg") ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/river.png") @@ -401,25 +407,15 @@ images = pipeline( images[0] ``` -IP-Adapters can also be used with [SDXL](../api/pipelines/stable_diffusion/stable_diffusion_xl.md), but you'll also need to load a [`~transformers.CLIPImageProcessor`] as your feature extractor and pass it to the pipeline. 
+IP-Adapters can also be used with [SDXL](../api/pipelines/stable_diffusion/stable_diffusion_xl.md) ```python from diffusers import AutoPipelineForText2Image from diffusers.utils import load_image -from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor import torch -image_encoder = CLIPVisionModelWithProjection.from_pretrained( - "h94/IP-Adapter", - subfolder="sdxl_models/image_encoder", - torch_dtype=torch.float16, -).to("cuda") -feature_extractor = CLIPImageProcessor.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k") - pipeline = AutoPipelineForText2Image.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", - image_encoder=image_encoder, - feature_extractor=feature_extractor, torch_dtype=torch.float16 ).to("cuda") From e00dcfec5ad1c2735c73eb6b470b451f2774d442 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 13 Nov 2023 00:28:21 +0000 Subject: [PATCH 119/139] controlnet --- .../controlnet/pipeline_controlnet.py | 38 ++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 04ca51b19f05..b9fa260cee89 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -20,10 +20,10 @@ import PIL.Image import torch import torch.nn.functional as F -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -92,7 +92,7 @@ class StableDiffusionControlNetPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin ): r""" Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. @@ -102,6 +102,7 @@ class StableDiffusionControlNetPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`]): @@ -127,7 +128,7 @@ class StableDiffusionControlNetPipeline( A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
""" model_cpu_offload_seq = "text_encoder->unet->vae" - _optional_components = ["safety_checker", "feature_extractor"] + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] def __init__( @@ -140,6 +141,7 @@ def __init__( scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, requires_safety_checker: bool = True, ): super().__init__() @@ -172,6 +174,7 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, + image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) @@ -428,6 +431,20 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + uncond_image_embeds = torch.zeros_like(image_embeds) + return image_embeds, uncond_image_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: @@ -743,6 +760,7 @@ def __call__( latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, @@ -799,6 +817,7 @@ def __call__( negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -908,6 +927,11 @@ def __call__( if do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) + if do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + # 4. Prepare image if isinstance(controlnet, ControlNetModel): image = self.prepare_image( @@ -965,7 +989,10 @@ def __call__( # 7. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 7.1 Create tensor stating which controlnets to keep + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + + # 7.2 Create tensor stating which controlnets to keep controlnet_keep = [] for i in range(len(timesteps)): keeps = [ @@ -1032,6 +1059,7 @@ def __call__( cross_attention_kwargs=cross_attention_kwargs, down_block_additional_residuals=down_block_res_samples, mid_block_additional_residual=mid_block_res_sample, + added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] From 60049ca58bba1c2612ba9c6dd19aa06193244a64 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 13 Nov 2023 04:13:50 +0000 Subject: [PATCH 120/139] animatediff --- src/diffusers/loaders.py | 18 +++++---- src/diffusers/models/unet_motion_model.py | 16 ++++++++ .../animatediff/pipeline_animatediff.py | 37 +++++++++++++++++-- 3 files changed, 59 insertions(+), 12 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 05dd31e9319d..d79333376660 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -767,8 +767,9 @@ def enable_lora(self): set_adapter_layers(self, enabled=True) def _load_ip_adapter_weights(self, state_dict): - # set ip-adapter cross-attention processors + # set ip-adapter cross-attention processors & load state_dict attn_procs = {} + key_id = 1 for name in self.attn_processors.keys(): cross_attention_dim = None if name.endswith("attn1.processor") else self.config.cross_attention_dim if name.startswith("mid_block"): @@ -779,7 +780,7 @@ def _load_ip_adapter_weights(self, state_dict): elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = self.config.block_out_channels[block_id] - if cross_attention_dim is None: + if cross_attention_dim is None or "motion_modules" in name: attn_processor_class = ( AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor ) @@ -792,13 +793,14 @@ def _load_ip_adapter_weights(self, state_dict): hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0 ).to(dtype=self.dtype, device=self.device) - self.set_attn_processor(attn_procs) + value_dict = {} + for k, w in attn_procs[name].state_dict().items(): + value_dict.update({f"{k}": state_dict["ip_adapter"][f"{key_id}.{k}"]}) - # load ip-adapter cross-attention weights - ip_attn_layers = torch.nn.ModuleList( - [module if isinstance(module, nn.Module) else nn.Identity() for module in self.attn_processors.values()] - ) - ip_attn_layers.load_state_dict(state_dict["ip_adapter"]) + attn_procs[name].load_state_dict(value_dict) + key_id += 2 + + self.set_attn_processor(attn_procs) # create image projection layers. 
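        # proj.weight in the checkpoint has shape (num_image_text_embeds * cross_attention_dim, clip_embeddings_dim),
        # so shape[-1] recovers the CLIP image embedding dim and shape[0] // 4 the UNet cross-attention dim;
        # the resulting ImageProjection maps one pooled CLIP image embedding to 4 extra context tokens
        # that are concatenated to the text tokens in the UNet forward pass.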
clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1] diff --git a/src/diffusers/models/unet_motion_model.py b/src/diffusers/models/unet_motion_model.py index 5d528a34ec96..3c5296dbda11 100644 --- a/src/diffusers/models/unet_motion_model.py +++ b/src/diffusers/models/unet_motion_model.py @@ -207,6 +207,8 @@ def __init__( motion_max_seq_length: Optional[int] = 32, motion_num_attention_heads: int = 8, use_motion_mid_block: int = True, + encoder_hid_dim: Optional[int] = None, + encoder_hid_dim_type: Optional[str] = None, ): super().__init__() @@ -247,6 +249,9 @@ def __init__( act_fn=act_fn, ) + if encoder_hid_dim_type is None: + self.encoder_hid_proj = None + # class embedding self.down_blocks = nn.ModuleList([]) self.up_blocks = nn.ModuleList([]) @@ -685,6 +690,7 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, mid_block_additional_residual: Optional[torch.Tensor] = None, return_dict: bool = True, @@ -768,6 +774,16 @@ def forward( emb = self.time_embedding(t_emb, timestep_cond) emb = emb.repeat_interleave(repeats=num_frames, dim=0) + + if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj": + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + image_embeds = self.encoder_hid_proj(image_embeds).to(encoder_hidden_states.dtype) + encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1) + encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0) # 2. pre-process diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index b63acb9a5f30..802eca438afc 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -18,10 +18,10 @@ import numpy as np import torch -from transformers import CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection -from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel, UNetMotionModel from ...models.lora import adjust_lora_scale_text_encoder from ...models.unet_motion_model import MotionAdapter @@ -77,7 +77,7 @@ class AnimateDiffPipelineOutput(BaseOutput): frames: Union[torch.Tensor, np.ndarray] -class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): r""" Pipeline for text-to-video generation. @@ -100,6 +100,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLo [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
""" model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["feature_extractor", "image_encoder"] def __init__( self, @@ -116,6 +117,8 @@ def __init__( EulerAncestralDiscreteScheduler, DPMSolverMultistepScheduler, ], + feature_extractor: CLIPImageProcessor = None, + image_encoder: CLIPVisionModelWithProjection = None, ): super().__init__() unet = UNetMotionModel.from_unet2d(unet, motion_adapter) @@ -127,6 +130,8 @@ def __init__( unet=unet, motion_adapter=motion_adapter, scheduler=scheduler, + feature_extractor=feature_extractor, + image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) @@ -313,6 +318,20 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + uncond_image_embeds = torch.zeros_like(image_embeds) + return image_embeds, uncond_image_embeds + # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents @@ -511,6 +530,7 @@ def __call__( latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, @@ -557,6 +577,7 @@ def __call__( negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or `np.array`. @@ -628,6 +649,11 @@ def __call__( if do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_videos_per_prompt) + if do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps = self.scheduler.timesteps @@ -648,6 +674,8 @@ def __call__( # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 7 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None # Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order @@ -663,6 +691,7 @@ def __call__( t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, ).sample # perform guidance From 5641a64ec3ac15a995c47e1886ffc4fd30175af2 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 13 Nov 2023 05:02:55 +0000 Subject: [PATCH 121/139] fix tests + remove controlnet attn processor --- src/diffusers/models/attention_processor.py | 164 ------------------ .../pipelines/animatediff/test_animatediff.py | 2 + tests/pipelines/controlnet/test_controlnet.py | 3 + 3 files changed, 5 insertions(+), 164 deletions(-) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 4201172b9725..8fffd1c09645 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -2217,170 +2217,6 @@ def __call__( return hidden_states -class IPAdapterControlNetAttnProcessor: - r""" - Default processor for performing attention-related computations. - """ - - def __init__(self, text_context_len=77): - self.text_context_len = text_context_len - - def __call__( - self, - attn, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - temb=None, - scale=1.0, - ): - if scale != 1.0: - logger.warning("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") - residual = hidden_states - - if attn.spatial_norm is not None: - hidden_states = attn.spatial_norm(hidden_states, temb) - - input_ndim = hidden_states.ndim - - if input_ndim == 4: - batch_size, channel, height, width = hidden_states.shape - hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) - - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - - if attn.group_norm is not None: - hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) - - query = attn.to_q(hidden_states) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = encoder_hidden_states[:, : self.text_context_len] # only use text - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - attention_probs = attn.get_attention_scores(query, key, attention_mask) - hidden_states = torch.bmm(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - if input_ndim == 4: - hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) - - if attn.residual_connection: - hidden_states = hidden_states + residual - - hidden_states = hidden_states / attn.rescale_output_factor - - return hidden_states - - -class 
IPAdapterControlNetAttnProcessor2_0: - r""" - Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). - """ - - def __init__(self, text_context_len=77): - if not hasattr(F, "scaled_dot_product_attention"): - raise ImportError( - f"{self.__class__.__name__} requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0." - ) - self.text_context_len = text_context_len - - def __call__( - self, - attn, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - temb=None, - scale=1.0, - ): - if scale != 1.0: - logger.warning("`scale` of IPAttnProcessor should be set by " "`IPAdapterPipeline.set_scale`") - residual = hidden_states - - if attn.spatial_norm is not None: - hidden_states = attn.spatial_norm(hidden_states, temb) - - input_ndim = hidden_states.ndim - - if input_ndim == 4: - batch_size, channel, height, width = hidden_states.shape - hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) - - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - - if attention_mask is not None: - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - # scaled_dot_product_attention expects attention_mask shape to be - # (batch, heads, source_length, target_length) - attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) - - if attn.group_norm is not None: - hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) - - query = attn.to_q(hidden_states) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = encoder_hidden_states[:, : self.text_context_len] - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - inner_dim = key.shape[-1] - head_dim = inner_dim // attn.heads - - query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - - key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - - # the output of sdp = (batch, num_heads, seq_len, head_dim) - # TODO: add support for attn.scale when we move to Torch 2.1 - hidden_states = F.scaled_dot_product_attention( - query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False - ) - - hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) - hidden_states = hidden_states.to(query.dtype) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - if input_ndim == 4: - hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) - - if attn.residual_connection: - hidden_states = hidden_states + residual - - hidden_states = hidden_states / attn.rescale_output_factor - - return hidden_states - - LORA_ATTENTION_PROCESSORS = ( LoRAAttnProcessor, LoRAAttnProcessor2_0, diff --git a/tests/pipelines/animatediff/test_animatediff.py b/tests/pipelines/animatediff/test_animatediff.py index 3c9390f2d1b6..5cd0a45c7406 100644 --- a/tests/pipelines/animatediff/test_animatediff.py +++ b/tests/pipelines/animatediff/test_animatediff.py @@ -99,6 +99,8 @@ def get_dummy_components(self): "motion_adapter": motion_adapter, "text_encoder": text_encoder, "tokenizer": tokenizer, + 
"feature_extractor": None, + "image_encoder": None, } return components diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index 64baeea910b8..111e21f82076 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -181,6 +181,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components @@ -317,6 +318,7 @@ def init_weights(m): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components @@ -494,6 +496,7 @@ def init_weights(m): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, + "image_encoder": None, } return components From 9d94e20d0f3f6eb8cb98342d16b6c853b1daba60 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 13 Nov 2023 07:03:46 +0000 Subject: [PATCH 122/139] sdxl --- .../controlnet/pipeline_controlnet_sd_xl.py | 61 +++++++++++++++++-- .../controlnet/test_controlnet_sdxl.py | 6 ++ 2 files changed, 62 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index d6278c4f046a..619da583f7c6 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -20,12 +20,23 @@ import PIL.Image import torch import torch.nn.functional as F -from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) from diffusers.utils.import_utils import is_invisible_watermark_available from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, +) from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, @@ -97,7 +108,11 @@ class StableDiffusionXLControlNetPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, + TextualInversionLoaderMixin, + StableDiffusionXLLoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, ): r""" Pipeline for text-to-image generation using Stable Diffusion XL with ControlNet guidance. 
@@ -141,7 +156,14 @@ class StableDiffusionXLControlNetPipeline( """ # leave controlnet out on purpose because it iterates with unet model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" - _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"] + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "feature_extractor", + "image_encoder", + ] def __init__( self, @@ -155,6 +177,8 @@ def __init__( scheduler: KarrasDiffusionSchedulers, force_zeros_for_empty_prompt: bool = True, add_watermarker: Optional[bool] = None, + feature_extractor: CLIPImageProcessor = None, + image_encoder: CLIPVisionModelWithProjection = None, ): super().__init__() @@ -170,6 +194,8 @@ def __init__( unet=unet, controlnet=controlnet, scheduler=scheduler, + feature_extractor=feature_extractor, + image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) @@ -453,6 +479,20 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + uncond_image_embeds = torch.zeros_like(image_embeds) + return image_embeds, uncond_image_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature @@ -812,6 +852,7 @@ def __call__( negative_prompt_embeds: Optional[torch.FloatTensor] = None, pooled_prompt_embeds: Optional[torch.FloatTensor] = None, negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, @@ -891,6 +932,7 @@ def __call__( Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -1010,7 +1052,7 @@ def __call__( ) guess_mode = guess_mode or global_pool_conditions - # 3. 
Encode input prompt + # 3.1 Encode input prompt text_encoder_lora_scale = ( cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None ) @@ -1035,6 +1077,12 @@ def __call__( clip_skip=clip_skip, ) + # 3.2 Encode ip_adapter_image + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) + if do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + # 4. Prepare image if isinstance(controlnet, ControlNetModel): image = self.prepare_image( @@ -1200,6 +1248,9 @@ def __call__( down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) + if ip_adapter_image is not None: + added_cond_kwargs["image_embeds"] = image_embeds + # predict the noise residual noise_pred = self.unet( latent_model_input, diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl.py b/tests/pipelines/controlnet/test_controlnet_sdxl.py index be786ebe3000..89a7959cd703 100644 --- a/tests/pipelines/controlnet/test_controlnet_sdxl.py +++ b/tests/pipelines/controlnet/test_controlnet_sdxl.py @@ -144,6 +144,8 @@ def get_dummy_components(self): "tokenizer": tokenizer, "text_encoder_2": text_encoder_2, "tokenizer_2": tokenizer_2, + "feature_extractor": None, + "image_encoder": None, } return components @@ -449,6 +451,8 @@ def init_weights(m): "tokenizer": tokenizer, "text_encoder_2": text_encoder_2, "tokenizer_2": tokenizer_2, + "feature_extractor": None, + "image_encoder": None, } return components @@ -634,6 +638,8 @@ def init_weights(m): "tokenizer": tokenizer, "text_encoder_2": text_encoder_2, "tokenizer_2": tokenizer_2, + "feature_extractor": None, + "image_encoder": None, } return components From fed72fb3b74cd7844f419d2d0c9e0dcdf6ee31aa Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 13 Nov 2023 20:30:48 +0000 Subject: [PATCH 123/139] add doc --- .../en/using-diffusers/loading_adapters.md | 195 +++++++++++++++++- 1 file changed, 193 insertions(+), 2 deletions(-) diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index eecb9d360f43..772d11a7545a 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -312,7 +312,8 @@ image [IP-Adapter](https://ip-adapter.github.io/) is an effective and lightweight adapter that adds image prompting capabilities to a diffusion model. This adapter works by decoupling the cross-attention layers of the image and text features. All the other model components are frozen and only the embedded image features in the UNet are trained. As a result, IP-Adapter files are typically only ~100MBs. -IP-Adapter works with most of our Stable Diffusion, Stable Diffusion XL (SDXL), ControlNet, T2I-Adapter, and any custom models finetuned from the same base models. +IP-Adapter works with most of our pipelines, including Stable Diffusion, Stable Diffusion XL (SDXL), ControlNet, T2I-Adapter, AnimateDiff. And you can use any custom models finetuned from the same base models. It also works with LCM-Lora out of box. + @@ -382,7 +383,10 @@ You can use the [`~loaders.IPAdapterMixin.set_ip_adapter_scale`] method to adjus `scale=0.5` can achieve good results in most cases when you use both text and image prompts. -IP-Adapter also works great with Image-to-Image and Inpainting pipelines. 
Here is an example of how you can use it with Image-to-Image. +IP-Adapter also works great with Image-to-Image and Inpainting pipelines. See below examples of how you can use it with Image-to-Image and Inpaint. + + + ```py from diffusers import AutoPipelineForImage2Image @@ -407,6 +411,42 @@ images = pipeline( images[0] ``` + + + +```py +from diffusers import AutoPipelineForInpaint +import torch +from diffusers.utils import load_image + +pipeline = AutoPipelineForInpaint.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float).to("cuda") + +image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/inpaint_image.png") +mask = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/mask.png") +ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/girl.png") + +image = image.resize((512, 768)) +mask = mask.resize((512, 768)) + +pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") + +generator = torch.Generator(device="cpu").manual_seed(33) +images = pipeline( + prompt='best quality, high quality', + image = image, + mask_image = mask, + ip_adapter_image=ip_image, + negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", + num_inference_steps=50, + generator=generator, + strength=0.5, +).images +images[0] +``` + + + + IP-Adapters can also be used with [SDXL](../api/pipelines/stable_diffusion/stable_diffusion_xl.md) ```python @@ -444,3 +484,154 @@ image.save("sdxl_t2i.png")
+[figure: adapted image]
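
The `set_ip_adapter_scale` method mentioned earlier works the same way on any of these pipelines. Below is a minimal sketch of tuning the balance between the text and image prompts; it reuses repository IDs and a test image that appear elsewhere in this guide, and assumes the image encoder is resolved automatically, as in the examples above.

```py
from diffusers import AutoPipelineForText2Image
from diffusers.utils import load_image
import torch

pipeline = AutoPipelineForText2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")

# 1.0 follows the image prompt closely; around 0.5 usually balances text and image prompts
pipeline.set_ip_adapter_scale(0.5)

image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/girl.png")

generator = torch.Generator(device="cpu").manual_seed(33)
images = pipeline(
    prompt="best quality, high quality",
    ip_adapter_image=image,
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    num_inference_steps=50,
    generator=generator,
).images
images[0]
```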
+ + +### LCM-Lora + +You can use IP-Adapter with LCM-Lora to achieve "instant fine-tune" with custom images. Note that you need to load IP-Adapter weights before loading the LCM-Lora weights. + +```py +from diffusers import DiffusionPipeline, LCMScheduler +import torch +from diffusers.utils import load_image + +model_id = "sd-dreambooth-library/herge-style" +lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5" + +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) + +pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") +pipe.load_lora_weights(lcm_lora_id) +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) +pipe.enable_model_cpu_offload() + +prompt = "best quality, high quality" +image = load_image("https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png") +images = pipe( + prompt=prompt, + ip_adapter_image=image, + num_inference_steps=4, + guidance_scale=1, +).images[0] +``` + +### Other pipelines + +IP-Adapter is compatible with any pipeline that (1) uses a text prompt and (2) uses Stable Diffusion or Stable Diffusion XL checkpoint. To use IP-Adapter with a different pipeline, all you need to do is to run `load_ip_adapter()` method after you create the pipeline, and then pass your image to the pipeline as `ip_adapter_image` + + + +🤗 Diffusers currently only supports using IP-Adapter with some of the most popular pipelines, feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you have a cool use-case and require integrating IP-adapters with a pipeline that does not support it yet! + + + +You can find below examples on how to use IP-Adapter with ControlNet and AnimateDiff. + + + + +``` +from diffusers import StableDiffusionControlNetPipeline, ControlNetModel +import torch +from diffusers.utils import load_image + +controlnet_model_path = "lllyasviel/control_v11f1p_sd15_depth" +controlnet = ControlNetModel.from_pretrained(controlnet_model_path, torch_dtype=torch.float16) + +pipeline = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16) +pipeline.to("cuda") + +image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/statue.png") +depth_map = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/depth.png") + +pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") + +generator = torch.Generator(device="cpu").manual_seed(33) +images = pipeline( + prompt='best quality, high quality', + image=depth_map, + ip_adapter_image=image, + negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", + num_inference_steps=50, + generator=generator, +).images +images[0] +``` +
+[figure: input image]
+[figure: adapted image]
+
+ + +```py +# animate diff + ip adapter +import torch +from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler +from diffusers.utils import export_to_gif, load_image + +# Load the motion adapter +adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16) +# load SD 1.5 based finetuned model +model_id = "Lykon/DreamShaper" +pipe = AnimateDiffPipeline.from_pretrained(model_id, motion_adapter=adapter, torch_dtype=torch.float16) + +# scheduler +scheduler = DDIMScheduler( + clip_sample=False, + beta_start=0.00085, + beta_end=0.012, + beta_schedule="linear", + timestep_spacing="trailing", + steps_offset=1 +) +pipe.scheduler = scheduler + +# enable memory savings +pipe.enable_vae_slicing() +pipe.enable_model_cpu_offload() + +# load ip_adapter +pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") + +# load motion adapters +pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-out", adapter_name="zoom-out") +pipe.load_lora_weights("guoyww/animatediff-motion-lora-tilt-up", adapter_name="tilt-up") +pipe.load_lora_weights("guoyww/animatediff-motion-lora-pan-left", adapter_name="pan-left") + +seed = 42 +image = load_image("https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png") +images = [image] * 3 +prompts = ["best quality, high quality"] * 3 +negative_prompt = "bad quality, worst quality" +adapter_weights = [[0.75, 0.0, 0.0], [0.0, 0.0, 0.75], [0.0, 0.75, 0.75]] + +# generate +output_frames = [] +for prompt, image, adapter_weight in zip(prompts, images, adapter_weights): + pipe.set_adapters(["zoom-out", "tilt-up", "pan-left"], adapter_weights=adapter_weight) + output = pipe( + prompt= prompt, + num_frames=16, + guidance_scale=7.5, + num_inference_steps=30, + ip_adapter_image = image, + generator=torch.Generator("cpu").manual_seed(seed), + ) + frames = output.frames[0] + output_frames.extend(frames) + +export_to_gif(output_frames, "test_out_animation.gif") +``` + + +
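
The same two-step pattern extends to the SDXL ControlNet pipeline, which gains `ip_adapter_image` support in this patch series. The sketch below is illustrative only: it assumes a Canny ControlNet checkpoint such as `diffusers/controlnet-canny-sdxl-1.0`, reuses the SDXL IP-Adapter weights shown earlier, and uses a placeholder URL for the conditioning image.

```py
from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel
from diffusers.utils import load_image
import torch

controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
)
pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")

# `image` drives the ControlNet, `ip_adapter_image` drives the IP-Adapter
canny_image = load_image("https://path/to/canny_edge_map.png")  # placeholder conditioning image
ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/statue.png")

generator = torch.Generator(device="cpu").manual_seed(33)
images = pipeline(
    prompt="best quality, high quality",
    image=canny_image,
    ip_adapter_image=ip_image,
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    num_inference_steps=30,
    generator=generator,
).images
images[0]
```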
+ From f46c2e46d7ff4803493d96eed9a4250bc80b36ca Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 16 Nov 2023 01:10:46 +0000 Subject: [PATCH 124/139] fix circular import --- src/diffusers/loaders/unet.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/diffusers/loaders/unet.py b/src/diffusers/loaders/unet.py index a642bd1cb649..4f26dad882fa 100644 --- a/src/diffusers/loaders/unet.py +++ b/src/diffusers/loaders/unet.py @@ -21,12 +21,6 @@ import torch.nn.functional as F from torch import nn -from ..models.attention_processor import ( - AttnProcessor, - AttnProcessor2_0, - IPAdapterAttnProcessor, - IPAdapterAttnProcessor2_0, -) from ..models.embeddings import ImageProjection from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta from ..utils import ( @@ -578,6 +572,13 @@ def delete_adapters(self, adapter_names: Union[List[str], str]): self.peft_config.pop(adapter_name, None) def _load_ip_adapter_weights(self, state_dict): + from ..models.attention_processor import ( + AttnProcessor, + AttnProcessor2_0, + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, + ) + # set ip-adapter cross-attention processors & load state_dict attn_procs = {} key_id = 1 From 3203eeb6dcce530b00f572c74f7851f294ae2397 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 16 Nov 2023 01:20:42 +0000 Subject: [PATCH 125/139] fix --- src/diffusers/loaders/ip_adapter.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index 0f8a99860a58..e1905aee32ca 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -17,10 +17,6 @@ import safetensors import torch -from ..models.attention_processor import ( - IPAdapterAttnProcessor, - IPAdapterAttnProcessor2_0, -) from ..utils import ( DIFFUSERS_CACHE, HF_HUB_OFFLINE, @@ -145,6 +141,11 @@ def load_ip_adapter( self.unet._load_ip_adapter_weights(state_dict) def set_ip_adapter_scale(self, scale): + from ..models.attention_processor import ( + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, + ) + for attn_processor in self.unet.attn_processors.values(): if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): attn_processor.scale = scale From b40e94fc88ff22ca947e13cff8975105acac08e8 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 16 Nov 2023 01:53:49 +0000 Subject: [PATCH 126/139] fix --- src/diffusers/loaders/ip_adapter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index e1905aee32ca..45f7d06c9d35 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -32,6 +32,11 @@ CLIPVisionModelWithProjection, ) + from ..models.attention_processor import ( + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, + ) + logger = logging.get_logger(__name__) @@ -141,11 +146,6 @@ def load_ip_adapter( self.unet._load_ip_adapter_weights(state_dict) def set_ip_adapter_scale(self, scale): - from ..models.attention_processor import ( - IPAdapterAttnProcessor, - IPAdapterAttnProcessor2_0, - ) - for attn_processor in self.unet.attn_processors.values(): if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): attn_processor.scale = scale From fae2a05b4728e1b2035385a174951bbda51145eb Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 16 Nov 2023 02:25:24 +0000 Subject: [PATCH 127/139] fix --- 
src/diffusers/loaders/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/loaders/__init__.py b/src/diffusers/loaders/__init__.py index 90d4e297e8e8..684736856029 100644 --- a/src/diffusers/loaders/__init__.py +++ b/src/diffusers/loaders/__init__.py @@ -73,6 +73,7 @@ def text_encoder_attn_modules(text_encoder): from .utils import AttnProcsLayers if is_transformers_available(): + from .ip_adapter import IPAdapterMixin from .lora import LoraLoaderMixin, StableDiffusionXLLoraLoaderMixin from .single_file import FromSingleFileMixin from .textual_inversion import TextualInversionLoaderMixin From 8fe9798d13360276b698d6775b9f6630c33b3d8e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 16 Nov 2023 03:01:16 +0000 Subject: [PATCH 128/139] fix --- src/diffusers/models/lora.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/diffusers/models/lora.py b/src/diffusers/models/lora.py index d1e90a4a7e93..9edec19a3a34 100644 --- a/src/diffusers/models/lora.py +++ b/src/diffusers/models/lora.py @@ -77,8 +77,6 @@ def text_encoder_lora_state_dict(text_encoder): def adjust_lora_scale_text_encoder(text_encoder, lora_scale: float = 1.0): - from ..loaders import PatchedLoraProjection, text_encoder_attn_modules, text_encoder_mlp_modules - for _, attn_module in text_encoder_attn_modules(text_encoder): if isinstance(attn_module.q_proj, PatchedLoraProjection): attn_module.q_proj.lora_scale = lora_scale From f97a7971c173d771daa1773733c0c5013b2461dd Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 15 Nov 2023 17:12:38 -1000 Subject: [PATCH 129/139] Update src/diffusers/pipelines/pipeline_utils.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/pipeline_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 34002d50a766..32e705ecc8c6 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -657,9 +657,7 @@ def is_saveable_module(name, value): return True model_index_dict = {k: v for k, v in model_index_dict.items() if is_saveable_module(k, v)} - print(f"From save_pretrained: {model_index_dict.keys()}") for pipeline_component_name in model_index_dict.keys(): - print(f"From save_pretrained: {pipeline_component_name}") sub_model = getattr(self, pipeline_component_name) model_cls = sub_model.__class__ From c6078788e287d5861a59d30fed7042347da8c494 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 15 Nov 2023 17:12:54 -1000 Subject: [PATCH 130/139] Update src/diffusers/pipelines/pipeline_utils.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/pipeline_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 32e705ecc8c6..4c607c550612 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -516,7 +516,6 @@ def load_sub_model( loading_kwargs["low_cpu_mem_usage"] = False # check if the module is in a subdirectory - print(f"From loading module: cached_folder: {cached_folder} name: {name}") if os.path.isdir(os.path.join(cached_folder, name)): loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs) else: From 2c2c607e95a287909fabc3c9db1f279111262f53 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 16 Nov 2023 09:20:25 +0000 Subject: [PATCH 131/139] update tests --- tests/models/test_models_unet_2d_condition.py | 85 +++++++- 
.../test_ip_adapter_stable_diffusion.py} | 193 ------------------ 2 files changed, 84 insertions(+), 194 deletions(-) rename tests/{models/test_ip_adapters.py => pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py} (56%) diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index 0db336a88029..db9c2f8a53bc 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -24,7 +24,8 @@ from pytest import mark from diffusers import UNet2DConditionModel -from diffusers.models.attention_processor import CustomDiffusionAttnProcessor +from diffusers.models.attention_processor import CustomDiffusionAttnProcessor, IPAdapterAttnProcessor +from diffusers.models.embeddings import ImageProjection from diffusers.utils import logging from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( @@ -45,6 +46,58 @@ enable_full_determinism() +def create_ip_adapter_state_dict(model): + # "ip_adapter" (cross-attention weights) + ip_cross_attn_state_dict = {} + sd = model.state_dict() + key_id = 1 + + for name in model.attn_processors.keys(): + cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim + if name.startswith("mid_block"): + hidden_size = model.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(model.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = model.config.block_out_channels[block_id] + if cross_attention_dim is not None: + sd = IPAdapterAttnProcessor( + hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0 + ).state_dict() + ip_cross_attn_state_dict.update( + { + f"{key_id}.to_k_ip.weight": sd["to_k_ip.weight"], + f"{key_id}.to_v_ip.weight": sd["to_v_ip.weight"], + } + ) + + key_id += 2 + + # "image_proj" (ImageProjection layer weights) + cross_attention_dim = model.config["cross_attention_dim"] + image_projection = ImageProjection( + cross_attention_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, num_image_text_embeds=4 + ) + + ip_image_projection_state_dict = {} + sd = image_projection.state_dict() + ip_image_projection_state_dict.update( + { + "proj.weight": sd["image_embeds.weight"], + "proj.bias": sd["image_embeds.bias"], + "norm.weight": sd["norm.weight"], + "norm.bias": sd["norm.bias"], + } + ) + + del sd + ip_state_dict = {} + ip_state_dict.update({"image_proj": ip_image_projection_state_dict, "ip_adapter": ip_cross_attn_state_dict}) + return ip_state_dict + + def create_custom_diffusion_layers(model, mock_weights: bool = True): train_kv = True train_q_out = True @@ -622,6 +675,36 @@ def test_asymmetrical_unet(self): # Check if input and output shapes are the same self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") + def test_ip_adapter(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + init_dict["attention_head_dim"] = (8, 16) + + model = self.model_class(**init_dict) + model.to(torch_device) + + with torch.no_grad(): + sample1 = model(**inputs_dict).sample + + ip_adapter_state_dict = create_ip_adapter_state_dict(model) + model._load_ip_adapter_weights(ip_adapter_state_dict) + + assert model.config.encoder_hid_dim_type == "ip_image_proj" + assert model.encoder_hid_proj is not None + + assert ( + 
model.down_blocks[0].attentions[0].transformer_blocks[0].attn2.processor.__class__.__name__ + == "IPAdapterAttnProcessor" + ) + + batch_size = inputs_dict["encoder_hidden_states"].shape[0] + image_embeds = floats_tensor((batch_size, 1, model.cross_attention_dim)).to(torch_device) + + with torch.no_grad(): + sample2 = model(**{**inputs_dict, "added_cond_kwargs": {"image_embeds": image_embeds}}).sample + + assert not sample1.allclose(sample2, atol=1e-4, rtol=1e-4) + @slow class UNet2DConditionModelIntegrationTests(unittest.TestCase): diff --git a/tests/models/test_ip_adapters.py b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py similarity index 56% rename from tests/models/test_ip_adapters.py rename to tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py index e06bf6e8350c..57eb49013c1f 100644 --- a/tests/models/test_ip_adapters.py +++ b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py @@ -18,34 +18,19 @@ import numpy as np import torch -import torch.nn.functional as F from transformers import ( CLIPImageProcessor, - CLIPTextConfig, - CLIPTextModel, - CLIPTokenizer, - CLIPVisionConfig, CLIPVisionModelWithProjection, ) from diffusers import ( - AutoencoderKL, - DDIMScheduler, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, StableDiffusionPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline, StableDiffusionXLPipeline, - UNet2DConditionModel, ) -from diffusers.models.attention_processor import ( - AttnProcessor, - AttnProcessor2_0, - IPAdapterAttnProcessor, - IPAdapterAttnProcessor2_0, -) -from diffusers.models.embeddings import ImageProjection from diffusers.utils import load_image from diffusers.utils.testing_utils import ( enable_full_determinism, @@ -58,184 +43,6 @@ enable_full_determinism() -class IPAdapterFastTests(unittest.TestCase): - hidden_dim = 32 - num_image_text_embeds = 4 - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(4, 8), - layers_per_block=1, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=self.hidden_dim, - norm_num_groups=2, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[4, 8], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - norm_num_groups=2, - ) - - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=self.hidden_dim, - intermediate_size=64, - layer_norm_eps=1e-05, - num_attention_heads=8, - num_hidden_layers=3, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - torch.manual_seed(0) - image_encoder_config = CLIPVisionConfig( - hidden_size=self.hidden_dim, - projection_dim=self.hidden_dim, - num_hidden_layers=5, - num_attention_heads=4, - image_size=32, - intermediate_size=37, - patch_size=1, - ) - image_encoder = CLIPVisionModelWithProjection(image_encoder_config) - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - 
"safety_checker": None, - "feature_extractor": None, - "image_encoder": image_encoder, - } - return components - - def get_dummy_inputs(self, device, seed=0, with_image=False): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "np", - } - if with_image: - inputs.update({"ip_adapter_image": torch.randn(1, 3, 32, 32, generator=generator)}) - return inputs - - def get_attn_procs_for_ip_adapter(self, unet): - # Cross-attention modules. - attn_procs = {} - for name in unet.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = unet.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(unet.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = unet.config.block_out_channels[block_id] - if cross_attention_dim is None: - attn_processor_class = ( - AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor - ) - attn_procs[name] = attn_processor_class() - else: - attn_processor_class = ( - IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor - ) - attn_procs[name] = attn_processor_class( - hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0 - ).to(dtype=unet.dtype, device=unet.device) - return attn_procs - - def get_ip_adapter_state_dict(self, unet): - # Image projection module. - image_projection = ImageProjection( - cross_attention_dim=self.hidden_dim, image_embed_dim=self.hidden_dim, num_image_text_embeds=4 - ) - - # Attention modules. - attn_procs = self.get_attn_procs_for_ip_adapter(unet) - - # Rename the keys. - cross_attention_params = {} - key_id = 1 - for key, value in attn_procs.items(): - if isinstance(attn_procs[key], torch.nn.Module): - current_sd = attn_procs[key].state_dict() - current_sd = {f"{key_id}.{k}": v for k, v in current_sd.items()} - cross_attention_params.update(current_sd) - key_id += 2 - - # Make it compatible. - image_projection_sd = image_projection.state_dict() - new_image_projection_sd = {} - for k in image_projection_sd: - if "image_embeds" in k: - new_k = k.replace("image_embeds", "proj") - else: - new_k = k - new_image_projection_sd.update({new_k: image_projection_sd[k]}) - - # Final. 
- final_state_dict = {} - final_state_dict.update({"image_proj": new_image_projection_sd, "ip_adapter": cross_attention_params}) - return final_state_dict - - def test_inference_fast(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - ip_adapter_state_dict = self.get_ip_adapter_state_dict(components["unet"]) - sd_pipe.load_ip_adapter(ip_adapter_state_dict) - inputs = self.get_dummy_inputs(device, with_image=True) - output_ip_adapter = sd_pipe(**inputs).images - - assert output_ip_adapter.shape == (1, 64, 64, 3) - - assert not np.allclose(image_slice, output_ip_adapter[0, -3:, -3:, -1], atol=1e-4, rtol=1e-4) - - class IPAdapterNightlyTestsMixin(unittest.TestCase): dtype = torch.float16 From dc1b7eb5e9e276b299707930de693266e3938a12 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 16 Nov 2023 09:47:12 +0000 Subject: [PATCH 132/139] style --- tests/models/test_models_unet_2d_condition.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index db9c2f8a53bc..4f99cd00a929 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -692,9 +692,9 @@ def test_ip_adapter(self): assert model.config.encoder_hid_dim_type == "ip_image_proj" assert model.encoder_hid_proj is not None - assert ( - model.down_blocks[0].attentions[0].transformer_blocks[0].attn2.processor.__class__.__name__ - == "IPAdapterAttnProcessor" + assert model.down_blocks[0].attentions[0].transformer_blocks[0].attn2.processor.__class__.__name__ in ( + "IPAdapterAttnProcessor", + "IPAdapterAttnProcessor2_0", ) batch_size = inputs_dict["encoder_hidden_states"].shape[0] From dd67bcdfba9b741e3f18471284b450518c9ba485 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 18 Nov 2023 00:11:59 +0000 Subject: [PATCH 133/139] text_context_len -> num_tokens --- src/diffusers/models/attention_processor.py | 26 +++++++++++---------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 8fffd1c09645..6b86ba66db37 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -1984,18 +1984,18 @@ class IPAdapterAttnProcessor(nn.Module): The hidden size of the attention layer. cross_attention_dim (`int`): The number of channels in the `encoder_hidden_states`. - text_context_len (`int`, defaults to 77): - The context length of the text features. + num_tokens (`int`, defaults to 4): + The context length of the image features. scale (`float`, defaults to 1.0): the weight scale of image prompt. 
""" - def __init__(self, hidden_size, cross_attention_dim=None, text_context_len=77, scale=1.0): + def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=4, scale=1.0): super().__init__() self.hidden_size = hidden_size self.cross_attention_dim = cross_attention_dim - self.text_context_len = text_context_len + self.num_tokens = num_tokens self.scale = scale self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) @@ -2039,9 +2039,10 @@ def __call__( encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) # split hidden states + end_pos = encoder_hidden_states.shape[1] - self.num_tokens encoder_hidden_states, ip_hidden_states = ( - encoder_hidden_states[:, : self.text_context_len, :], - encoder_hidden_states[:, self.text_context_len :, :], + encoder_hidden_states[:, :end_pos, :], + encoder_hidden_states[:, end_pos:, :], ) key = attn.to_k(encoder_hidden_states) @@ -2093,13 +2094,13 @@ class IPAdapterAttnProcessor2_0(torch.nn.Module): The hidden size of the attention layer. cross_attention_dim (`int`): The number of channels in the `encoder_hidden_states`. - text_context_len (`int`, defaults to 77): - The context length of the text features. + num_tokens (`int`, defaults to 4): + The context length of the image features. scale (`float`, defaults to 1.0): the weight scale of image prompt. """ - def __init__(self, hidden_size, cross_attention_dim=None, text_context_len=77, scale=1.0): + def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=4, scale=1.0): super().__init__() if not hasattr(F, "scaled_dot_product_attention"): @@ -2109,7 +2110,7 @@ def __init__(self, hidden_size, cross_attention_dim=None, text_context_len=77, s self.hidden_size = hidden_size self.cross_attention_dim = cross_attention_dim - self.text_context_len = text_context_len + self.num_tokens = num_tokens self.scale = scale self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) @@ -2158,9 +2159,10 @@ def __call__( encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) # split hidden states + end_pos = encoder_hidden_states.shape[1] - self.num_tokens encoder_hidden_states, ip_hidden_states = ( - encoder_hidden_states[:, : self.text_context_len, :], - encoder_hidden_states[:, self.text_context_len :, :], + encoder_hidden_states[:, :end_pos, :], + encoder_hidden_states[:, end_pos:, :], ) key = attn.to_k(encoder_hidden_states) From d4edc4e536e0a98dfe27a78a0fe9d3e531d3cd06 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 18 Nov 2023 00:28:28 +0000 Subject: [PATCH 134/139] fix --- src/diffusers/pipelines/controlnet/pipeline_controlnet.py | 2 +- src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 2473d45bd44d..193352edbe0b 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -967,7 +967,7 @@ def __call__( if ip_adapter_image is not None: image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) - if do_classifier_free_guidance: + if self.do_classifier_free_guidance: image_embeds = torch.cat([negative_image_embeds, image_embeds]) # 4. 
Prepare image diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index 6d395a79357b..cf7de7b02601 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -1118,7 +1118,7 @@ def __call__( # 3.2 Encode ip_adapter_image if ip_adapter_image is not None: image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) - if do_classifier_free_guidance: + if self.do_classifier_free_guidance: image_embeds = torch.cat([negative_image_embeds, image_embeds]) # 4. Prepare image From 75022d0e2439610e7628dcdf0c292e54b4633d5e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 18 Nov 2023 00:53:13 +0000 Subject: [PATCH 135/139] support safetensors + make subfolder and weight_name required argument --- src/diffusers/loaders/ip_adapter.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index 45f7d06c9d35..32c558554be2 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -14,8 +14,8 @@ import os from typing import Dict, Union -import safetensors import torch +from safetensors import safe_open from ..utils import ( DIFFUSERS_CACHE, @@ -46,6 +46,8 @@ class IPAdapterMixin: def load_ip_adapter( self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], + subfolder: str, + weight_name: str, **kwargs, ): """ @@ -93,8 +95,6 @@ def load_ip_adapter( local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) use_auth_token = kwargs.pop("use_auth_token", None) revision = kwargs.pop("revision", None) - subfolder = kwargs.pop("subfolder", None) - weight_name = kwargs.pop("weight_name", None) user_agent = { "file_type": "attn_procs_weights", @@ -116,7 +116,13 @@ def load_ip_adapter( user_agent=user_agent, ) if weight_name.endswith(".safetensors"): - state_dict = safetensors.torch.load_file(model_file, device="cpu") + state_dict = {"image_proj": {}, "ip_adapter": {}} + with safe_open(model_file, framework="pt", device="cpu") as f: + for key in f.keys(): + if key.startswith("image_proj."): + state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key) + elif key.startswith("ip_adapter."): + state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key) else: state_dict = torch.load(model_file, map_location="cpu") else: From aaba4d473198893977942eda65a6eaa99f23f14e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 20 Nov 2023 20:14:45 +0000 Subject: [PATCH 136/139] make aggresive tests --- tests/models/test_models_unet_2d_condition.py | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index 4f99cd00a929..06bf2685560d 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -49,7 +49,6 @@ def create_ip_adapter_state_dict(model): # "ip_adapter" (cross-attention weights) ip_cross_attn_state_dict = {} - sd = model.state_dict() key_id = 1 for name in model.attn_processors.keys(): @@ -683,27 +682,47 @@ def test_ip_adapter(self): model = self.model_class(**init_dict) model.to(torch_device) + # forward pass without ip-adapter with torch.no_grad(): sample1 = model(**inputs_dict).sample - ip_adapter_state_dict = 
create_ip_adapter_state_dict(model) - model._load_ip_adapter_weights(ip_adapter_state_dict) + # update inputs_dict for ip-adapter + batch_size = inputs_dict["encoder_hidden_states"].shape[0] + image_embeds = floats_tensor((batch_size, 1, model.cross_attention_dim)).to(torch_device) + inputs_dict["added_cond_kwargs"] = {"image_embeds": image_embeds} + + # make ip_adapter_1 and ip_adapter_2 + ip_adapter_1 = create_ip_adapter_state_dict(model) + image_proj_state_dict_2 = {k: w + 1.0 for k, w in ip_adapter_1["image_proj"].items()} + cross_attn_state_dict_2 = {k: w + 1.0 for k, w in ip_adapter_1["ip_adapter"].items()} + ip_adapter_2 = {} + ip_adapter_2.update({"image_proj": image_proj_state_dict_2, "ip_adapter": cross_attn_state_dict_2}) + + # forward pass ip_adapter_1 + model._load_ip_adapter_weights(ip_adapter_1) assert model.config.encoder_hid_dim_type == "ip_image_proj" assert model.encoder_hid_proj is not None - assert model.down_blocks[0].attentions[0].transformer_blocks[0].attn2.processor.__class__.__name__ in ( "IPAdapterAttnProcessor", "IPAdapterAttnProcessor2_0", ) + with torch.no_grad(): + sample2 = model(**inputs_dict).sample - batch_size = inputs_dict["encoder_hidden_states"].shape[0] - image_embeds = floats_tensor((batch_size, 1, model.cross_attention_dim)).to(torch_device) + # forward pass with ip_adapter_2 + model._load_ip_adapter_weights(ip_adapter_2) + with torch.no_grad(): + sample3 = model(**inputs_dict).sample + # forward pass with ip_adapter_1 again + model._load_ip_adapter_weights(ip_adapter_1) with torch.no_grad(): - sample2 = model(**{**inputs_dict, "added_cond_kwargs": {"image_embeds": image_embeds}}).sample + sample4 = model(**inputs_dict).sample assert not sample1.allclose(sample2, atol=1e-4, rtol=1e-4) + assert not sample2.allclose(sample3, atol=1e-4, rtol=1e-4) + assert sample2.allclose(sample4, atol=1e-4, rtol=1e-4) @slow From 2fd1685860de9c308056ca3fafc9fd75ba00e794 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 20 Nov 2023 20:44:14 +0000 Subject: [PATCH 137/139] copies --- src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py | 1 - .../pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py | 1 - 2 files changed, 2 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index a2a12d166f2e..4e166575fdc0 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -109,7 +109,6 @@ class AltDiffusionPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ - model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 6c341f302593..563c1d963ed5 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -145,7 +145,6 @@ class AltDiffusionImg2ImgPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
""" - model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] From 0162a4536ac37a29d1ccf4d1e7171915c2269f04 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 20 Nov 2023 20:51:08 +0000 Subject: [PATCH 138/139] add --- src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py | 1 + .../pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index 7fe26499518b..843e3b8b9410 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -109,6 +109,7 @@ class AltDiffusionPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 9334f4a1e17f..b196ac4d3f69 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -145,6 +145,7 @@ class AltDiffusionImg2ImgPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] From 304c790ded2f3a2b95b57fe5726ebe76ee114e1b Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 20 Nov 2023 21:00:34 +0000 Subject: [PATCH 139/139] fix --- .../pipelines/versatile_diffusion/modeling_text_unet.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 8e9a6db21a77..a940cec5e46a 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -1224,8 +1224,7 @@ def forward( elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj": if "image_embeds" not in added_cond_kwargs: raise ValueError( - f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which" - " requires the keyword argument `image_embeds` to be passed in `added_conditions`" + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" ) image_embeds = added_cond_kwargs.get("image_embeds") image_embeds = self.encoder_hid_proj(image_embeds).to(encoder_hidden_states.dtype)