@@ -58,16 +58,28 @@ def forward(self, x: Tensor) -> Tensor:
         return self.out_layer(self.silu(self.in_layer(x)))
 
 
+def rms_norm(x, normalized_shape, weight, eps):
+    if hasattr(torch, 'rms_norm'):  # torch 2.4
+        return torch.rms_norm(x, normalized_shape, weight, eps)
+
+    if x.dtype in [torch.bfloat16, torch.float32]:
+        n = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps) * weight
+    else:
+        n = torch.rsqrt(torch.mean(x.float()**2, dim=-1, keepdim=True) + eps).to(x.dtype) * weight
+    return x * n
+
+
 class RMSNorm(torch.nn.Module):
     def __init__(self, dim: int, dtype=None, device=None):
         super().__init__()
-        self.scale = nn.Parameter(torch.empty((dim), dtype=dtype, device=device))
+        self.scale = nn.Parameter(torch.ones((dim), dtype=dtype, device=device))
+        self.normalized_shape = [dim]
 
     def forward(self, x: Tensor):
-        x_dtype = x.dtype
-        x = x.float()
-        rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
-        return (x * rrms).to(dtype=x_dtype) * self.scale
+        if self.scale.dtype != x.dtype:
+            self.scale = nn.Parameter(self.scale.to(dtype=x.dtype), requires_grad=x.requires_grad)
+
+        return rms_norm(x, self.normalized_shape, self.scale, 1e-6)
 
 
 class QKNorm(torch.nn.Module):
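Not part of the diff: a minimal sketch of how the fallback branch of `rms_norm` above can be sanity-checked against the built-in `torch.rms_norm` on PyTorch builds (2.4+) that expose it. The tensor shapes and tolerance are arbitrary assumptions.

```python
import torch

# Manual RMSNorm, matching the bf16/fp32 branch of the fallback above.
def rms_norm_manual(x, weight, eps=1e-6):
    n = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps) * weight
    return x * n

x = torch.randn(2, 4, 64)  # (batch, tokens, dim) -- arbitrary sizes
w = torch.ones(64)         # scale initialized to ones, as in the new RMSNorm
manual = rms_norm_manual(x, w)

if hasattr(torch, "rms_norm"):  # only available on torch >= 2.4
    builtin = torch.rms_norm(x, [64], w, 1e-6)
    print(torch.allclose(manual, builtin, atol=1e-5))  # expected: True
```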
@@ -98,7 +110,9 @@ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, dtype=N
 
     def forward(self, x: Tensor, pe: Tensor) -> Tensor:
         qkv = self.qkv(x)
-        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        #q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        B, L, _ = qkv.shape
+        q, k, v = qkv.view(B, L, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
         q, k = self.norm(q, k, v)
         x = attention(q, k, v, pe=pe)
         x = self.proj(x)
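A small sketch (not from the commit) checking that the `view`/`permute` reshape used above produces the same tensor as the `einops.rearrange` call it replaces; the batch, sequence, head, and head-dim sizes are made up for illustration.

```python
import torch
from einops import rearrange

B, L, H, D = 2, 7, 8, 16
qkv = torch.randn(B, L, 3 * H * D)

# Old path: einops factors the last axis as (K, H, D) and reorders to K B H L D.
ref = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=H)
# New path: explicit view to (B, L, K, H, D), then permute to (K, B, H, L, D).
alt = qkv.view(B, L, 3, H, -1).permute(2, 0, 3, 1, 4)

print(torch.equal(ref, alt))  # True: same values and shape
```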
@@ -165,14 +179,18 @@ def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor):
         img_modulated = self.img_norm1(img)
         img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
         img_qkv = self.img_attn.qkv(img_modulated)
-        img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        #img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        B, L, _ = img_qkv.shape
+        img_q, img_k, img_v = img_qkv.view(B, L, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
         img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
 
         # prepare txt for attention
         txt_modulated = self.txt_norm1(txt)
         txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
         txt_qkv = self.txt_attn.qkv(txt_modulated)
-        txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        #txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        B, L, _ = txt_qkv.shape
+        txt_q, txt_k, txt_v = txt_qkv.view(B, L, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
         txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
 
         # run actual attention
@@ -238,7 +256,9 @@ def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
         x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
         qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
 
-        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        #q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        B, L, _ = qkv.shape
+        q, k, v = qkv.view(B, L, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
         q, k = self.norm(q, k, v)
 
         # compute attention
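For context, a shape sketch (assumed toy sizes, not from the diff) of the fused `linear1` output in this block: it splits into a QKV chunk and an MLP chunk, and the QKV chunk is reshaped the same way as above.

```python
import torch

B, L = 2, 5
hidden_size, num_heads, mlp_hidden_dim = 64, 4, 256  # assumed toy sizes

fused = torch.randn(B, L, 3 * hidden_size + mlp_hidden_dim)  # stand-in for linear1 output
qkv, mlp = torch.split(fused, [3 * hidden_size, mlp_hidden_dim], dim=-1)
q, k, v = qkv.view(B, L, 3, num_heads, -1).permute(2, 0, 3, 1, 4)

print(qkv.shape)  # torch.Size([2, 5, 192])
print(mlp.shape)  # torch.Size([2, 5, 256])
print(q.shape)    # torch.Size([2, 4, 5, 16])
```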