
Commit 6e420c7

Merge branch 'dev' into extra-networks-buttons
2 parents: d7f4847 + d44b8aa

94 files changed, +1352 −609 lines changed


.eslintrc.js (+2)

@@ -78,6 +78,8 @@ module.exports = {
         //extraNetworks.js
         requestGet: "readonly",
         popup: "readonly",
+        // profilerVisualization.js
+        createVisualizationTable: "readonly",
         // from python
         localization: "readonly",
         // progrssbar.js

.github/workflows/on_pull_request.yaml (+5 −5)

@@ -11,16 +11,16 @@ jobs:
     if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name
     steps:
       - name: Checkout Code
-        uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
+        uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
         with:
           python-version: 3.11
           # NB: there's no cache: pip here since we're not installing anything
           # from the requirements.txt file(s) in the repository; it's faster
           # not to have GHA download an (at the time of writing) 4 GB cache
           # of PyTorch and other dependencies.
       - name: Install Ruff
-        run: pip install ruff==0.1.6
+        run: pip install ruff==0.3.3
       - name: Run Ruff
         run: ruff .
   lint-js:
@@ -29,9 +29,9 @@ jobs:
     if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name
     steps:
       - name: Checkout Code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - name: Install Node.js
-        uses: actions/setup-node@v3
+        uses: actions/setup-node@v4
         with:
           node-version: 18
       - run: npm i --ci

.github/workflows/run_tests.yaml (+5 −5)

@@ -11,9 +11,9 @@ jobs:
     if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name
     steps:
       - name: Checkout Code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - name: Set up Python 3.10
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: 3.10.6
           cache: pip
@@ -22,7 +22,7 @@ jobs:
             launch.py
       - name: Cache models
         id: cache-models
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: models
           key: "2023-12-30"
@@ -68,13 +68,13 @@ jobs:
           python -m coverage report -i
           python -m coverage html -i
       - name: Upload main app output
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         if: always()
         with:
           name: output
           path: output.txt
       - name: Upload coverage HTML
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         if: always()
         with:
           name: htmlcov

.gitignore (+1)

@@ -38,3 +38,4 @@ notification.mp3
 /package-lock.json
 /.coverage*
 /test/test_outputs
+/cache

CHANGELOG.md (+9 −9)

@@ -14,7 +14,7 @@
 * Add support for DAT upscaler models ([#14690](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14690), [#15039](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/15039))
 * Extra Networks Tree View ([#14588](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14588), [#14900](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14900))
 * NPU Support ([#14801](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14801))
-* Propmpt comments support
+* Prompt comments support

 ### Minor:
 * Allow pasting in WIDTHxHEIGHT strings into the width/height fields ([#14296](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14296))
@@ -59,7 +59,7 @@
 * modules/api/api.py: add api endpoint to refresh embeddings list ([#14715](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14715))
 * set_named_arg ([#14773](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14773))
 * add before_token_counter callback and use it for prompt comments
-* ResizeHandleRow - allow overriden column scale parameter ([#15004](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/15004))
+* ResizeHandleRow - allow overridden column scale parameter ([#15004](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/15004))

 ### Performance
 * Massive performance improvement for extra networks directories with a huge number of files in them in an attempt to tackle #14507 ([#14528](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14528))
@@ -101,7 +101,7 @@
 * Gracefully handle mtime read exception from cache ([#14933](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14933))
 * Only trigger interrupt on `Esc` when interrupt button visible ([#14932](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14932))
 * Disable prompt token counters option actually disables token counting rather than just hiding results.
-* avoid doble upscaling in inpaint ([#14966](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14966))
+* avoid double upscaling in inpaint ([#14966](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14966))
 * Fix #14591 using translated content to do categories mapping ([#14995](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14995))
 * fix: the `split_threshold` parameter does not work when running Split oversized images ([#15006](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/15006))
 * Fix resize-handle for mobile ([#15010](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/15010), [#15065](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/15065))
@@ -171,7 +171,7 @@
 * infotext updates: add option to disregard certain infotext fields, add option to not include VAE in infotext, add explanation to infotext settings page, move some options to infotext settings page
 * add FP32 fallback support on sd_vae_approx ([#14046](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14046))
 * support XYZ scripts / split hires path from unet ([#14126](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14126))
-* allow use of mutiple styles csv files ([#14125](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14125))
+* allow use of multiple styles csv files ([#14125](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14125))
 * make extra network card description plaintext by default, with an option (Treat card description as HTML) to re-enable HTML as it was (originally by [#13241](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/13241))

 ### Extensions and API:
@@ -308,7 +308,7 @@
 * new samplers: Restart, DPM++ 2M SDE Exponential, DPM++ 2M SDE Heun, DPM++ 2M SDE Heun Karras, DPM++ 2M SDE Heun Exponential, DPM++ 3M SDE, DPM++ 3M SDE Karras, DPM++ 3M SDE Exponential ([#12300](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/12300), [#12519](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/12519), [#12542](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/12542))
 * rework DDIM, PLMS, UniPC to use CFG denoiser same as in k-diffusion samplers:
     * makes all of them work with img2img
-    * makes prompt composition posssible (AND)
+    * makes prompt composition possible (AND)
     * makes them available for SDXL
 * always show extra networks tabs in the UI ([#11808](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/11808))
 * use less RAM when creating models ([#11958](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/11958), [#12599](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/12599))
@@ -484,7 +484,7 @@
 * user metadata system for custom networks
 * extended Lora metadata editor: set activation text, default weight, view tags, training info
 * Lora extension rework to include other types of networks (all that were previously handled by LyCORIS extension)
-* show github stars for extenstions
+* show github stars for extensions
 * img2img batch mode can read extra stuff from png info
 * img2img batch works with subdirectories
 * hotkeys to move prompt elements: alt+left/right
@@ -703,7 +703,7 @@
 * do not wait for Stable Diffusion model to load at startup
 * add filename patterns: `[denoising]`
 * directory hiding for extra networks: dirs starting with `.` will hide their cards on extra network tabs unless specifically searched for
-* LoRA: for the `<...>` text in prompt, use name of LoRA that is in the metdata of the file, if present, instead of filename (both can be used to activate LoRA)
+* LoRA: for the `<...>` text in prompt, use name of LoRA that is in the metadata of the file, if present, instead of filename (both can be used to activate LoRA)
 * LoRA: read infotext params from kohya-ss's extension parameters if they are present and if his extension is not active
 * LoRA: fix some LoRAs not working (ones that have 3x3 convolution layer)
 * LoRA: add an option to use old method of applying LoRAs (producing same results as with kohya-ss)
@@ -733,7 +733,7 @@
 * fix gamepad navigation
 * make the lightbox fullscreen image function properly
 * fix squished thumbnails in extras tab
-* keep "search" filter for extra networks when user refreshes the tab (previously it showed everthing after you refreshed)
+* keep "search" filter for extra networks when user refreshes the tab (previously it showed everything after you refreshed)
 * fix webui showing the same image if you configure the generation to always save results into same file
 * fix bug with upscalers not working properly
 * fix MPS on PyTorch 2.0.1, Intel Macs
@@ -751,7 +751,7 @@
 * switch to PyTorch 2.0.0 (except for AMD GPUs)
 * visual improvements to custom code scripts
 * add filename patterns: `[clip_skip]`, `[hasprompt<>]`, `[batch_number]`, `[generation_number]`
-* add support for saving init images in img2img, and record their hashes in infotext for reproducability
+* add support for saving init images in img2img, and record their hashes in infotext for reproducibility
 * automatically select current word when adjusting weight with ctrl+up/down
 * add dropdowns for X/Y/Z plot
 * add setting: Stable Diffusion/Random number generator source: makes it possible to make images generated from a given manual seed consistent across different GPUs

README.md (+1)

@@ -98,6 +98,7 @@ Make sure the required [dependencies](https://github.com/AUTOMATIC1111/stable-di
 - [NVidia](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Install-and-Run-on-NVidia-GPUs) (recommended)
 - [AMD](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Install-and-Run-on-AMD-GPUs) GPUs.
 - [Intel CPUs, Intel GPUs (both integrated and discrete)](https://github.com/openvinotoolkit/stable-diffusion-webui/wiki/Installation-on-Intel-Silicon) (external wiki page)
+- [Ascend NPUs](https://github.com/wangshuai09/stable-diffusion-webui/wiki/Install-and-run-on-Ascend-NPUs) (external wiki page)

 Alternatively, use online services (like Google Colab):

_typos.toml (new file, +5)

@@ -0,0 +1,5 @@
+[default.extend-words]
+# Part of "RGBa" (Pillow's pre-multiplied alpha RGB mode)
+Ba = "Ba"
+# HSA is something AMD uses for their GPUs
+HSA = "HSA"

extensions-builtin/LDSR/sd_hijack_ddpm_v1.py (+4 −4)

@@ -301,7 +301,7 @@ def p_losses(self, x_start, t, noise=None):
         elif self.parameterization == "x0":
             target = x_start
         else:
-            raise NotImplementedError(f"Paramterization {self.parameterization} not yet supported")
+            raise NotImplementedError(f"Parameterization {self.parameterization} not yet supported")

         loss = self.get_loss(model_out, target, mean=False).mean(dim=[1, 2, 3])

@@ -880,7 +880,7 @@ def forward(self, x, c, *args, **kwargs):
     def apply_model(self, x_noisy, t, cond, return_ids=False):

         if isinstance(cond, dict):
-            # hybrid case, cond is exptected to be a dict
+            # hybrid case, cond is expected to be a dict
             pass
         else:
             if not isinstance(cond, list):
@@ -916,7 +916,7 @@ def apply_model(self, x_noisy, t, cond, return_ids=False):
                 cond_list = [{c_key: [c[:, :, :, :, i]]} for i in range(c.shape[-1])]

             elif self.cond_stage_key == 'coordinates_bbox':
-                assert 'original_image_size' in self.split_input_params, 'BoudingBoxRescaling is missing original_image_size'
+                assert 'original_image_size' in self.split_input_params, 'BoundingBoxRescaling is missing original_image_size'

                 # assuming padding of unfold is always 0 and its dilation is always 1
                 n_patches_per_row = int((w - ks[0]) / stride[0] + 1)
@@ -926,7 +926,7 @@ def apply_model(self, x_noisy, t, cond, return_ids=False):
                 num_downs = self.first_stage_model.encoder.num_resolutions - 1
                 rescale_latent = 2 ** (num_downs)

-                # get top left postions of patches as conforming for the bbbox tokenizer, therefore we
+                # get top left positions of patches as conforming for the bbbox tokenizer, therefore we
                 # need to rescale the tl patch coordinates to be in between (0,1)
                 tl_patch_coordinates = [(rescale_latent * stride[0] * (patch_nr % n_patches_per_row) / full_img_w,
                                          rescale_latent * stride[1] * (patch_nr // n_patches_per_row) / full_img_h)

extensions-builtin/Lora/lyco_helpers.py (+1 −1)

@@ -30,7 +30,7 @@ def factorization(dimension: int, factor:int=-1) -> tuple[int, int]:
     In LoRA with Kroneckor Product, first value is a value for weight scale.
     secon value is a value for weight.

-    Becuase of non-commutative property, A⊗B ≠ B⊗A. Meaning of two matrices is slightly different.
+    Because of non-commutative property, A⊗B ≠ B⊗A. Meaning of two matrices is slightly different.

     examples)
     factor

extensions-builtin/Lora/network.py (+33)

@@ -117,6 +117,12 @@ def __init__(self, net: Network, weights: NetworkWeights):

         if hasattr(self.sd_module, 'weight'):
             self.shape = self.sd_module.weight.shape
+        elif isinstance(self.sd_module, nn.MultiheadAttention):
+            # For now, only self-attn use Pytorch's MHA
+            # So assume all qkvo proj have same shape
+            self.shape = self.sd_module.out_proj.weight.shape
+        else:
+            self.shape = None

         self.ops = None
         self.extra_kwargs = {}
@@ -146,6 +152,9 @@ def __init__(self, net: Network, weights: NetworkWeights):
         self.alpha = weights.w["alpha"].item() if "alpha" in weights.w else None
         self.scale = weights.w["scale"].item() if "scale" in weights.w else None

+        self.dora_scale = weights.w.get("dora_scale", None)
+        self.dora_norm_dims = len(self.shape) - 1
+
     def multiplier(self):
         if 'transformer' in self.sd_key[:20]:
             return self.network.te_multiplier
@@ -160,6 +169,27 @@ def calc_scale(self):

         return 1.0

+    def apply_weight_decompose(self, updown, orig_weight):
+        # Match the device/dtype
+        orig_weight = orig_weight.to(updown.dtype)
+        dora_scale = self.dora_scale.to(device=orig_weight.device, dtype=updown.dtype)
+        updown = updown.to(orig_weight.device)
+
+        merged_scale1 = updown + orig_weight
+        merged_scale1_norm = (
+            merged_scale1.transpose(0, 1)
+            .reshape(merged_scale1.shape[1], -1)
+            .norm(dim=1, keepdim=True)
+            .reshape(merged_scale1.shape[1], *[1] * self.dora_norm_dims)
+            .transpose(0, 1)
+        )
+
+        dora_merged = (
+            merged_scale1 * (dora_scale / merged_scale1_norm)
+        )
+        final_updown = dora_merged - orig_weight
+        return final_updown
+
     def finalize_updown(self, updown, orig_weight, output_shape, ex_bias=None):
         if self.bias is not None:
             updown = updown.reshape(self.bias.shape)
@@ -175,6 +205,9 @@ def finalize_updown(self, updown, orig_weight, output_shape, ex_bias=None):
         if ex_bias is not None:
             ex_bias = ex_bias * self.multiplier()

+        if self.dora_scale is not None:
+            updown = self.apply_weight_decompose(updown, orig_weight)
+
         return updown * self.calc_scale() * self.multiplier(), ex_bias

     def calc_updown(self, target):
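For readers unfamiliar with DoRA: the `apply_weight_decompose` method added above rescales the merged weight (`orig_weight + updown`) so that each column matches the magnitude stored in `dora_scale`, then returns the difference as the new delta. A minimal standalone sketch of that column-norm rescaling on a toy Linear-shaped weight; the helper name and shapes below are illustrative, not part of the webui code:

```python
import torch

def dora_delta(updown, orig_weight, dora_scale):
    """Illustrative re-implementation of the rescaling done in apply_weight_decompose."""
    merged = orig_weight + updown                                    # W + deltaW, shape (out, in)
    # Norm of each input column of the merged weight, reshaped to broadcast as (1, in).
    norm = (
        merged.transpose(0, 1)
        .reshape(merged.shape[1], -1)
        .norm(dim=1, keepdim=True)
        .reshape(merged.shape[1], *[1] * (merged.dim() - 1))
        .transpose(0, 1)
    )
    # Rescale the merged weight to the learned magnitude, then return only the delta.
    return merged * (dora_scale / norm) - orig_weight

w = torch.randn(4, 3)                # toy original weight (out=4, in=3)
delta = 0.1 * torch.randn(4, 3)      # toy LoRA update
magnitude = torch.ones(1, 3)         # toy per-column DoRA scale
print(dora_delta(delta, w, magnitude).shape)  # torch.Size([4, 3])
```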

extensions-builtin/Lora/network_oft.py (+7 −7)

@@ -36,13 +36,6 @@ def __init__(self, net: network.Network, weights: network.NetworkWeights):
         # self.alpha is unused
         self.dim = self.oft_blocks.shape[1]  # (num_blocks, block_size, block_size)

-        # LyCORIS BOFT
-        if self.oft_blocks.dim() == 4:
-            self.is_boft = True
-        self.rescale = weights.w.get('rescale', None)
-        if self.rescale is not None:
-            self.rescale = self.rescale.reshape(-1, *[1]*(self.org_module[0].weight.dim() - 1))
-
         is_linear = type(self.sd_module) in [torch.nn.Linear, torch.nn.modules.linear.NonDynamicallyQuantizableLinear]
         is_conv = type(self.sd_module) in [torch.nn.Conv2d]
         is_other_linear = type(self.sd_module) in [torch.nn.MultiheadAttention]  # unsupported
@@ -54,6 +47,13 @@ def __init__(self, net: network.Network, weights: network.NetworkWeights):
         elif is_other_linear:
             self.out_dim = self.sd_module.embed_dim

+        # LyCORIS BOFT
+        if self.oft_blocks.dim() == 4:
+            self.is_boft = True
+        self.rescale = weights.w.get('rescale', None)
+        if self.rescale is not None and not is_other_linear:
+            self.rescale = self.rescale.reshape(-1, *[1]*(self.org_module[0].weight.dim() - 1))
+
         self.num_blocks = self.dim
         self.block_size = self.out_dim // self.dim
         self.constraint = (0 if self.alpha is None else self.alpha) * self.out_dim
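As a side note on the relocated block: the `reshape(-1, *[1]*(weight.dim() - 1))` pattern turns a per-output-channel rescale vector into a shape that broadcasts over the target weight, and the added `not is_other_linear` guard skips it for the unsupported `MultiheadAttention` case. A tiny illustrative sketch of that broadcasting reshape with toy tensors (not the webui module itself):

```python
import torch

conv_weight = torch.randn(16, 8, 3, 3)   # toy Conv2d weight: (out_channels, in, kh, kw)
rescale = torch.rand(16)                 # one scale factor per output channel

# Same reshape pattern as in network_oft.py: (16,) -> (16, 1, 1, 1)
rescale = rescale.reshape(-1, *[1] * (conv_weight.dim() - 1))
print((conv_weight * rescale).shape)     # torch.Size([16, 8, 3, 3])
```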

extensions-builtin/Lora/networks.py (+7 −4)

@@ -355,7 +355,7 @@ def network_apply_weights(self: Union[torch.nn.Conv2d, torch.nn.Linear, torch.nn
     """
     Applies the currently selected set of networks to the weights of torch layer self.
     If weights already have this particular set of networks applied, does nothing.
-    If not, restores orginal weights from backup and alters weights according to networks.
+    If not, restores original weights from backup and alters weights according to networks.
     """

     network_layer_name = getattr(self, 'network_layer_name', None)
@@ -429,9 +429,12 @@ def network_apply_weights(self: Union[torch.nn.Conv2d, torch.nn.Linear, torch.nn
     if isinstance(self, torch.nn.MultiheadAttention) and module_q and module_k and module_v and module_out:
         try:
             with torch.no_grad():
-                updown_q, _ = module_q.calc_updown(self.in_proj_weight)
-                updown_k, _ = module_k.calc_updown(self.in_proj_weight)
-                updown_v, _ = module_v.calc_updown(self.in_proj_weight)
+                # Send "real" orig_weight into MHA's lora module
+                qw, kw, vw = self.in_proj_weight.chunk(3, 0)
+                updown_q, _ = module_q.calc_updown(qw)
+                updown_k, _ = module_k.calc_updown(kw)
+                updown_v, _ = module_v.calc_updown(vw)
+                del qw, kw, vw
                 updown_qkv = torch.vstack([updown_q, updown_k, updown_v])
                 updown_out, ex_bias = module_out.calc_updown(self.out_proj.weight)

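For context on the q/k/v change above: when `torch.nn.MultiheadAttention` uses a single packed projection, `in_proj_weight` has shape `(3 * embed_dim, embed_dim)`, so `chunk(3, 0)` splits it into the individual query/key/value weights, each matching the shape of `out_proj.weight`. A quick check with toy sizes (illustrative only, not webui code):

```python
import torch.nn as nn

embed_dim, num_heads = 8, 2
mha = nn.MultiheadAttention(embed_dim, num_heads)   # packed q/k/v projection

print(mha.in_proj_weight.shape)          # torch.Size([24, 8]) == (3 * embed_dim, embed_dim)

qw, kw, vw = mha.in_proj_weight.chunk(3, 0)
print(qw.shape, kw.shape, vw.shape)      # each torch.Size([8, 8])
print(mha.out_proj.weight.shape)         # torch.Size([8, 8])
```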