
Commit a9bc7ea

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent fe762e4 commit a9bc7ea
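
pre-commit.ci runs the repository's configured hooks against each push and commits whatever they change. Most per-line changes below look identical on both sides of the diff because the fixes are whitespace-only: trailing spaces stripped, or a missing final newline added. A minimal `.pre-commit-config.yaml` sketch that would produce this kind of auto-fix commit (the hook selection and pinned tag are assumptions; the repository's actual config may differ):

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.3.0  # hypothetical pin; use the repo's actual tag
    hooks:
      - id: trailing-whitespace  # strips trailing spaces (most +/- pairs below)
      - id: end-of-file-fixer    # ensures each file ends with exactly one newline

Running `pre-commit run --all-files` locally applies the same fixes before pushing.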

234 files changed: +35350 −20024 lines

.dockerignore

+1-1
@@ -1,3 +1,3 @@
 outputs/
 src/
-configs/webui/userconfig_streamlit.yaml
+configs/webui/userconfig_streamlit.yaml

.gitattributes

+1-1
@@ -1,4 +1,4 @@
 * text=auto
 *.{cmd,[cC][mM][dD]} text eol=crlf
 *.{bat,[bB][aA][tT]} text eol=crlf
-*.sh text eol=lf
+*.sh text eol=lf
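
These rules normalize line endings at checkout: Windows `cmd`/`bat` scripts get CRLF, shell scripts get LF on every platform. To verify which rule applies to a given path, `git check-attr eol -- run.sh` prints the effective attribute (the filename here is only an example).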

.github/ISSUE_TEMPLATE/bug_report.yml

+3-3
@@ -40,7 +40,7 @@ body:
   - type: dropdown
     id: os
     attributes:
-      label: Where are you running the webui?
+      label: Where are you running the webui?
       multiple: true
       options:
         - Windows
@@ -52,7 +52,7 @@ body:
     attributes:
      label: Custom settings
      description: If you are running the webui with specifi settings, please paste them here for reference (like --nitro)
-     render: shell
+     render: shell
   - type: textarea
     id: logs
     attributes:
@@ -66,4 +66,4 @@ body:
      description: By submitting this issue, you agree to follow our [Code of Conduct](https://docs.github.com/en/site-policy/github-terms/github-community-code-of-conduct)
      options:
        - label: I agree to follow this project's Code of Conduct
-         required: true
+         required: true

.github/PULL_REQUEST_TEMPLATE.md

+1-1
@@ -13,4 +13,4 @@ Closes: # (issue)
 - [ ] I have changed the base branch to `dev`
 - [ ] I have performed a self-review of my own code
 - [ ] I have commented my code in hard-to-understand areas
-- [ ] I have made corresponding changes to the documentation
+- [ ] I have made corresponding changes to the documentation

.github/workflows/deploy.yml

+1-1
@@ -37,4 +37,4 @@ jobs:
          # The GH actions bot is used by default if you didn't specify the two fields.
          # You can swap them out with your own user credentials.
          user_name: github-actions[bot]
-         user_email: 41898282+github-actions[bot]@users.noreply.github.com
+         user_email: 41898282+github-actions[bot]@users.noreply.github.com

.github/workflows/test-deploy.yml

+1-1
@@ -21,4 +21,4 @@ jobs:
      - name: Install dependencies
        run: yarn install
      - name: Test build website
-       run: yarn build
+       run: yarn build

README.md

+14-14
@@ -6,7 +6,7 @@

 ## Installation instructions for:

-- **[Windows](https://sygil-dev.github.io/sygil-webui/docs/Installation/windows-installation)**
+- **[Windows](https://sygil-dev.github.io/sygil-webui/docs/Installation/windows-installation)**
 - **[Linux](https://sygil-dev.github.io/sygil-webui/docs/Installation/linux-installation)**

 ### Want to ask a question or request a feature?
@@ -34,10 +34,10 @@ Check the [Contribution Guide](CONTRIBUTING.md)

 * Run additional upscaling models on CPU to save VRAM

-* Textual inversion: [Reaserch Paper](https://textual-inversion.github.io/)
+* Textual inversion: [Reaserch Paper](https://textual-inversion.github.io/)

 * K-Diffusion Samplers: A great collection of samplers to use, including:
-
+
   - `k_euler`
   - `k_lms`
   - `k_euler_a`
@@ -95,8 +95,8 @@ An easy way to work with Stable Diffusion right from your browser.
 To give a token (tag recognized by the AI) a specific or increased weight (emphasis), add `:0.##` to the prompt, where `0.##` is a decimal that will specify the weight of all tokens before the colon.
 Ex: `cat:0.30, dog:0.70` or `guy riding a bicycle :0.7, incoming car :0.30`

-Negative prompts can be added by using `###` , after which any tokens will be seen as negative.
-Ex: `cat playing with string ### yarn` will negate `yarn` from the generated image.
+Negative prompts can be added by using `###` , after which any tokens will be seen as negative.
+Ex: `cat playing with string ### yarn` will negate `yarn` from the generated image.

 Negatives are a very powerful tool to get rid of contextually similar or related topics, but **be careful when adding them since the AI might see connections you can't**, and end up outputting gibberish

@@ -131,7 +131,7 @@ Lets you improve faces in pictures using the GFPGAN model. There is a checkbox i

 If you want to use GFPGAN to improve generated faces, you need to install it separately.
 Download [GFPGANv1.4.pth](https://github.com/TencentARC/GFPGAN/releases/download/v1.3.4/GFPGANv1.4.pth) and put it
-into the `/sygil-webui/models/gfpgan` directory.
+into the `/sygil-webui/models/gfpgan` directory.

 ### RealESRGAN

@@ -141,7 +141,7 @@ Lets you double the resolution of generated images. There is a checkbox in every
 There is also a separate tab for using RealESRGAN on any picture.

 Download [RealESRGAN_x4plus.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth) and [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth).
-Put them into the `sygil-webui/models/realesrgan` directory.
+Put them into the `sygil-webui/models/realesrgan` directory.

 ### LSDR

@@ -174,8 +174,8 @@ which is available on [GitHub](https://github.com/CompVis/latent-diffusion). PDF

 [Stable Diffusion](#stable-diffusion-v1) is a latent text-to-image diffusion
 model.
-Thanks to a generous compute donation from [Stability AI](https://stability.ai/) and support from [LAION](https://laion.ai/), we were able to train a Latent Diffusion Model on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database.
-Similar to Google's [Imagen](https://arxiv.org/abs/2205.11487),
+Thanks to a generous compute donation from [Stability AI](https://stability.ai/) and support from [LAION](https://laion.ai/), we were able to train a Latent Diffusion Model on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database.
+Similar to Google's [Imagen](https://arxiv.org/abs/2205.11487),
 this model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts.
 With its 860M UNet and 123M text encoder, the model is relatively lightweight and runs on a GPU with at least 10GB VRAM.
 See [this section](#stable-diffusion-v1) below and the [model card](https://huggingface.co/CompVis/stable-diffusion).
@@ -184,26 +184,26 @@ See [this section](#stable-diffusion-v1) below and the [model card](https://hugg

 Stable Diffusion v1 refers to a specific configuration of the model
 architecture that uses a downsampling-factor 8 autoencoder with an 860M UNet
-and CLIP ViT-L/14 text encoder for the diffusion model. The model was pretrained on 256x256 images and
+and CLIP ViT-L/14 text encoder for the diffusion model. The model was pretrained on 256x256 images and
 then finetuned on 512x512 images.

 *Note: Stable Diffusion v1 is a general text-to-image diffusion model and therefore mirrors biases and (mis-)conceptions that are present
-in its training data.
+in its training data.
 Details on the training procedure and data, as well as the intended use of the model can be found in the corresponding [model card](https://huggingface.co/CompVis/stable-diffusion).

 ## Comments

 - Our code base for the diffusion models builds heavily on [OpenAI's ADM codebase](https://github.com/openai/guided-diffusion)
-and [https://github.com/lucidrains/denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch).
+and [https://github.com/lucidrains/denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch).
 Thanks for open-sourcing!

-- The implementation of the transformer encoder is from [x-transformers](https://github.com/lucidrains/x-transformers) by [lucidrains](https://github.com/lucidrains?tab=repositories).
+- The implementation of the transformer encoder is from [x-transformers](https://github.com/lucidrains/x-transformers) by [lucidrains](https://github.com/lucidrains?tab=repositories).

 ## BibTeX

 ```
 @misc{rombach2021highresolution,
-      title={High-Resolution Image Synthesis with Latent Diffusion Models},
+      title={High-Resolution Image Synthesis with Latent Diffusion Models},
       author={Robin Rombach and Andreas Blattmann and Dominik Lorenz and Patrick Esser and Björn Ommer},
       year={2021},
       eprint={2112.10752},

Stable_Diffusion_v1_Model_Card.md

+9-10
@@ -21,7 +21,7 @@ This model card focuses on the model associated with the Stable Diffusion model,

 # Uses

-## Direct Use
+## Direct Use
 The model is intended for research purposes only. Possible research areas and
 tasks include

@@ -68,11 +68,11 @@ Using the model to generate content that is cruel to individuals is a misuse of
 considerations.

 ### Bias
-While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases.
-Stable Diffusion v1 was trained on subsets of [LAION-2B(en)](https://laion.ai/blog/laion-5b/),
-which consists of images that are primarily limited to English descriptions.
-Texts and images from communities and cultures that use other languages are likely to be insufficiently accounted for.
-This affects the overall output of the model, as white and western cultures are often set as the default. Further, the
+While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases.
+Stable Diffusion v1 was trained on subsets of [LAION-2B(en)](https://laion.ai/blog/laion-5b/),
+which consists of images that are primarily limited to English descriptions.
+Texts and images from communities and cultures that use other languages are likely to be insufficiently accounted for.
+This affects the overall output of the model, as white and western cultures are often set as the default. Further, the
 ability of the model to generate content with non-English prompts is significantly worse than with English-language prompts.


@@ -84,7 +84,7 @@ The model developers used the following dataset for training the model:
 - LAION-2B (en) and subsets thereof (see next section)

 **Training Procedure**
-Stable Diffusion v1 is a latent diffusion model which combines an autoencoder with a diffusion model that is trained in the latent space of the autoencoder. During training,
+Stable Diffusion v1 is a latent diffusion model which combines an autoencoder with a diffusion model that is trained in the latent space of the autoencoder. During training,

 - Images are encoded through an encoder, which turns images into latent representations. The autoencoder uses a relative downsampling factor of 8 and maps images of shape H x W x 3 to latents of shape H/f x W/f x 4
 - Text prompts are encoded through a ViT-L/14 text-encoder.
@@ -108,12 +108,12 @@ filtered to images with an original size `>= 512x512`, estimated aesthetics scor
 - **Batch:** 32 x 8 x 2 x 4 = 2048
 - **Learning rate:** warmup to 0.0001 for 10,000 steps and then kept constant

-## Evaluation Results
+## Evaluation Results
 Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0,
 5.0, 6.0, 7.0, 8.0) and 50 PLMS sampling
 steps show the relative improvements of the checkpoints:

-![pareto](assets/v1-variants-scores.jpg)
+![pareto](assets/v1-variants-scores.jpg)

 Evaluated using 50 PLMS steps and 10000 random prompts from the COCO2017 validation set, evaluated at 512x512 resolution. Not optimized for FID scores.
 ## Environmental Impact
@@ -137,4 +137,3 @@ Based on that information, we estimate the following CO2 emissions using the [Ma
 }

 *This model card was written by: Robin Rombach and Patrick Esser and is based on the [DALL-E Mini model card](https://huggingface.co/dalle-mini/dalle-mini).*
-

Web_based_UI_for_Stable_Diffusion_colab.ipynb

+1-1
@@ -582,4 +582,4 @@
    "outputs": []
   }
  ]
-}
+}

blog/2022-10-20/1.Textual inversion usage competitio.md

+3-3
@@ -23,7 +23,7 @@ Hopefully demand will be high, we want to train **hundreds** of new concepts!

 # What does `most inventive use` mean?

-Whatever you want it to mean! be creative! experiment!
+Whatever you want it to mean! be creative! experiment!

 There are several categories we will look at:

@@ -33,7 +33,7 @@ There are several categories we will look at:

 * composition; meaning anything related to how big things are, their position, the angle, etc

-* styling;
+* styling;

 ![image](https://user-images.githubusercontent.com/106811348/197045629-029ba6f5-1f79-475c-9ce7-969aaf3d253b.png)

@@ -45,7 +45,7 @@ There are several categories we will look at:

 ## `The Sims(TM): Stable Diffusion edition` ?

-For this event the theme is “The Sims: Stable Diffusion edition”.
+For this event the theme is “The Sims: Stable Diffusion edition”.

 So we have selected a subset of [products from Amazon Berkely Objects dataset](https://github.com/sd-webui/abo).

configs/blip/bert_config.json

+1-1
@@ -17,5 +17,5 @@
   "type_vocab_size": 2,
   "vocab_size": 30522,
   "encoder_width": 768,
-  "add_cross_attention": true
+  "add_cross_attention": true
 }

configs/blip/caption_coco.yaml

+1-2
@@ -21,7 +21,7 @@ init_lr: 1e-5
 image_size: 384

 # generation configs
-max_length: 20
+max_length: 20
 min_length: 5
 num_beams: 3
 prompt: 'a picture of '
@@ -30,4 +30,3 @@ prompt: 'a picture of '
 weight_decay: 0.05
 min_lr: 0
 max_epoch: 5
-

configs/blip/med_config.json

+1-1
@@ -17,5 +17,5 @@
   "type_vocab_size": 2,
   "vocab_size": 30524,
   "encoder_width": 768,
-  "add_cross_attention": true
+  "add_cross_attention": true
 }

configs/blip/nlvr.yaml

+3-4
@@ -1,13 +1,13 @@
-image_root: '/export/share/datasets/vision/NLVR2/'
+image_root: '/export/share/datasets/vision/NLVR2/'
 ann_root: 'annotation'

 # set pretrained as a file path or an url
 pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth'

 #size of vit model; base or large
 vit: 'base'
-batch_size_train: 16
-batch_size_test: 64
+batch_size_train: 16
+batch_size_test: 64
 vit_grad_ckpt: False
 vit_ckpt_layer: 0
 max_epoch: 15
@@ -18,4 +18,3 @@ image_size: 384
 weight_decay: 0.05
 init_lr: 3e-5
 min_lr: 0
-

configs/blip/nocaps.yaml

+1-1
@@ -12,4 +12,4 @@ image_size: 384
 max_length: 20
 min_length: 5
 num_beams: 3
-prompt: 'a picture of '
+prompt: 'a picture of '

configs/blip/pretrain.yaml

+1-4
@@ -1,7 +1,7 @@
 train_file: ['/export/share/junnan-li/VL_pretrain/annotation/coco_karpathy_train.json',
              '/export/share/junnan-li/VL_pretrain/annotation/vg_caption.json',
             ]
-laion_path: ''
+laion_path: ''

 # size of vit model; base or large
 vit: 'base'
@@ -22,6 +22,3 @@ warmup_lr: 1e-6
 lr_decay_rate: 0.9
 max_epoch: 20
 warmup_steps: 3000
-
-
-
configs/blip/retrieval_coco.yaml

-1
@@ -31,4 +31,3 @@ negative_all_rank: True
 weight_decay: 0.05
 min_lr: 0
 max_epoch: 6
-
configs/blip/retrieval_flickr.yaml

-1
@@ -31,4 +31,3 @@ negative_all_rank: False
 weight_decay: 0.05
 min_lr: 0
 max_epoch: 6
-
configs/blip/retrieval_msrvtt.yaml

+1-1
@@ -9,4 +9,4 @@ vit: 'base'
 batch_size: 64
 k_test: 128
 image_size: 384
-num_frm_test: 8
+num_frm_test: 8
configs/blip/vqa.yaml

+3-3
@@ -8,8 +8,8 @@ pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/mo

 # size of vit model; base or large
 vit: 'base'
-batch_size_train: 16
-batch_size_test: 32
+batch_size_train: 16
+batch_size_test: 32
 vit_grad_ckpt: False
 vit_ckpt_layer: 0
 init_lr: 2e-5
@@ -22,4 +22,4 @@ inference: 'rank'
 # optimizer
 weight_decay: 0.05
 min_lr: 0
-max_epoch: 10
+max_epoch: 10

configs/latent-diffusion/celebahq-ldm-vq-4.yaml

+1-1
@@ -83,4 +83,4 @@ lightning:
     increase_log_steps: False

 trainer:
-  benchmark: True
+  benchmark: True

configs/latent-diffusion/cin-ldm-vq-f8.yaml

+1-1
@@ -95,4 +95,4 @@ lightning:
     increase_log_steps: False

 trainer:
-  benchmark: True
+  benchmark: True

configs/latent-diffusion/cin256-v2.yaml

+3-3
@@ -15,7 +15,7 @@ model:
     conditioning_key: crossattn
     monitor: val/loss
     use_ema: False
-
+
     unet_config:
       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
       params:
@@ -37,7 +37,7 @@ model:
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 512
-
+
     first_stage_config:
       target: ldm.models.autoencoder.VQModelInterface
       params:
@@ -59,7 +59,7 @@ model:
        dropout: 0.0
      lossconfig:
        target: torch.nn.Identity
-
+
     cond_stage_config:
       target: ldm.modules.encoders.modules.ClassEmbedder
       params:

configs/latent-diffusion/ffhq-ldm-vq-4.yaml

+1-1
@@ -82,4 +82,4 @@ lightning:
     increase_log_steps: False

 trainer:
-  benchmark: True
+  benchmark: True
