[Contribs] Add CPS

PaddlePaddle · Apr 18, 2023 · fc8aec5 · fc8aec5
2 parents facfad1 + f9f5979
commit fc8aec5
Show file tree

Hide file tree

Showing 20 changed files with 822 additions and 59 deletions.
diff --git a/contrib/CrossPseudoSupervision/README.md b/contrib/CrossPseudoSupervision/README.md
@@ -111,6 +111,7 @@ python -m paddle.distributed.launch --gpus="0,1,2,3" train.py --config ./configs
 | ---------- | ---- | ---- | ---- | ---- |
 | nepochs | 128  | 137  | 160  | 240  |
 
+
 ### Evaluation
 
 After training, execute the following commands to evaluate the model accuracy:

diff --git a/contrib/CrossPseudoSupervision/README_CN.md b/contrib/CrossPseudoSupervision/README_CN.md
@@ -4,7 +4,7 @@
 
 不同于图像分类任务，**数据的标注对于语义分割任务来说是比较困难且成本高昂的**。图像中的每个像素都需要有一个标签，包括一些特别细节的物体，如电线杆等。与对像素的密集标注相比，获取原始RGB数据相对简单。因此，**如何利用大量的无标注数据提升模型的性能，是半监督语义分割领域的研究热点**。
 
-[Cross pseudo supervision, CPS](https://arxiv.org/abs/2106.01226)是一种**简洁而高性能**的半监督语义分割任务算法。在训练时，使用两个相同结构、但是初始化状态不同的网络，添加约束**使得两个网络对同一样本的输出是相似的**。具体来说，一个网络生成的one-hot伪标签将作为训练另一个网络的目标。这个过程可以用交叉熵损失函数监督，就像传统的监督学习语义分割任务的一样。**该算法在在两个benchmark (PASCAL VOC, Cityscapes) 都取得了最先进的结果**
+[Cross pseudo supervision, CPS](https://arxiv.org/abs/2106.01226)是一种**简洁而高性能**的半监督语义分割任务算法。在训练时，使用两个相同结构、但是初始化状态不同的网络，添加约束**使得两个网络对同一样本的输出是相似的**。具体来说，一个网络生成的one-hot伪标签将作为训练另一个网络的目标。这个过程可以用交叉熵损失函数监督，就像传统的监督学习语义分割任务的一样。**该算法在两个benchmark (PASCAL VOC, Cityscapes) 都取得了最先进的结果**。
 
 部分可视化结果如下（左边为RGB图像，中间为预测图，右边为真值）:
 
@@ -105,7 +105,7 @@ python -m paddle.distributed.launch --gpus="0,1,2,3" train.py --config ./configs
 --log_iters 10 --save_dir $SAVE_PATH$ --batch_size 8
 ```
 
-- `SAVE_PATH`: 保存权重与日志等文件的文件夹路径
+- `SAVE_PATH`: 保存权重与日志等文件的文件夹路径。
 
 **注**：
 1. 配置文件是训练1/2有标签的数据，若要调整为其他比例，修改配置文件中的`labeled_ratio`参数。当修改有标签数据的比例时，训练的epoch数需要按照下表进行调整（通过修改配置文件中的`nepochs`参数调整训练的epoch数量）：
@@ -126,7 +126,7 @@ python val.py \
        --model_path $MODEL_PATH$
 ```
 
-- `MODEL_PATH`: 加载的权重的路径
+- `MODEL_PATH`: 要加载的权重路径。
 
 ### 预测
 
@@ -144,6 +144,6 @@ export CUDA_VISIBLE_DEVICES=0
        --stride 532 532
 ```
 
-- `IMG_PATH`: 待预测的图片或文件夹所在的路径
+- `IMG_PATH`: 待预测的图片或文件夹所在的路径。
 
 本项目提供[预训练模型]()可供直接进行预测。
diff --git a/contrib/CrossPseudoSupervision/core/train.py b/contrib/CrossPseudoSupervision/core/train.py
@@ -126,11 +126,11 @@ def train(model,
         mask_generator (batch_transforms.mask_gen.BoxMaskGenerator): Cutmix used for training.
         unsupervised_train_dataset (paddle.io.Dataset, optional): Used to read and process training datasets do not have labels.
         val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
-        optimizer_l (paddle.optimizer.Optimizer, optional): The optimizer for sub model first.
-        optimizer_r (paddle.optimizer.Optimizer, optional): The optimizer for sub model second
+        optimizer_l (paddle.optimizer.Optimizer, optional): The optimizer for the first sub-model.
+        optimizer_r (paddle.optimizer.Optimizer, optional): The optimizer for the second sub-model.
         save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
         nepochs (int, optional): How may epochs to train the model. Defualt: 240.
-        labeled_ratio (int, optional): The ratio of total data to marked data, if 2, we use the ratio of 1/2, i.e. 0.5. Default: 2. 
+        labeled_ratio (int, optional): The ratio of total data to marked data. If 2, we use the ratio of 1/2, i.e. 0.5. Default: 2. 
         batch_size (int, optional): Mini batch size of total gpus or cpu. Default: 8.
         resume_model (str, optional): The path of resume model.
         save_epoch (int, optional): How many epochs to save a model snapshot once during training. Default: 5.
@@ -309,7 +309,7 @@ def train(model,
                             batch_cost_averager.get_ips_average(), eta))
                 if use_vdl:
                     log_writer.add_scalar('Train/loss', avg_loss, current_iter)
-                    # Record all losses if there are more than 1 losses.
+                    # Record all losses if there are more than 1 loss.
                     if len(avg_loss_list) > 1:
                         avg_loss_dict = {}
                         for i, value in enumerate(avg_loss_list):

diff --git a/contrib/CrossPseudoSupervision/cvlibs/builder.py b/contrib/CrossPseudoSupervision/cvlibs/builder.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,8 +14,8 @@
 
 import copy
 from typing import Any, Optional
-import yaml
 
+import yaml
 import paddle
 from paddleseg.utils import logger
 from paddleseg.cvlibs import Builder
@@ -94,8 +94,10 @@ def _build_lr_scheduler(self, lr_cfg) -> paddle.optimizer.lr.LRScheduler:
             end_lr = lr_cfg['learning_rate']
 
         # calculate iters
-        num_train_imgs = 2975 // self.config.labeled_ratio
-        num_unsup_imgs = 2975 - num_train_imgs
+        total_imgs = len(self.train_dataset) + len(
+            self.unsupervised_train_dataset)
+        num_train_imgs = total_imgs // self.config.labeled_ratio
+        num_unsup_imgs = total_imgs - num_train_imgs
         max_samples = max(num_train_imgs, num_unsup_imgs)
         niters_per_epoch = max_samples // self.config.batch_size
         iters = niters_per_epoch * self.config.nepochs

diff --git a/contrib/CrossPseudoSupervision/datasets/basedataset.py b/contrib/CrossPseudoSupervision/datasets/basedataset.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+
 import paddle
 import numpy as np
 

diff --git a/contrib/CrossPseudoSupervision/datasets/cityscapes_cps.py b/contrib/CrossPseudoSupervision/datasets/cityscapes_cps.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 
 import os
-import numpy as np
 
+import numpy as np
 from paddleseg.transforms import Compose
 
 from cvlibs import manager
@@ -26,7 +26,7 @@ class CityscapesCPS(BaseDataset):
     """
     Semi-supervision Cityscapes dataset with images, segmentation labels and data list.
     Source: https://www.cityscapes-dataset.com/
-    Semi-supervision Cityscapes dataset from [google drive](https://pkueducn-my.sharepoint.com/:f:/g/personal/pkucxk_pku_edu_cn/EtjNKU0oVMhPkOKf9HTPlVsBIHYbACel6LSvcUeP4MXWVg?e=139icd)
+    Semi-supervision Cityscapes dataset from [OneDrive](https://pkueducn-my.sharepoint.com/:f:/g/personal/pkucxk_pku_edu_cn/EtjNKU0oVMhPkOKf9HTPlVsBIHYbACel6LSvcUeP4MXWVg?e=139icd)
 
     The folder structure is as follows:
 

diff --git a/contrib/CrossPseudoSupervision/models/backbones/resnet.py b/contrib/CrossPseudoSupervision/models/backbones/resnet.py
@@ -12,9 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import division
-from __future__ import print_function
-
 import paddle.nn as nn
 from paddleseg.utils import utils
 from paddleseg.models import layers

diff --git a/contrib/CrossPseudoSupervision/models/initializer.py b/contrib/CrossPseudoSupervision/models/initializer.py
@@ -18,8 +18,8 @@
 """
 
 import math
-import numpy as np
 
+import numpy as np
 import paddle
 import paddle.nn as nn
 
@@ -74,7 +74,7 @@ def uniform_(tensor, a, b):
 
 def normal_(tensor, mean=0., std=1.):
     """
-    Modify tensor in space using normal_
+    Modify tensor in place using normal_
     Args:
         tensor (paddle.Tensor): paddle tensor.
         mean (float|int): mean value.
@@ -288,7 +288,7 @@ def reset_initialized_parameter(model, include_self=True):
     Reset initialized parameter for [conv, linear, embedding, bn]
     Args:
         model (paddle.Layer): paddle Layer.
-        include_self (bool): include_self for Layer.named_sublayers method. Indicate whether including `model` itself, default to False.
+        include_self (bool): include_self for Layer.named_sublayers method. Indicate whether to include `model` itself. Default: True.
     Return:
         None
     """

diff --git a/contrib/CrossPseudoSupervision/utils/utils.py b/contrib/CrossPseudoSupervision/utils/utils.py
@@ -18,7 +18,7 @@
 
 from paddleseg.utils import logger
 
-__all__ = ['cps_resume', 'get_in_channels', 'get_in_channels']
+__all__ = ['cps_resume', 'get_in_channels', 'set_in_channels']
 
 
 def cps_resume(model, optimizer_l, optimizer_r, resume_model):

diff --git a/contrib/QualityInspector/README.md b/contrib/QualityInspector/README.md
@@ -4,7 +4,7 @@
 
 在3C电子、汽车、纺织化纤、金属、建筑、食品、日用消费品等生产制造行业，质量检测是保障产品质量的重要一环，是企业确保产品符合标准、满足客户需求、提高竞争力的关键步骤。在深度学习时代，AI赋能工业质检成为大势所趋。传统的数字图像处理方法虽然比起人工检测已经有了很大进步，但往往存在精度不高、泛化性差等问题。基于深度学习的方法能够很大程度上缓解这些问题，给很多复杂的质检场景带来自动化的可能性。
 
-基于飞桨的计算机视觉开发套件PaddleClass、PaddleSeg、PaddleDetection三件套，已经可以解决很多质检问题，但是这些套件没有提供针对工业质检场景的数据预处理、后处理、评测指标等配套工具，没有提供针对工业质检的特色模型，且不能有效支持需要这些套件联动来解决问题的场景。
+基于飞桨的计算机视觉开发套件PaddleClas、PaddleSeg、PaddleDetection三件套，已经可以解决很多质检问题，但是这些套件没有提供针对工业质检场景的数据预处理、后处理、评测指标等配套工具，没有提供针对工业质检的特色模型，且不能有效支持需要这些套件联动来解决问题的场景。
 
 **QualityInspector工业质检全流程解决方案开发工具，致力于帮助开发者快速完成算法的研发、验证和调优，端到端完成从数据标注到模型部署的全流程工业质检应用。**
 
@@ -68,7 +68,7 @@ QualityInspector部分可视化效果如下：
   * 🔥 获取PaddleSeg的全流程产业实操范例，包括质检缺陷分割、抠图Matting、道路分割等等
 
 <div align="center">
-<img src="https://user-images.githubusercontent.com/48433081/174770518-e6b5319b-336f-45d9-9817-da12b1961fb1.jpg"  width = "200" />  
+<img src="https://user-images.githubusercontent.com/30883834/213601179-0813a896-11e1-4514-b612-d145e068ba86.jpeg"  width = "200" />  
 </div>
 
 

diff --git a/contrib/SegmentAnything/README.md b/contrib/SegmentAnything/README.md
@@ -1,25 +1,24 @@
 # Segment Anything with PaddleSeg
 
-## Reference
-
-> Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alexander C. Berg, Wan-Yen Lo, Piotr Dollár, Ross Girshick. [Segment Anything](https://ai.facebook.com/research/publications/segment-anything/).
-
 
 ## Contents
 1. Overview
 2. Performance
 3. Try it by yourself with one line of code
+4. Reference
+
 
 ## <img src="https://user-images.githubusercontent.com/34859558/190043857-bfbdaf8b-d2dc-4fff-81c7-e0aac50851f9.png" width="25"/> Overview
 
-We implemente the segment anything with the PaddlePaddle framework. **Segment Anything Model (SAM)** is a new task, model, and dataset for image segmentation.  It can produce high quality object masks from different types of prompts including points, boxes, masks and text. Further, SAM can generate masks for all objects in whole image. It built a largest segmentation [dataset](https://segment-anything.com/dataset/index.html) to date (by far), with over 1 billion masks on 11M licensed and privacy respecting images. SAM has impressive zero-shot performance on a variety of tasks, even often competitive with or even superior to prior fully supervised results.
+We implement Segment Anything with the PaddlePaddle framework. **Segment Anything Model (SAM)** is a new task, model, and dataset for image segmentation. It built the largest segmentation [dataset](https://segment-anything.com/dataset/index.html) to date (by far), with over 1 billion masks on 11M licensed and privacy-respecting images. Further, SAM can produce high-quality object masks from different types of prompts including points, boxes, masks and text. SAM has impressive zero-shot performance on a variety of tasks, often competitive with or even superior to prior fully supervised results. However, the SAM model based on text prompts is not released at the moment. Therefore, we use a combination of **SAM** and **CLIP** to calculate the similarity between the output masks and the text prompt. In this way, you can use a **text prompt** to segment anything. In addition, we also implement SAM that can generate masks for all objects in the whole image.
+
 
-We provide the pretrained model parameters of PaddlePaddle format, including [vit_b](https://bj.bcebos.com/paddleseg/dygraph/paddlesegAnything/vit_b/model.pdparams), [vit_l](https://bj.bcebos.com/paddleseg/dygraph/paddlesegAnything/vit_l/model.pdparams) and [vit_h](https://bj.bcebos.com/paddleseg/dygraph/paddlesegAnything/vit_h/model.pdparams).
+We provide the pretrained model parameters of PaddlePaddle format, including [vit_b](https://bj.bcebos.com/paddleseg/dygraph/paddlesegAnything/vit_b/model.pdparams), [vit_l](https://bj.bcebos.com/paddleseg/dygraph/paddlesegAnything/vit_l/model.pdparams) and [vit_h](https://bj.bcebos.com/paddleseg/dygraph/paddlesegAnything/vit_h/model.pdparams). For text prompt, we also provide the [CLIP_ViT_B](https://bj.bcebos.com/paddleseg/dygraph/clip/vit_b_32_pretrain/clip_vit_b_32.pdparams) model parameters of PaddlePaddle format.
 
 ## <img src="https://user-images.githubusercontent.com/34859558/190044217-8f6befc2-7f20-473d-b356-148e06265205.png" width="25"/> Performance
 
 <div align="center">
-<img src="https://github.com/Sunting78/images/blob/master/sam_new.gif"  width="1000" />
+<img src="https://user-images.githubusercontent.com/18344247/232466911-f8d1c016-2eb2-46aa-94e2-3ec435f38502.gif"  width="1000" />
 </div>
 
 
@@ -33,44 +32,51 @@ We provide the pretrained model parameters of PaddlePaddle format, including [vi
     git clone https://github.com/PaddlePaddle/PaddleSeg.git
     cd PaddleSeg
     pip install -r requirements.txt
+    pip install ftfy regex
+    cd contrib/SegmentAnything/
     ```
-* Download the example image to ```contrib/SegmentAnything/examples```, and the file structure is as following:
+* Download the example image to ```contrib/SegmentAnything/examples``` and the vocab to ```contrib/SegmentAnything/```
     ```bash
     wget https://paddleseg.bj.bcebos.com/dygraph/demo/cityscapes_demo.png
+    wget https://bj.bcebos.com/paddleseg/dygraph/bpe_vocab_16e6/bpe_simple_vocab_16e6.txt.gz
     ```
+    Then, the file structure is as follows:
 
     ```
     PaddleSeg/contrib
     ├── SegmentAnything
     │   ├── examples
     │   │   └──  cityscapes_demo.png
     │   ├── segment_anything
-    │   └── scripts
+    │   ├── scripts
+    │   └── bpe_simple_vocab_16e6.txt.gz
 
     ```
+### 2. Segment Anything on webpage.
 
-### 2. Segment the whole image on webpage.
 In this step, we start a gradio service with the following scrip on local machine and you can try out our project with your own images.
+Based on this service, you can experience the ability to **segment the whole image** and **segment the object based on text prompts**.
 
 1. Run the following script:
     ```bash
-    python scripts/amg_paddle.py --model-type [vit_l/vit_b/vit_h] # default is vit_h
-
+    python scripts/text_to_sam_clip.py --model-type [vit_l/vit_b/vit_h] # default is vit_h
     ```
     Note:
-    *  There are three model options for you, vit_b, vit_l and vit_h, represent vit_base, vit_large and vit_huge. Large model is more accurate and also slower. You can choose the model size based on your device.
-    * The test result shows that vit_h needs 16G video memory and needs around 10s to infer an image on V100.
-
-2. Open the webpage on your localhost: ```http://0.0.0.0:8017```
+    *  There are three SAM model options, `vit_b`, `vit_l` and `vit_h`, representing vit_base, vit_large and vit_huge respectively. A larger model is more accurate but slower. You can choose a suitable model size based on your device.
+    * We support `CLIP Vit-B` model for extracting text and image features.
+    * `SAM vit_h` needs 16G memory and costs around 10s to infer an image on V100.
 
+2. Open the webpage on your localhost: ```http://0.0.0.0:8078```
 3. Try it out by clear and upload the test image! Our example looks like:
 
     <div align="center">
-    <img src="https://user-images.githubusercontent.com/34859558/230873989-9597527e-bef6-47ce-988b-977198794d75.jpg"  width = "1000" />  
+    <img src="https://user-images.githubusercontent.com/18344247/232427677-a7f913df-4abf-46ce-be2c-e37cbd495105.png"  width = "1000" />  
     </div>
 
-### 3. Segment the object with prompts
-You can run the following commands to produce masks from different types of prompts including points, boxes, and masks, as follow:
+
+### 3. Segment the object with point or box prompts
+
+You can run the following commands to produce masks from different types of prompts including points and boxes, as follows:
 
 
 1. Box prompt
@@ -84,10 +90,9 @@ python scripts/promt_predict.py --input_path xxx.png --box_prompt 1050 370 1500
 python scripts/promt_predict.py --input_path xxx.png --point_prompt 1200 450 --model-type [vit_l/vit_b/vit_h] # default is vit_h
 ```
 
-3. Mask prompt
-```bash
-python scripts/promt_predict.py --input_path xxx.png --mask_prompt xxx.png --model-type [vit_l/vit_b/vit_h] # default is vit_h
-```
 
-Note:
-* mask_prompt is the path of a binary image.
+## Reference
+
+> Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alexander C. Berg, Wan-Yen Lo, Piotr Dollár, Ross Girshick. [Segment Anything](https://ai.facebook.com/research/publications/segment-anything/).
+
+> Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. Learning Transferable Visual Models From Natural Language Supervision. Proceedings of the 38th International Conference on Machine Learning, PMLR 139:8748-8763, 2021. [CLIP](https://github.com/openai/CLIP)
diff --git a/contrib/SegmentAnything/examples/dog.jpg b/contrib/SegmentAnything/examples/dog.jpg
diff --git a/contrib/SegmentAnything/examples/zixingche.jpeg b/contrib/SegmentAnything/examples/zixingche.jpeg
diff --git a/contrib/SegmentAnything/scripts/promt_predict.py b/contrib/SegmentAnything/scripts/promt_predict.py
@@ -39,7 +39,7 @@
 
 def get_args():
     parser = argparse.ArgumentParser(
-        description='Segment image with point promp, box or mask')
+        description='Segment image with point promp or box')
     # Parameters
     parser.add_argument(
         '--input_path', type=str, required=True, help='The directory of image.')
@@ -61,8 +61,6 @@ def get_args():
         nargs='+',
         default=None,
         help='box promt format as xyxy.')
-    parser.add_argument(
-        '--mask_prompt', type=str, default=None, help='The path of mask.')
     parser.add_argument(
         '--output_path',
         type=str,
@@ -88,18 +86,14 @@ def main(args):
         paddle.set_device("cpu")
     input_path = args.input_path
     output_path = args.output_path
-    point, box, mask_path = args.point_prompt, args.box_prompt, args.mask_prompt
+    point, box = args.point_prompt, args.box_prompt
     if point is not None:
         point = np.array([point])
         input_label = np.array([1])
     else:
         input_label = None
     if box is not None:
         box = np.array([[box[0], box[1]], [box[2], box[3]]])
-    if mask_path is not None:
-        mask = cv2.imread(mask_path, -1)
-    else:
-        mask = None
 
     image = cv2.imread(input_path)
     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
@@ -112,7 +106,6 @@ def main(args):
         point_coords=point,
         point_labels=input_label,
         box=box,
-        mask_input=mask,
         multimask_output=True, )
 
     plt.figure(figsize=(10, 10))