
Commit 8cbf8a8

voletiv (Vikram Voleti) and Vikram Voleti authored
Progressive_view, batch_sizes can also change with resolution_milestones (#165)
* Makes progressive increase of elevation, azimuth
* Zero123 phase 2 config and script
* Update DOCUMENTATION.md

Co-authored-by: Vikram Voleti <vikram@ip-26-0-153-234.us-west-2.compute.internal>
1 parent 9d7976e commit 8cbf8a8

6 files changed, +263 -45 lines changed

DOCUMENTATION.md

+2-1
@@ -28,10 +28,10 @@
 | ---------------------- | --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | height | Union[int, List[int]] | Height of the rendered image in training, should be an integer or a list of integers. If a list of integers, the training height will change according to `resolution_milestones`. Default: 64 |
 | width | Union[int, List[int]] | Width of the rendered image in training, should be an integer or a list of integers. If a list of integers, the training width will change according to `resolution_milestones`. Default: 64 |
+| batch_size | Union[int, List[int]] | Number of images per batch in training. If a list of integers, the batch_size will change according to `resolution_milestones`. Default: 1 |
 | resolution_milestones | List[int] | The steps where the training resolution will change, must be in ascending order and in the length of `len(height) - 1`. Default: [] |
 | eval_height | int | Height of the rendered image in validation/testing. Default: 512 |
 | eval_width | int | Width of the rendered image in validation/testing. Default: 512 |
-| batch_size | int | Number of images per batch in training. Default: 1 |
 | eval_batch_size | int | Number of images per batch in validation/testing. DO NOT change this. Default: 1 |
 | elevation_range | Tuple[float,float] | Camera elevation angle range to sample from in training, in degrees. Default: (-10,90) |
 | azimuth_range | Tuple[float,float] | Camera azimuth angle range to sample from in training, in degrees. Default: (-180,180) |
@@ -47,6 +47,7 @@
 | eval_fovy_deg | float | Camera field of view (FoV) along the y direction (vertical direction) in validation/testing, in degrees. Default: 70 |
 | light_sample_strategy | str | Strategy to sample point light positions in training, in ["dreamfusion", "magic3d"]. "dreamfusion" uses strategy described in the DreamFusion paper; "magic3d" uses strategy decribed in the Magic3D paper. Default: "dreamfusion" |
 | batch_uniform_azimuth | bool | Whether to ensure the uniformity of sampled azimuth angles in training as described in the Fantasia3D paper. If True, the `azimuth_range` is equally divided into `batch_size` bins and the azimuth angles are sampled from every bins. Default: True |
+| progressive_until | int | Number of iterations until which to progressively (linearly) increase elevation_range and azimuth_range from [`eval_elevation_deg`, `eval_elevation_deg`] and `[0.0, 0.0]`, to those values specified in `elevation_range` and `azimuth_range`. 0 means the range does not linearly increase. Default: 0 |
 
 ## Systems
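
The `progressive_until` option documented above amounts to a linear ramp of the sampling ranges. A minimal sketch of that behaviour (illustrative only, not the repository's exact implementation; the helper `progressive_range` and the concrete numbers are assumptions):

```python
# Hedged sketch: linearly widen a sampling range until `progressive_until`.
# Field names mirror the documentation above; `progressive_range` itself is
# an illustrative helper, not a function from the repository.
def progressive_range(step, progressive_until, start, full):
    if progressive_until <= 0 or step >= progressive_until:
        return full  # 0 disables the ramp, so the full range is used
    r = step / progressive_until  # linear progress in [0, 1)
    return (
        start[0] + r * (full[0] - start[0]),
        start[1] + r * (full[1] - start[1]),
    )

# Example: elevation starts pinned at eval_elevation_deg, azimuth at [0, 0].
eval_elevation_deg = 0.0
print(progressive_range(500, 2000, (eval_elevation_deg, eval_elevation_deg), (-10.0, 90.0)))
# (-2.5, 22.5)
print(progressive_range(500, 2000, (0.0, 0.0), (-180.0, 180.0)))
# (-45.0, 45.0)
```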

@@ -0,0 +1,166 @@
+name: "imagecondition"
+tag: "${rmspace:${system.prompt_processor.prompt},_}"
+exp_root_dir: "outputs"
+seed: 0
+
+data_type: "single-image-datamodule"
+data:
+  image_path: ./load/images/hamburger_rgba.png
+  height: 256
+  width: 256
+  default_elevation_deg: 0.0
+  default_azimuth_deg: 0.0
+  default_camera_distance: 3.8
+  default_fovy_deg: 20.0
+  random_camera:
+    batch_size: 4
+    height: 256
+    width: 256
+    eval_height: 512
+    eval_width: 512
+    eval_batch_size: 1
+    elevation_range: [-10, 80]
+    azimuth_range: [-180, 180]
+    camera_distance_range: [3.8, 3.8]
+    fovy_range: [20.0, 20.0] # Zero123 has fixed fovy
+    progressive_until: 0
+    camera_perturb: 0.0
+    center_perturb: 0.0
+    up_perturb: 0.0
+    light_position_perturb: 1.0
+    light_distance_range: [7.5, 10.0]
+    eval_elevation_deg: ${data.default_elevation_deg}
+    eval_camera_distance: ${data.default_camera_distance}
+    eval_fovy_deg: ${data.default_fovy_deg}
+    light_sample_strategy: "dreamfusion"
+    batch_uniform_azimuth: False
+    n_val_views: 30
+    n_test_views: 120
+
+system_type: "image-condition-dreamfusion-system"
+system:
+  geometry_type: "implicit-volume"
+  geometry:
+    radius: 2.0
+    normal_type: "analytic"
+
+    # the density initialization proposed in the DreamFusion paper
+    # does not work very well
+    # density_bias: "blob_dreamfusion"
+    # density_activation: exp
+    # density_blob_scale: 5.
+    # density_blob_std: 0.2
+
+    # use Magic3D density initialization instead
+    density_bias: "blob_magic3d"
+    density_activation: softplus
+    density_blob_scale: 10.
+    density_blob_std: 0.5
+
+    # coarse to fine hash grid encoding
+    # to ensure smooth analytic normals
+    pos_encoding_config:
+      otype: HashGrid
+      n_levels: 16
+      n_features_per_level: 2
+      log2_hashmap_size: 19
+      base_resolution: 16
+      per_level_scale: 1.447269237440378 # max resolution 4096
+    mlp_network_config:
+      otype: "VanillaMLP"
+      activation: "ReLU"
+      output_activation: "none"
+      n_neurons: 64
+      n_hidden_layers: 2
+
+  material_type: "diffuse-with-point-light-material"
+  material:
+    ambient_only_steps: 100000
+    textureless_prob: 0.05
+    albedo_activation: sigmoid
+
+  background_type: "neural-environment-map-background"
+  background:
+    color_activation: sigmoid
+
+  renderer_type: "nerf-volume-renderer"
+  renderer:
+    radius: ${system.geometry.radius}
+    num_samples_per_ray: 512
+    return_comp_normal: ${gt0:${system.loss.lambda_normal_smooth}}
+    return_normal_perturb: ${gt0:${system.loss.lambda_3d_normal_smooth}}
+
+  prompt_processor_type: "stable-diffusion-prompt-processor"
+  prompt_processor:
+    pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
+    prompt: "a DSLR photo of a delicious hamburger"
+
+  guidance_type: "stable-diffusion-guidance"
+  guidance:
+    pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
+    guidance_scale: 7.5
+    min_step_percent: 0.2
+    # min_step_percent: [0, 0.66, 0.33, 2000] # (start_iter, start_val, end_val, end_iter)
+    max_step_percent: 0.6
+    # max_step_percent: [0, 0.98, 0.66, 2000]
+
+  # prompt_processor_type: "deep-floyd-prompt-processor"
+  # prompt_processor:
+  #   pretrained_model_name_or_path: "DeepFloyd/IF-I-XL-v1.0"
+  #   prompt: "a DSLR photo of a delicious hamburger"
+
+  # guidance_type: "deep-floyd-guidance"
+  # guidance:
+  #   pretrained_model_name_or_path: "DeepFloyd/IF-I-XL-v1.0"
+  #   guidance_scale: 7.5
+  #   min_step_percent: 0.2
+  #   # min_step_percent: [0, 0.66, 0.33, 2000] # (start_iter, start_val, end_val, end_iter)
+  #   max_step_percent: 0.6
+  #   # max_step_percent: [0, 0.98, 0.66, 2000]
+
+  freq:
+    ref_only_steps: 0
+    guidance_eval: 13
+
+  loggers:
+    wandb:
+      enable: false
+      project: 'threestudio'
+      name: None
+
+  loss:
+    lambda_sds: 0.1
+    lambda_rgb: 400.0
+    lambda_mask: 50.0
+    lambda_depth: 0.05
+    lambda_normal_smooth: 2.0
+    lambda_3d_normal_smooth: 5.0
+    lambda_orient: 0.01
+    lambda_sparsity: 0.01
+    lambda_opaque: 0.05
+
+  optimizer:
+    name: Adan
+    args:
+      lr: 0.005
+      max_grad_norm: 5.0
+      eps: 1.e-8
+      weight_decay: 1e-5
+    params:
+      geometry:
+        lr: ${system.optimizer.args.lr}
+      background:
+        lr: 0.0
+
+trainer:
+  max_steps: 2000
+  log_every_n_steps: 1
+  num_sanity_val_steps: 0
+  val_check_interval: 20
+  enable_progress_bar: true
+  precision: 16-mixed
+
+checkpoint:
+  save_last: true # save at each validation time
+  save_top_k: -1
+  every_n_train_steps: 20 # ${trainer.max_steps}
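
The config relies on custom OmegaConf resolvers such as `${rmspace:...}` and `${gt0:...}`. A minimal sketch of how resolvers with these names could be registered and what the two expressions evaluate to, assuming OmegaConf ≥ 2.1 (the lambda bodies are assumptions showing the intended effect, not necessarily threestudio's exact definitions):

```python
from omegaconf import OmegaConf

# Hedged sketch: resolvers with the names used in the config above.
OmegaConf.register_new_resolver("rmspace", lambda s, sub: s.replace(" ", sub))
OmegaConf.register_new_resolver("gt0", lambda x: x > 0)

cfg = OmegaConf.create(
    {
        "prompt": "a DSLR photo of a delicious hamburger",
        "tag": "${rmspace:${prompt},_}",
        "lambda_normal_smooth": 2.0,
        "return_comp_normal": "${gt0:${lambda_normal_smooth}}",
    }
)
print(cfg.tag)                 # a_DSLR_photo_of_a_delicious_hamburger
print(cfg.return_comp_normal)  # True
```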

configs/zero123.yaml

+14-12
@@ -1,5 +1,5 @@
 name: "zero123"
-tag: "${data.random_camera.height}_${rmspace:${basename:${data.image_path}},_}"
+tag: "${data.random_camera.height}_${rmspace:${basename:${data.image_path}},_}_prog${data.random_camera.progressive_until}"
 exp_root_dir: "outputs"
 seed: 0
 
@@ -13,16 +13,18 @@ data: # threestudio/data/image.py -> SingleImageDataModuleConfig
   default_camera_distance: 3.8
   default_fovy_deg: 20.0
   random_camera: # threestudio/data/uncond.py -> RandomCameraDataModuleConfig
-    height: 64
-    width: 64
+    height: [64, 128]
+    width: [64, 128]
+    batch_size: [12, 6]
+    resolution_milestones: [200]
     eval_height: 256
     eval_width: 256
-    batch_size: 12
     eval_batch_size: 1
     elevation_range: [-10, 80]
     azimuth_range: [-180, 180]
     camera_distance_range: [3.8, 3.8]
-    fovy_range: [20.0, 20.0]
+    fovy_range: [20.0, 20.0] # Zero123 has fixed fovy
+    progressive_until: 0
     camera_perturb: 0.0
     center_perturb: 0.0
     up_perturb: 0.0
@@ -70,7 +72,7 @@ system:
       activation: "ReLU"
       output_activation: "none"
       n_neurons: 64
-      n_hidden_layers: 1
+      n_hidden_layers: 2
 
   material_type: "diffuse-with-point-light-material"
   material:
@@ -122,14 +124,14 @@ system:
       name: None
 
   loss:
-    lambda_sds: 0.03
+    lambda_sds: 0.05
     lambda_rgb: 500.
     lambda_mask: 50.
     lambda_depth: 0.05
    lambda_normal_smooth: 5.0
-    lambda_3d_normal_smooth: 2.0
+    lambda_3d_normal_smooth: 5.0
     lambda_orient: 1.0
-    lambda_sparsity: 0.1 # should be tweaked for every model
+    lambda_sparsity: 0.2 # should be tweaked for every model
     lambda_opaque: 0.05
 
   optimizer:
@@ -143,13 +145,13 @@ system:
      geometry:
        lr: ${system.optimizer.args.lr}
      background:
-        lr: ${system.optimizer.args.lr}
+        lr: 0.0
 
 trainer:
-  max_steps: 1999
+  max_steps: 300
   log_every_n_steps: 1
   num_sanity_val_steps: 0
-  val_check_interval: 100
+  val_check_interval: 50
   enable_progress_bar: true
   precision: 16-mixed
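
In this config `height`, `width`, and `batch_size` are now lists paired with `resolution_milestones: [200]`: 64×64 renders in batches of 12 until step 200, then 128×128 in batches of 6. A minimal sketch of the milestone lookup (assuming a bisect over `resolution_milestones` against the global step; function and variable names are illustrative, not the datamodule's exact code):

```python
import bisect

# Hedged sketch: pick the active (height, width, batch_size) for a training step.
# As documented, `resolution_milestones` has length len(height) - 1.
def active_settings(global_step, heights, widths, batch_sizes, milestones):
    idx = bisect.bisect_right(milestones, global_step)
    return heights[idx], widths[idx], batch_sizes[idx]

# Values from configs/zero123.yaml above.
print(active_settings(100, [64, 128], [64, 128], [12, 6], [200]))  # (64, 64, 12)
print(active_settings(250, [64, 128], [64, 128], [12, 6], [200]))  # (128, 128, 6)
```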

threestudio/data/image.py

+5-1
@@ -15,6 +15,7 @@
     RandomCameraDataset,
     RandomCameraIterableDataset,
 )
+from threestudio.utils.base import Updateable
 from threestudio.utils.config import parse_structured
 from threestudio.utils.misc import get_rank
 from threestudio.utils.ops import (
@@ -154,7 +155,7 @@ def get_all_images(self):
         return self.rgb
 
 
-class SingleImageIterableDataset(IterableDataset, SingleImageDataBase):
+class SingleImageIterableDataset(IterableDataset, SingleImageDataBase, Updateable):
     def __init__(self, cfg: Any, split: str) -> None:
         super().__init__()
         self.setup(cfg, split)
@@ -178,6 +179,9 @@ def collate(self, batch) -> Dict[str, Any]:
 
         return batch
 
+    def update_step(self, epoch: int, global_step: int, on_load_weights: bool = False):
+        self.random_pose_generator.update_step(epoch, global_step, on_load_weights)
+
     def __iter__(self):
         while True:
             yield {}
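
Making `SingleImageIterableDataset` an `Updateable` and forwarding `update_step` is what lets the wrapped `random_pose_generator` react to `resolution_milestones` and `progressive_until` during single-image training. A minimal standalone sketch of this delegation pattern, assuming the training loop calls `update_step(epoch, global_step)` each iteration (class names below are illustrative, not threestudio's):

```python
# Hedged sketch of the delegation pattern only; these are not threestudio classes.
class PoseGenerator:
    def update_step(self, epoch: int, global_step: int, on_load_weights: bool = False):
        # In the real code this is where resolution/batch_size milestones and the
        # progressive elevation/azimuth ranges would be updated.
        print(f"pose generator updated at step {global_step}")


class WrapperDataset:
    def __init__(self) -> None:
        self.random_pose_generator = PoseGenerator()

    def update_step(self, epoch: int, global_step: int, on_load_weights: bool = False):
        # Without this forwarding the inner generator would never see the step.
        self.random_pose_generator.update_step(epoch, global_step, on_load_weights)


WrapperDataset().update_step(epoch=0, global_step=200)
# pose generator updated at step 200
```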
