Ppo rgb baseline #285

Merged · 23 commits · Apr 25, 2024
70 changes: 59 additions & 11 deletions examples/baselines/ppo/README.md
@@ -1,22 +1,29 @@
# Proximal Policy Optimization (PPO)

Code for running the PPO RL algorithm is adapted from [CleanRL](https://github.com/vwxyzjn/cleanrl/). It is written to be a single-file and easy to follow/read
Code for running the PPO RL algorithm is adapted from [CleanRL](https://github.com/vwxyzjn/cleanrl/). It is written to be single-file and easy to follow/read, and supports both state-based RL and visual (RGB) based RL.


## State Based RL

Below is a sample of commands you can run to train a state-based policy to solve various tasks with PPO; the hyperparameters are already lightly tuned. The fastest is the PushCube-v1 task, which can take less than a minute to train on the GPU, followed by the PickCube-v1 task, which can take 2-5 minutes on the GPU.

The PPO baseline is not guaranteed to work for tasks not tested below, as some tasks do not yet have dense rewards (or well-tuned ones), some are simply too hard for standard PPO, and for others our team has not yet had time to verify results.

To train, you can run

```bash
python ppo.py --env_id="PushCube-v1" \
--num_envs=2048 --update_epochs=8 --num_minibatches=32 \
--total_timesteps=5_000_000 --eval_freq=10 --num-steps=20
--total_timesteps=2_000_000 --eval_freq=10 --num-steps=20
```

To evaluate, you can run
```bash
python ppo.py --env_id="PickCube-v1" \
--evaluate --num_eval_envs=1 --checkpoint=runs/PickCube-v1__ppo__1__1710225023/ppo_101.cleanrl_model
python ppo.py --env_id="PushCube-v1" \
--evaluate --checkpoint=path/to/model.pt \
--num_eval_envs=1 --num-eval-steps=1000
```

Note that with `--evaluate`, trajectories are saved from a GPU simulation. In order to support replaying these trajectories correctly with the `maniskill.trajectory.replay_trajectory` tool, the number of evaluation environments must be fixed to `1`. This is necessary in order to ensure reproducibility for tasks that have randomizations on geometry (e.g. PickSingleYCB).
Note that with `--evaluate`, trajectories are saved from a GPU simulation. In order to support replaying these trajectories correctly with the `mani_skill.trajectory.replay_trajectory` tool for some tasks, the number of evaluation environments must be fixed to `1`. This is necessary to ensure reproducibility for tasks that have randomizations on geometry (e.g. PickSingleYCB). Tasks without geometry randomization, such as PushCube, are unaffected, and for them you can increase the number of evaluation environments.
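For context, here is a minimal sketch (not part of this PR) of what a single-environment evaluation setup looks like through the ManiSkill gym interface; the `obs_mode` and `control_mode` values are assumptions and should match whatever configuration `ppo.py` was trained with:

```python
# Hypothetical sketch (not part of this PR): a single evaluation environment,
# so saved trajectories can be replayed deterministically even for tasks that
# randomize geometry (e.g. PickSingleYCB). The obs_mode/control_mode kwargs
# are assumptions and should match the training configuration.
import gymnasium as gym
import mani_skill.envs  # noqa: F401  (registers the ManiSkill environments)

env = gym.make(
    "PickSingleYCB-v1",
    num_envs=1,                          # keep at 1 for replayable trajectories
    obs_mode="state",                    # assumption
    control_mode="pd_joint_delta_pos",   # assumption
)
obs, _ = env.reset(seed=0)               # fixed seed for reproducibility
```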


Below is a full list of commands you can run to train a policy to solve various tasks with PPO using lightly tuned hyperparameters. The fastest is the PushCube-v1 task, which can take less than a minute to train on the GPU.
Expand All @@ -34,7 +41,7 @@ python ppo.py --env_id="PickSingleYCB-v1" \
--total_timesteps=25_000_000
python ppo.py --env_id="PegInsertionSide-v1" \
--num_envs=1024 --update_epochs=8 --num_minibatches=32 \
--total_timesteps=150_000_000 --num-steps=100 --num-eval-steps=100
--total_timesteps=250_000_000 --num-steps=100 --num-eval-steps=100
python ppo.py --env_id="TwoRobotStackCube-v1" \
--num_envs=1024 --update_epochs=8 --num_minibatches=32 \
--total_timesteps=40_000_000 --num-steps=100 --num-eval-steps=100
Expand Down Expand Up @@ -67,7 +74,48 @@ python ppo.py --env_id="UnitreeH1Stand-v1" \

python ppo.py --env_id="OpenCabinetDrawer-v1" \
--num_envs=1024 --update_epochs=8 --num_minibatches=32 \
--total_timesteps=10_000_000 --num-steps=100 --num-eval-steps=100
--gamma=0.9

```
--total_timesteps=10_000_000 --num-steps=100 --num-eval-steps=100
```

## Visual Based RL

Below is a sample of commands for training an image-based policy with PPO using lightly tuned hyperparameters. The fastest again is PushCube-v1, which can take about 1-5 minutes, and PickCube-v1, which takes 30-60 minutes. You will need to tune the `--num_envs` argument according to how much GPU memory you have, as rendering visual observations uses a lot of memory. The settings below should all take less than 15GB of GPU memory; a rough way to measure usage is sketched after the commands below. Note that while you can easily increase the number of environments if you have enough memory, this does not necessarily improve wall-time or sample efficiency.

The visual PPO baseline is not guaranteed to work for tasks not tested below, as some tasks do not yet have dense rewards (or well-tuned ones), some are simply too hard for standard PPO, and for others our team has not yet had time to verify results.



```bash
python ppo_rgb.py --env_id="PushCube-v1" \
--num_envs=512 --update_epochs=8 --num_minibatches=16 \
--total_timesteps=1_000_000 --eval_freq=10 --num-steps=20
python ppo_rgb.py --env_id="OpenCabinetDrawer-v1" \
--num_envs=256 --update_epochs=8 --num_minibatches=16 \
--total_timesteps=100_000_000 --num-steps=100 --num-eval-steps=100
```
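
If you are unsure how many parallel environments fit on your GPU, one rough way to check (a sketch under assumptions, not part of this PR; the env id, `obs_mode`, and `num_envs` below may differ from what `ppo_rgb.py` uses) is to build the environment at a candidate `num_envs` and compare the device's free memory before and after:

```python
# Rough sketch for sizing --num_envs against available GPU memory.
# The env id, obs_mode, and num_envs are assumptions; adjust to match ppo_rgb.py.
import gymnasium as gym
import torch
import mani_skill.envs  # noqa: F401  (registers the ManiSkill environments)

def free_gib() -> float:
    free_bytes, _total = torch.cuda.mem_get_info()
    return free_bytes / 1024**3

before = free_gib()
env = gym.make("PushCube-v1", num_envs=256, obs_mode="rgbd")  # candidate setting
env.reset(seed=0)
after = free_gib()
print(f"~{before - after:.1f} GiB used by simulation + rendering at num_envs=256")
env.close()
```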

To evaluate a trained policy, you can run

```bash
python ppo_rgb.py --env_id="OpenCabinetDrawer-v1" \
--evaluate --checkpoint=path/to/model.pt \
--num_eval_envs=1 --num-eval-steps=1000
```

and it will save videos to `path/to/test_videos`.
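
Concretely, the video directory is derived from the checkpoint path (this mirrors the `eval_output_dir` change to `ppo.py` later in this diff); a small illustration with a hypothetical checkpoint path:

```python
# Illustration of how the evaluation video directory is derived from the
# checkpoint path (see the eval_output_dir change in ppo.py in this PR).
import os

checkpoint = "runs/OpenCabinetDrawer-v1__ppo_rgb__1__1713000000/ppo_rgb_100.pt"  # hypothetical
eval_output_dir = f"{os.path.dirname(checkpoint)}/test_videos"
print(eval_output_dir)  # -> runs/OpenCabinetDrawer-v1__ppo_rgb__1__1713000000/test_videos
```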

## Replaying Evaluation Trajectories

It might be useful to get nicer-looking videos. A simple way to do that is to first run the evaluation scripts provided above; they save a .h5 and .json file named by the date and time, which you can then replay with different settings like so:

```bash
python -m mani_skill.trajectory.replay_trajectory \
--traj-path=path/to/trajectory.h5 --use-env-states --shader="rt-fast" \
--save-video --allow-failure -o "none"
```

This will use environment states to replay trajectories, turn on the ray tracer (there is also "rt", which is higher quality but slower), and save all videos, including failed trajectories.
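
If you prefer to script this step, here is a small sketch (under the assumption that it simply shells out to the same CLI shown above; the trajectory directory is hypothetical) that replays every `.h5` trajectory found in an output directory:

```python
# Sketch: replay every saved trajectory in a directory with the same flags as
# the CLI command above. The directory path is hypothetical.
import glob
import subprocess

for traj_path in glob.glob("path/to/test_videos/*.h5"):
    subprocess.run([
        "python", "-m", "mani_skill.trajectory.replay_trajectory",
        f"--traj-path={traj_path}",
        "--use-env-states", "--shader=rt-fast",
        "--save-video", "--allow-failure", "-o", "none",
    ], check=True)
```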

## Some Notes

- The code currently does not have the best way to evaluate agents: during GPU simulation, all assets are frozen per parallel environment (changing them slows training down). Thus, when doing evaluation, even though we evaluate on multiple environments at once (8 by default), they will always feature the same set of geometry. This only affects tasks where there is geometry variation (e.g. PickClutterYCB, OpenCabinetDrawer). You can make evaluation more accurate by increasing the number of evaluation environments (a sketch below illustrates this). Our team is still discussing the best way to evaluate trained agents properly without hindering performance.
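
As a rough illustration of that point (a sketch, not part of this PR; the `reconfigure` reset option is an assumption about the ManiSkill reset API), more parallel evaluation environments means more distinct geometry is sampled when the scenes are built:

```python
# Sketch: each parallel environment gets its own (frozen) geometry when the
# scene is built, so more eval envs means more distinct objects are evaluated.
# The reconfigure reset option is an assumption about the ManiSkill API.
import gymnasium as gym
import mani_skill.envs  # noqa: F401  (registers the ManiSkill environments)

eval_env = gym.make("PickSingleYCB-v1", num_envs=16, obs_mode="state")
obs, _ = eval_env.reset(seed=0, options=dict(reconfigure=True))  # rebuild scenes to resample geometry
```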
17 changes: 12 additions & 5 deletions examples/baselines/ppo/ppo.py
@@ -200,7 +200,7 @@ def get_action_and_value(self, x, action=None):
if args.capture_video:
eval_output_dir = f"runs/{run_name}/videos"
if args.evaluate:
eval_output_dir = f"videos"
eval_output_dir = f"{os.path.dirname(args.checkpoint)}/test_videos"
print(f"Saving eval videos to {eval_output_dir}")
if args.save_train_video_freq is not None:
save_video_trigger = lambda x : (x // args.num_steps) % args.save_train_video_freq == 0
@@ -283,7 +283,7 @@ def clip_action(action: torch.Tensor):
if args.evaluate:
break
if args.save_model and iteration % args.eval_freq == 1:
model_path = f"runs/{run_name}/{args.exp_name}_{iteration}.cleanrl_model"
model_path = f"runs/{run_name}/{args.exp_name}_{iteration}.pt"
torch.save(agent.state_dict(), model_path)
print(f"model saved to {model_path}")
# Annealing the rate if instructed to do so.
@@ -292,6 +292,7 @@ def clip_action(action: torch.Tensor):
lrnow = frac * args.learning_rate
optimizer.param_groups[0]["lr"] = lrnow

rollout_time = time.time()
for step in range(0, args.num_steps):
global_step += args.num_envs
obs[step] = next_obs
@@ -321,7 +322,7 @@ def clip_action(action: torch.Tensor):
writer.add_scalar("charts/episodic_length", final_info["elapsed_steps"][done_mask].cpu().numpy().mean(), global_step)

final_values[step, torch.arange(args.num_envs, device=device)[done_mask]] = agent.get_value(final_info["final_observation"][done_mask]).view(-1)

rollout_time = time.time() - rollout_time
# bootstrap value according to termination and truncation
with torch.no_grad():
next_value = agent.get_value(next_obs).reshape(1, -1)
@@ -377,6 +378,7 @@ def clip_action(action: torch.Tensor):
agent.train()
b_inds = np.arange(args.batch_size)
clipfracs = []
update_time = time.time()
for epoch in range(args.update_epochs):
np.random.shuffle(b_inds)
for start in range(0, args.batch_size, args.minibatch_size):
@@ -393,6 +395,9 @@ def clip_action(action: torch.Tensor):
approx_kl = ((ratio - 1) - logratio).mean()
clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]

if args.target_kl is not None and approx_kl > args.target_kl:
break

mb_advantages = b_advantages[mb_inds]
if args.norm_adv:
mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)
@@ -427,12 +432,12 @@ def clip_action(action: torch.Tensor):

if args.target_kl is not None and approx_kl > args.target_kl:
break
update_time = time.time() - update_time

y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
var_y = np.var(y_true)
explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

# TRY NOT TO MODIFY: record rewards for plotting purposes
writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
@@ -443,7 +448,9 @@ def clip_action(action: torch.Tensor):
writer.add_scalar("losses/explained_variance", explained_var, global_step)
print("SPS:", int(global_step / (time.time() - start_time)))
writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

writer.add_scalar("charts/update_time", update_time, global_step)
writer.add_scalar("charts/rollout_time", rollout_time, global_step)
writer.add_scalar("charts/rollout_fps", args.num_envs * args.num_steps / rollout_time, global_step)
if not args.evaluate:
if args.save_model:
model_path = f"runs/{run_name}/{args.exp_name}_final.cleanrl_model"