Make seed work again in value methods #134

Merged · 3 commits · Mar 11, 2022
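A note on why the one-line change repeated below restores seeding (this is my reading of the pre-0.26 gym vector-env API that CleanRL pinned at the time, so treat the sketch as illustrative rather than authoritative): `gym.vector.SyncVectorEnv` builds its batched `action_space` with `batch_space`, which creates a fresh random-number generator that the training scripts never seed, so `envs.action_space.sample()` can differ between otherwise identical runs. `envs.single_action_space`, by contrast, is a sub-env's own action space, whose RNG the scripts do seed, so drawing one action per sub-env from it keeps random exploration reproducible. A minimal sketch, with a hypothetical `make_env` that mirrors (but is not copied from) CleanRL's helper:

```python
# Hypothetical, self-contained reproduction of the seeding bug this PR fixes,
# assuming the pre-0.26 `gym` API in use in early 2022.
import gym
import numpy as np


def make_env(env_id, seed):
    def thunk():
        env = gym.make(env_id)
        env.seed(seed)
        env.action_space.seed(seed)  # this is the RNG the scripts actually seed
        return env

    return thunk


envs = gym.vector.SyncVectorEnv([make_env("CartPole-v1", seed=1) for _ in range(3)])

# Before this PR: `envs.action_space` is a batched space with its own,
# never-seeded RNG, so this draw can differ from run to run.
actions = envs.action_space.sample()

# After this PR: `envs.single_action_space` is a (seeded) per-env space, so
# drawing one action per sub-env is reproducible for a fixed seed.
actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
```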
2 changes: 1 addition & 1 deletion cleanrl/c51.py
Original file line number Diff line number Diff line change
@@ -166,7 +166,7 @@ def linear_schedule(start_e: float, end_e: float, duration: int, t: int):
# ALGO LOGIC: put action logic here
epsilon = linear_schedule(args.start_e, args.end_e, args.exploration_fraction * args.total_timesteps, global_step)
if random.random() < epsilon:
actions = envs.action_space.sample()
actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
else:
actions, pmf = q_network.get_action(torch.Tensor(obs).to(device))
actions = actions.cpu().numpy()
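One reason the swap is drop-in, continuing the sketch above (a hypothetical check, not from the PR): both expressions yield an array of shape `(num_envs,)` for discrete spaces, and `(num_envs, action_dim)` for Box spaces, so the `envs.step(actions)` and replay-buffer call sites need no other changes.

```python
# Hypothetical shape check (not in the PR): per-env sampling matches the
# batched sample's shape, so downstream code is unaffected.
batched = np.asarray(envs.action_space.sample())
per_env = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
assert per_env.shape == batched.shape
```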
2 changes: 1 addition & 1 deletion cleanrl/c51_atari.py
Original file line number Diff line number Diff line change
@@ -187,7 +187,7 @@ def linear_schedule(start_e: float, end_e: float, duration: int, t: int):
# ALGO LOGIC: put action logic here
epsilon = linear_schedule(args.start_e, args.end_e, args.exploration_fraction * args.total_timesteps, global_step)
if random.random() < epsilon:
actions = envs.action_space.sample()
actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
else:
actions, pmf = q_network.get_action(torch.Tensor(obs).to(device))
actions = actions.cpu().numpy()
2 changes: 1 addition & 1 deletion cleanrl/ddpg_continuous_action.py
Original file line number Diff line number Diff line change
@@ -161,7 +161,7 @@ def forward(self, x):
for global_step in range(args.total_timesteps):
# ALGO LOGIC: put action logic here
if global_step < args.learning_starts:
actions = envs.action_space.sample()
actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
else:
actions = actor(torch.Tensor(obs).to(device))
actions = np.array(
2 changes: 1 addition & 1 deletion cleanrl/dqn.py
Original file line number Diff line number Diff line change
@@ -151,7 +151,7 @@ def linear_schedule(start_e: float, end_e: float, duration: int, t: int):
# ALGO LOGIC: put action logic here
epsilon = linear_schedule(args.start_e, args.end_e, args.exploration_fraction * args.total_timesteps, global_step)
if random.random() < epsilon:
actions = envs.action_space.sample()
actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
else:
logits = q_network(torch.Tensor(obs).to(device))
actions = torch.argmax(logits, dim=1).cpu().numpy()
2 changes: 1 addition & 1 deletion cleanrl/dqn_atari.py
Original file line number Diff line number Diff line change
@@ -172,7 +172,7 @@ def linear_schedule(start_e: float, end_e: float, duration: int, t: int):
# ALGO LOGIC: put action logic here
epsilon = linear_schedule(args.start_e, args.end_e, args.exploration_fraction * args.total_timesteps, global_step)
if random.random() < epsilon:
actions = envs.action_space.sample()
actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
else:
logits = q_network(torch.Tensor(obs).to(device))
actions = torch.argmax(logits, dim=1).cpu().numpy()
2 changes: 1 addition & 1 deletion cleanrl/sac_continuous_action.py
Original file line number Diff line number Diff line change
@@ -212,7 +212,7 @@ def to(self, device):
for global_step in range(args.total_timesteps):
# ALGO LOGIC: put action logic here
if global_step < args.learning_starts:
actions = envs.action_space.sample()
actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
else:
actions, _, _ = actor.get_action(torch.Tensor(obs).to(device))
actions = actions.detach().cpu().numpy()
2 changes: 1 addition & 1 deletion cleanrl/td3_continuous_action.py
Original file line number Diff line number Diff line change
@@ -166,7 +166,7 @@ def forward(self, x):
for global_step in range(args.total_timesteps):
# ALGO LOGIC: put action logic here
if global_step < args.learning_starts:
actions = envs.action_space.sample()
actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
else:
actions = actor(torch.Tensor(obs).to(device))
actions = np.array(
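A quick way to sanity-check the fix end to end (hypothetical, not part of this PR; it reuses the `make_env` sketch above and the same gym-API assumption): seed two fresh vector envs identically and confirm the exploration actions match step for step.

```python
# Hypothetical determinism check: with the fixed sampling, two identically
# seeded runs should produce the same sequence of exploration actions.
def exploration_actions(seed, steps=5):
    envs = gym.vector.SyncVectorEnv([make_env("CartPole-v1", seed) for _ in range(2)])
    return [
        np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
        for _ in range(steps)
    ]


run_a = exploration_actions(seed=1)
run_b = exploration_actions(seed=1)
assert all(np.array_equal(a, b) for a, b in zip(run_a, run_b))
```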