Implemented the ability to train rewards in preference comparison against multiple policies #529

Closed
wants to merge 31 commits into master from policy_ensemble

Changes from 28 commits

Commits (31)
a7eda89
implemented MixtureOfTrajectoryGenerators
levmckinney Aug 9, 2022
1a9ab5d
added add_prefix method to HierarchicalLogger
levmckinney Aug 17, 2022
d442baa
moved logger.accumulate_means into AgentTrainer
levmckinney Aug 17, 2022
003bbcd
added option for multiple agents to train_preference_comparison
levmckinney Aug 17, 2022
f6a3d44
reduced the length of the prefix used in mixture of generators for lo…
levmckinney Aug 18, 2022
33110aa
Merge branch 'master' into policy_ensemble
levmckinney Aug 18, 2022
8c5433d
improved logic for whether to try and checkpoint policy
levmckinney Aug 18, 2022
83f8b8b
fixed final save checkpoint call
levmckinney Aug 18, 2022
803ffde
fixed log key when using prefix on windows
levmckinney Aug 18, 2022
21e11fe
clarified doc string and added runtime error
levmckinney Aug 22, 2022
5c9fc1e
responded to reviewers comments
levmckinney Aug 22, 2022
e478ec3
fixed logic bug
levmckinney Aug 23, 2022
75c0b80
fixed test
levmckinney Aug 23, 2022
73ace03
added pragma: no cover to test case which needs it
levmckinney Aug 23, 2022
b0b3b93
added doctest to logger explaining behaviour
levmckinney Aug 23, 2022
dc60be8
Merge branch 'master' into policy_ensemble
levmckinney Aug 24, 2022
2958d08
added option to split training steps among the agents and made it def…
levmckinney Aug 24, 2022
303b032
Add type annotations to hierarchical logger
Rocamonde Sep 3, 2022
230774c
Move "is single agent" to explicit bool definition
Rocamonde Sep 3, 2022
d44cc85
Raise error when too few steps to partition.
Rocamonde Sep 3, 2022
07fa830
Formatter
Rocamonde Sep 3, 2022
dd72656
Fix "else" that was accidentally removed.
Rocamonde Sep 3, 2022
709cb42
Roll back change
Rocamonde Sep 3, 2022
06084ad
Added description to tests
levmckinney Sep 5, 2022
a2b790f
Unnested with statements
levmckinney Sep 5, 2022
510abd8
removed duplicate sentence in documentation
levmckinney Sep 5, 2022
232b56e
Apply suggestions from code review
levmckinney Sep 5, 2022
09670d3
Merge branch 'master' into policy_ensemble
levmckinney Sep 6, 2022
a5ab914
Merge branch 'master' into policy_ensemble
levmckinney Sep 13, 2022
408e248
fixed comment in doctest
levmckinney Sep 13, 2022
4df8551
Update src/imitation/algorithms/preference_comparisons.py
Rocamonde Sep 14, 2022
2 changes: 1 addition & 1 deletion setup.cfg
@@ -32,7 +32,7 @@ filterwarnings =
ignore:Using or importing the ABCs from 'collections':DeprecationWarning:(google|pkg_resources)
ignore:Parameters to load are deprecated:Warning:gym
ignore:The binary mode of fromstring is deprecated:DeprecationWarning:gym

addopts = --doctest-modules
markers =
expensive: mark a test as expensive (deselect with '-m "not expensive"')

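The new addopts = --doctest-modules line makes pytest also collect and run doctests embedded in module and function docstrings (the PR adds such a doctest to the logger). A minimal illustration of what that option picks up, using a hypothetical module and function name:

# example_module.py -- hypothetical file; with addopts = --doctest-modules,
# running `pytest` executes the >>> example below as a test.
def join_log_keys(prefix: str, key: str) -> str:
    """Join a logging prefix and a raw key.

    >>> join_log_keys("gen_0", "agent/reward")
    'gen_0/agent/reward'
    """
    return f"{prefix}/{key}"
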
123 changes: 109 additions & 14 deletions src/imitation/algorithms/preference_comparisons.py
@@ -3,6 +3,8 @@
Trains a reward model and optionally a policy based on preferences
between trajectory fragments.
"""
from __future__ import generators

import abc
import math
import pickle
@@ -210,18 +212,19 @@ def train(self, steps: int, **kwargs) -> None:
RuntimeError: Transitions left in `self.buffering_wrapper`; call
`self.sample` first to clear them.
"""
n_transitions = self.buffering_wrapper.n_transitions
if n_transitions:
raise RuntimeError(
f"There are {n_transitions} transitions left in the buffer. "
"Call AgentTrainer.sample() first to clear them.",
with self.logger.accumulate_means("agent"):
n_transitions = self.buffering_wrapper.n_transitions
if n_transitions:
raise RuntimeError(
f"There are {n_transitions} transitions left in the buffer. "
"Call AgentTrainer.sample() first to clear them.",
)
self.algorithm.learn(
total_timesteps=steps,
reset_num_timesteps=False,
callback=self.log_callback,
**kwargs,
)
self.algorithm.learn(
total_timesteps=steps,
reset_num_timesteps=False,
callback=self.log_callback,
**kwargs,
)

def sample(self, steps: int) -> Sequence[types.TrajectoryWithRew]:
agent_trajs, _ = self.buffering_wrapper.pop_finished_trajectories()
@@ -299,6 +302,99 @@ def logger(self, value: imit_logger.HierarchicalLogger):
self.algorithm.set_logger(self.logger)


class MixtureOfTrajectoryGenerators(TrajectoryGenerator):
"""A collection of trajectory generators merged together."""

members: Sequence[TrajectoryGenerator]

def __init__(
self,
members: Sequence[TrajectoryGenerator],
custom_logger: Optional[imit_logger.HierarchicalLogger] = None,
share_training_steps: bool = True,
):
"""Create a mixture of trajectory generators.

Args:
members: Individual trajectory generators that will make up the ensemble.
custom_logger: Custom logger passed to super class.
share_training_steps: If True, training steps are split equally among the
trajectory generators. If False, each trajectory generator trains
for the full number of steps. Defaults to True.

Raises:
ValueError: if fewer than two members are provided.
"""
if len(members) < 2:
raise ValueError(
"MixtureOfTrajectoryGenerators requires at least two member!",
)
self.members = tuple(members)
super().__init__(custom_logger=custom_logger)
self.share_training_steps = share_training_steps

def _partition(self, steps: int) -> Sequence[int]:
"""Partition steps into len(self.members) close to equal parts."""
n = len(self.members)
# Approximately evenly partition work.
d = steps // n
if d == 0:
raise ValueError(
f"Cannot partition only {steps} steps among {n} members!",
)
r = steps % n
partition = [d] * n
for i in range(r):
partition[i] += 1
return partition

def sample(self, steps: int) -> Sequence[TrajectoryWithRew]:
"""Sample a batch of trajectories splitting evenly amongst the mixture members.

Args:
steps: All trajectories taken together should
have at least this many steps.

Returns:
A list of sampled trajectories with rewards (which should
be the environment rewards, not ones from a reward model).
"""
trajectories = []
for s, generator in zip(self._partition(steps), self.members):
trajectories.extend(generator.sample(s))
return trajectories

def train(self, steps: int, **kwargs):
"""Train each trajectory generator.

If self.share_training_steps is set to True, training steps are split equally
among the trajectory generators. Otherwise, each trajectory generator trains
for the full number of steps.

Args:
steps: number of environment steps to train for.
**kwargs: additional keyword arguments passed along to members.
"""
if self.share_training_steps:
steps_to_train = self._partition(steps)
else:
steps_to_train = [steps] * len(self.members)  # every member gets the full budget

for i, (generator, s) in enumerate(zip(self.members, steps_to_train)):
with self.logger.add_prefix(f"gen_{i}"):
generator.train(s, **kwargs)

@property
def logger(self) -> imit_logger.HierarchicalLogger:
return self._logger

@logger.setter
def logger(self, value: imit_logger.HierarchicalLogger):
self._logger = value
for generator in self.members:
generator.logger = value


def _get_trajectories(
trajectories: Sequence[TrajectoryWithRew],
steps: int,
@@ -1523,9 +1619,8 @@ def train(
# at the end of training (where the reward model is presumably best)
if i == self.num_iterations - 1:
num_steps += extra_timesteps
with self.logger.accumulate_means("agent"):
self.logger.log(f"Training agent for {num_steps} timesteps")
self.trajectory_generator.train(steps=num_steps)
self.logger.log(f"Training agent for {num_steps} timesteps")
self.trajectory_generator.train(steps=num_steps)

self.logger.dump(self._iteration)

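Since the diff above only shows the new class in isolation, here is a minimal standalone sketch of the routing behaviour it implements. It deliberately mimics the scheme with a hypothetical DummyGenerator rather than importing imitation, so it is an illustration under those assumptions, not the library API:

from typing import List, Sequence


class DummyGenerator:
    """Stand-in for a TrajectoryGenerator; records the step counts it receives."""

    def __init__(self, name: str) -> None:
        self.name = name
        self.trained: List[int] = []

    def sample(self, steps: int) -> Sequence[str]:
        return [f"{self.name}-trajectories({steps} steps)"]

    def train(self, steps: int) -> None:
        self.trained.append(steps)


def partition(steps: int, n: int) -> List[int]:
    # Same scheme as MixtureOfTrajectoryGenerators._partition: near-equal split,
    # with the remainder spread over the first members.
    d, r = divmod(steps, n)
    if d == 0:
        raise ValueError(f"Cannot partition only {steps} steps among {n} members!")
    return [d + 1] * r + [d] * (n - r)


members = [DummyGenerator(f"gen_{i}") for i in range(3)]

# sample(): each member contributes a near-equal share of the requested steps.
trajectories = []
for s, gen in zip(partition(10, len(members)), members):
    trajectories.extend(gen.sample(s))
print(trajectories)
# ['gen_0-trajectories(4 steps)', 'gen_1-trajectories(3 steps)', 'gen_2-trajectories(3 steps)']

# train() with share_training_steps=True splits the budget the same way; in the
# real class each member's call is wrapped in logger.add_prefix(f"gen_{i}").
for s, gen in zip(partition(10, len(members)), members):
    gen.train(s)
print([gen.trained for gen in members])  # [[4], [3], [3]]
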
2 changes: 2 additions & 0 deletions src/imitation/scripts/config/train_preference_comparisons.py
@@ -43,6 +43,7 @@ def train_defaults():
save_preferences = False # save preference dataset at the end?
agent_path = None # path to a (partially) trained agent to load at the beginning
# type of PreferenceGatherer to use
num_agents = 1 # The number of agents to train the reward against.
gatherer_cls = preference_comparisons.SyntheticGatherer
# arguments passed on to the PreferenceGatherer specified by gatherer_cls
gatherer_kwargs = {}
@@ -59,6 +60,7 @@ def train_defaults():

checkpoint_interval = 0 # Num epochs between saving (<0 disables, =0 final only)
query_schedule = "hyperbolic"
share_training_steps_among_agents = True


@train_preference_comparisons_ex.named_config
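
A hedged sketch of driving the two new config options from Python via Sacred. The import path and the assumption that the default ingredients supply a usable environment come from the repository layout shown in this diff, and are not verified here:

# Hypothetical driver script; the config keys match the defaults added above.
from imitation.scripts.train_preference_comparisons import (
    train_preference_comparisons_ex,
)

run = train_preference_comparisons_ex.run(
    config_updates=dict(
        num_agents=4,  # train the reward model against four policies
        share_training_steps_among_agents=True,  # split the step budget four ways
    ),
)
print(run.result)  # mapping of results returned by the experiment
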
64 changes: 47 additions & 17 deletions src/imitation/scripts/train_preference_comparisons.py
@@ -69,6 +69,7 @@ def train_preference_comparisons(
trajectory_generator_kwargs: Mapping[str, Any],
save_preferences: bool,
agent_path: Optional[str],
num_agents: int,
preference_model_kwargs: Mapping[str, Any],
reward_trainer_kwargs: Mapping[str, Any],
gatherer_cls: Type[preference_comparisons.PreferenceGatherer],
@@ -80,6 +81,7 @@
allow_variable_horizon: bool,
checkpoint_interval: int,
query_schedule: Union[str, type_aliases.Schedule],
share_training_steps_among_agents: bool,
) -> Mapping[str, Any]:
"""Train a reward model using preference comparisons.

@@ -113,6 +115,7 @@
save_preferences: if True, store the final dataset of preferences to disk.
agent_path: if given, initialize the agent using this stored policy
rather than randomly.
num_agents: number of agents to train the reward model against.
preference_model_kwargs: passed to PreferenceModel
reward_trainer_kwargs: passed to BasicRewardTrainer or EnsembleRewardTrainer
gatherer_cls: type of PreferenceGatherer to use (defaults to SyntheticGatherer)
@@ -140,6 +143,9 @@
be allocated to each iteration. "hyperbolic" and "inverse_quadratic"
apportion fewer queries to later iterations when the policy is assumed
to be better and more stable.
share_training_steps_among_agents: If True (default), when training with
num_agents > 1, training steps are split equally among the agents. If
False, all agents train for the full number of steps.

Review thread on this parameter:

Member:
This is fine, even though you could just do integer division of num_steps // num_agents manually as a user.

I might not have bothered to add this feature for two reasons: I don't see that it's clearly more readable to pass this flag than it is to do the above, and I don't see why a user wishing to specify the total number of steps would want the total to be exactly num_steps instead of just a multiple of num_agents.

However, now that you've done it I don't oppose having it, as it doesn't preclude users from still splitting it manually. You might want to add input validation to make sure that each agent has at least one step of training inside MixtureOfTrajectoryGenerators._partition (e.g. raise if steps < n).

Collaborator (author):
I made this the default behavior for the following reason. Often in RL research papers we want to have a specific budget of environment interactions. In this context, I felt having the default behavior respect that budget would make the most sense. However, I do agree, this could be handled at the level of the scripts and reduce the complexity of the business logic a bit.

Member:
That makes sense, thanks for your clarification. I just figured that for a case of e.g. 7 agents, 2113 steps instead of 2114 (a multiple of 7) would not really make a difference, but maybe it does to some users.

Collaborator (author):
> I just figured that for a case of e.g. 7 agents, 2113 steps instead of 2114 (a multiple of 7) would not really make a difference, but maybe it does to some users.

I don't think this would make a difference to most users. I'm mainly concerned with what the default behavior should be. Furthermore, I believe it should be to split training steps equally amongst the agents. Thus, num_steps records the actual budget of environment interactions across all agents.

We could add this at the script level. However, I suspect that downstream users of the core API will also just end up implementing something similar to num_steps // num_agents themselves.

Member:
Having the default behavior preserve the total training timestep budget seems good to me. Integer division // vs something more clever that handles remainders seems unimportant -- we usually have hundreds of thousands of timesteps and single-digit numbers of agents.

Returns:
Rollout statistics from trained policy.
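
To make the budget bookkeeping in the thread above concrete, a tiny sketch using the hypothetical numbers from that discussion (7 agents, 2113 requested steps):

# Total environment interactions under the two modes of
# share_training_steps_among_agents (numbers taken from the review thread).
steps, num_agents = 2113, 7
shared_total = steps  # True (default): ~302 steps per agent, 2113 in total
independent_total = steps * num_agents  # False: every agent trains for all 2113 steps
print(shared_total, independent_total)  # 2113 14791
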
@@ -155,31 +161,55 @@
reward_net.predict_processed,
update_stats=False,
)
if agent_path is None:
agent = rl_common.make_rl_algo(venv, relabel_reward_fn=relabel_reward_fn)
else:
agent = rl_common.load_rl_algo_from_path(
agent_path=agent_path,
venv=venv,
relabel_reward_fn=relabel_reward_fn,
)

if trajectory_path is None:
# Setting the logger here is not necessary (PreferenceComparisons takes care
# of it automatically) but it avoids creating unnecessary loggers.
trajectory_generator = preference_comparisons.AgentTrainer(
if num_agents < 1 or not isinstance(num_agents, int):
raise ValueError("num_agents must be a positive integer!")

def make_agent_trainer(seed: Optional[int] = None):
if agent_path is None:
agent = rl_common.make_rl_algo(
venv,
relabel_reward_fn=relabel_reward_fn,
)
else:
agent = rl_common.load_rl_algo_from_path(
agent_path=agent_path,
venv=venv,
relabel_reward_fn=relabel_reward_fn,
)

Member (review comment on load_rl_algo_from_path):
There'll be hilariously little diversity between agents in this case, but not much we can do there. (Support loading different agents I guess? But that seems overkill for what's a rare use case.)

# Setting the logger here is not really necessary (PreferenceComparisons
# takes care of that automatically) but it avoids creating unnecessary
# loggers
return preference_comparisons.AgentTrainer(
algorithm=agent,
reward_fn=reward_net,
venv=venv,
exploration_frac=exploration_frac,
seed=_seed,
seed=_seed if seed is None else seed,
custom_logger=custom_logger,
**trajectory_generator_kwargs,
)

if trajectory_path is None and num_agents == 1:
single_agent = True
trajectory_generator = make_agent_trainer()
# Stable Baselines will automatically occupy GPU 0 if it is available.
# Let's use the same device as the SB3 agent for the reward model.
reward_net = reward_net.to(trajectory_generator.algorithm.device)
elif trajectory_path is None and num_agents > 1:
single_agent = False
members = [make_agent_trainer(_seed + i) for i in range(num_agents)]
trajectory_generator = preference_comparisons.MixtureOfTrajectoryGenerators(
members=members,
custom_logger=custom_logger,
share_training_steps=share_training_steps_among_agents,
)
# Again using the same device as the SB3 agent
reward_net = reward_net.to(members[0].algorithm.device)
else:
single_agent = False
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not single agent in the sense that it's... zero agent? This is a bit counterintuitive.


if exploration_frac > 0:
raise ValueError(
"exploration_frac can't be set when a trajectory dataset is used",
@@ -251,7 +281,7 @@ def save_callback(iteration_num):
"checkpoints",
f"{iteration_num:04d}",
),
allow_save_policy=bool(trajectory_path is None),
allow_save_policy=single_agent,
)

Member (review comment on allow_save_policy):
We can't save checkpoints if there's multiple agents? That's a little sad.

results = main_trainer.train(
@@ -261,9 +291,9 @@ def save_callback(iteration_num):
)

# Storing and evaluating policy only useful if we generated trajectory data
if bool(trajectory_path is None):
if trajectory_path is None and single_agent:
results = dict(results)
results["rollout"] = train.eval_policy(agent, venv)
results["rollout"] = train.eval_policy(trajectory_generator.algorithm, venv)

if save_preferences:
main_trainer.dataset.save(os.path.join(log_dir, "preferences.pkl"))
@@ -273,7 +303,7 @@ def save_callback(iteration_num):
save_checkpoint(
trainer=main_trainer,
save_path=os.path.join(log_dir, "checkpoints", "final"),
allow_save_policy=bool(trajectory_path is None),
allow_save_policy=single_agent,
)

return results