Update trainer tests; don't duplicate trainer setup, and also test some more combos via pytest parametrize

kavanase · kavanase · commit 27dcae02340d · 2024-07-02T14:27:19.000-04:00
diff --git a/tests/unit/trainer/test_trainer.py b/tests/unit/trainer/test_trainer.py
@@ -44,36 +44,16 @@ def dummy_builder():
     early_stopping_lower_bounds={"LR": 1e-10},
     model_builders=[dummy_builder],
 )
-N_TRAIN_PERCENT = "75%"
-N_VAL_PERCENT = "15%"
-N_TRAIN_PERCENT_100 = "70%"
-N_VAL_PERCENT_100 = "30%"
 
 
-@pytest.fixture(scope="function")
-def trainer(float_tolerance):
-    """
-    Generate a class instance with minimal configurations
-    """
-    conf = minimal_config.copy()
-    conf["default_dtype"] = str(torch.get_default_dtype())[len("torch.") :]
-    model = model_from_config(conf)
-    with tempfile.TemporaryDirectory(prefix="output") as path:
-        conf["root"] = path
-        c = Trainer(model=model, **conf)
-        yield c
-
-
-@pytest.fixture(scope="function")
-def trainer_w_percent_n_train_n_val(float_tolerance):
+def create_trainer(float_tolerance, **kwargs):
     """
     Generate a class instance with minimal configurations,
-    where n_train and n_val are given as percentage of the
-    dataset size.
+    with the option to modify the configurations using
+    kwargs.
     """
     conf = minimal_config.copy()
-    conf["n_train"] = N_TRAIN_PERCENT
-    conf["n_val"] = N_VAL_PERCENT  # note that summed percentages don't have to be 100%
+    conf.update(kwargs)
     conf["default_dtype"] = str(torch.get_default_dtype())[len("torch.") :]
     model = model_from_config(conf)
     with tempfile.TemporaryDirectory(prefix="output") as path:
@@ -83,24 +63,11 @@ def trainer_w_percent_n_train_n_val(float_tolerance):
 
 
 @pytest.fixture(scope="function")
-def trainer_w_percent_n_train_n_val_flooring(float_tolerance):
+def trainer(float_tolerance):
     """
-    Generate a class instance with minimal configurations,
-    where n_train and n_val are given as percentage of the
-    dataset size, summing to 100% but with a split that gives
-    non-integer numbers of frames for n_train and n_val.
-    (i.e. n_train = 70% = 5.6 frames, n_val = 30% = 2.4 frames,
-    so final n_train is 6 and n_val is 2)
+    Generate a class instance with minimal configurations.
     """
-    conf = minimal_config.copy()
-    conf["n_train"] = N_TRAIN_PERCENT_100
-    conf["n_val"] = N_VAL_PERCENT_100
-    conf["default_dtype"] = str(torch.get_default_dtype())[len("torch.") :]
-    model = model_from_config(conf)
-    with tempfile.TemporaryDirectory(prefix="output") as path:
-        conf["root"] = path
-        c = Trainer(model=model, **conf)
-        yield c
+    yield from create_trainer(float_tolerance)
 
 
 class TestTrainerSetUp:
@@ -203,11 +170,25 @@ def test_split(self, trainer, nequip_dataset, mode):
                 assert n_samples == trainer.n_train
 
     @pytest.mark.parametrize("mode", ["random", "sequential"])
+    @pytest.mark.parametrize(
+        "n_train_percent, n_val_percent", [("75%", "15%"), ("20%", "30%")]
+    )
     def test_split_w_percent_n_train_n_val(
-        self, trainer_w_percent_n_train_n_val, nequip_dataset, mode
+        self, nequip_dataset, mode, float_tolerance, n_train_percent, n_val_percent
     ):
+        """
+        Test case where n_train and n_val are given as percentage of the
+        dataset size, and here they don't sum to 100%.
+        """
         # nequip_dataset has 8 frames, so setting n_train to 75% and n_val to 15% should give 6 and 1
-        # frames respectively
+        # frames respectively. Note that summed percentages don't have to be 100%
+        trainer_w_percent_n_train_n_val = next(
+            create_trainer(
+                float_tolerance=float_tolerance,
+                n_train=n_train_percent,
+                n_val=n_val_percent,
+            )
+        )
         trainer_w_percent_n_train_n_val.train_val_split = mode
         trainer_w_percent_n_train_n_val.set_dataset(nequip_dataset)
         for epoch_i in range(3):
@@ -222,29 +203,46 @@ def test_split_w_percent_n_train_n_val(
                 assert (
                     n_samples != trainer_w_percent_n_train_n_val.n_train
                 )  # n_train now a percentage
-                assert trainer_w_percent_n_train_n_val.n_train == N_TRAIN_PERCENT  # 75%
+                assert trainer_w_percent_n_train_n_val.n_train == n_train_percent  # 75%
                 assert n_samples == int(
-                    (float(N_TRAIN_PERCENT.strip("%")) / 100) * len(nequip_dataset)
+                    (float(n_train_percent.strip("%")) / 100) * len(nequip_dataset)
                 )  # 6
-                assert trainer_w_percent_n_train_n_val.n_val == N_VAL_PERCENT  # 15%
+                assert trainer_w_percent_n_train_n_val.n_val == n_val_percent  # 15%
 
             for i, batch in enumerate(trainer_w_percent_n_train_n_val.dl_val):
                 n_val_samples += batch[AtomicDataDict.BATCH_PTR_KEY].shape[0] - 1
 
             assert (
                 n_val_samples != trainer_w_percent_n_train_n_val.n_val
             )  # n_val now a percentage
-            assert trainer_w_percent_n_train_n_val.n_val == N_VAL_PERCENT  # 15%
+            assert trainer_w_percent_n_train_n_val.n_val == n_val_percent  # 15%
             assert n_val_samples == int(
-                (float(N_VAL_PERCENT.strip("%")) / 100) * len(nequip_dataset)
+                (float(n_val_percent.strip("%")) / 100) * len(nequip_dataset)
             )  # 1 (floored)
 
     @pytest.mark.parametrize("mode", ["random", "sequential"])
+    @pytest.mark.parametrize(
+        "n_train_percent, n_val_percent", [("70%", "30%"), ("55%", "45%")]
+    )
     def test_split_w_percent_n_train_n_val_flooring(
-        self, trainer_w_percent_n_train_n_val_flooring, nequip_dataset, mode
+        self, nequip_dataset, mode, float_tolerance, n_train_percent, n_val_percent
     ):
+        """
+        Test case where n_train and n_val are given as percentage of the
+        dataset size, summing to 100% but with a split that gives
+        non-integer numbers of frames for n_train and n_val.
+        (i.e. n_train = 70% = 5.6 frames, n_val = 30% = 2.4 frames,
+        so final n_train is 6 and n_val is 2)
+        """
         # nequip_dataset has 8 frames, so n_train = 70% = 5.6 frames, n_val = 30% = 2.4 frames,
         # so final n_train is 6 and n_val is 2
+        trainer_w_percent_n_train_n_val_flooring = next(
+            create_trainer(
+                float_tolerance=float_tolerance,
+                n_train=n_train_percent,
+                n_val=n_val_percent,
+            )
+        )
         trainer_w_percent_n_train_n_val_flooring.train_val_split = mode
         trainer_w_percent_n_train_n_val_flooring.set_dataset(nequip_dataset)
         for epoch_i in range(3):
@@ -267,23 +265,21 @@ def test_split_w_percent_n_train_n_val_flooring(
                     n_samples != trainer_w_percent_n_train_n_val_flooring.n_train
                 )  # n_train now a percentage
                 assert (
-                    trainer_w_percent_n_train_n_val_flooring.n_train
-                    == N_TRAIN_PERCENT_100
+                    trainer_w_percent_n_train_n_val_flooring.n_train == n_train_percent
                 )  # 70%
                 # _not_ equal to the bare floored value now:
                 assert n_samples != int(
-                    (float(N_TRAIN_PERCENT_100.strip("%")) / 100) * len(nequip_dataset)
+                    (float(n_train_percent.strip("%")) / 100) * len(nequip_dataset)
                 )  # 5
                 assert (
                     n_samples
                     == int(  # equal to floored value plus 1
-                        (float(N_TRAIN_PERCENT_100.strip("%")) / 100)
-                        * len(nequip_dataset)
+                        (float(n_train_percent.strip("%")) / 100) * len(nequip_dataset)
                     )
                     + 1
                 )  # 6
                 assert (
-                    trainer_w_percent_n_train_n_val_flooring.n_val == N_VAL_PERCENT_100
+                    trainer_w_percent_n_train_n_val_flooring.n_val == n_val_percent
                 )  # 30%
 
             for i, batch in enumerate(trainer_w_percent_n_train_n_val_flooring.dl_val):
@@ -293,10 +289,10 @@ def test_split_w_percent_n_train_n_val_flooring(
                 n_val_samples != trainer_w_percent_n_train_n_val_flooring.n_val
             )  # n_val now a percentage
             assert (
-                trainer_w_percent_n_train_n_val_flooring.n_val == N_VAL_PERCENT_100
+                trainer_w_percent_n_train_n_val_flooring.n_val == n_val_percent
             )  # 30%
             assert n_val_samples == int(
-                (float(N_VAL_PERCENT_100.strip("%")) / 100) * len(nequip_dataset)
+                (float(n_val_percent.strip("%")) / 100) * len(nequip_dataset)
             )  # 2 (floored)
 
             assert n_samples + n_val_samples == len(nequip_dataset)  # 100% coverage