Change cov filling

paulf81 · paulf81 · commit 85f0e04dd82e · 2024-12-03T13:59:33.000-07:00
diff --git a/flasc/analysis/expected_power_analysis.py b/flasc/analysis/expected_power_analysis.py
@@ -18,8 +18,8 @@
     _fill_cov_with_var,
     _null_and_sync_covariance,
     _set_cov_to_zero,
-    _synchronize_cov_nulls_back_to_mean,
     _synchronize_nulls,
+    _synchronize_var_nulls_back_to_mean,
 )
 from flasc.data_processing.dataframe_manipulations import df_reduce_precision
 from flasc.logging_manager import LoggingManager
@@ -134,11 +134,6 @@ def _total_uplift_expected_power_single(
             / df_sum.filter(pl.col("df_name") == uplift_pair[0])["weighted_power"].to_numpy()[0]
         )
 
-    # with pl.Config(tbl_cols=-1):
-    #     print(df_bin)
-    #     print(df_sum)
-    #     print(uplift_results)
-
     return df_bin, df_sum, uplift_results
 
 
@@ -259,6 +254,7 @@ def _total_uplift_expected_power_with_standard_error(
     percentiles: List[float] = [2.5, 97.5],
     remove_any_null_turbine_bins: bool = False,
     set_cov_to_zero_or_var: str = "zero",
+    use_cov_when_available: bool = False,
     # variance_only: bool = False,
     # fill_cov_with_var: bool = False,
 ) -> Dict[str, Dict[str, float]]:
@@ -289,6 +285,8 @@ def _total_uplift_expected_power_with_standard_error(
             and of the test turbines is null.  Defaults to False.
         set_cov_to_zero_or_var (str): Set the covariance to zero or product of variances.
             Can be "zero" or "var". Defaults to "zero".
+        use_cov_when_available (bool): Use the covariance terms when available. If True,
+            set_cov_to_zero_or_var must be 'var', else a ValueError is raised. Defaults to False.
 
 
     Returns:
@@ -311,30 +309,38 @@ def _total_uplift_expected_power_with_standard_error(
         ws_max=ws_max,
     )
 
-    with pl.Config(tbl_cols=-1):
-        print(df_)
+    # with pl.Config(tbl_cols=-1):
+    #     print(df_)
 
     # Compute the covariance frame
     df_cov = _compute_covariance(
         df_, test_cols=test_cols, bin_cols_with_df_name=bin_cols_with_df_name
     )
 
-    with pl.Config(tbl_cols=-1):
-        print(df_cov)
+    # with pl.Config(tbl_cols=-1):
+    #     print(df_cov)
 
     # In current version of code, covarariances are either set to 0 or set to
     # product of variances
     if set_cov_to_zero_or_var == "zero":
-        df_cov = _set_cov_to_zero(df_cov, test_cols=test_cols)
+        if use_cov_when_available:
+            raise ValueError(
+                "use_cov_when_available cannot be True when set_cov_to_zero_or_var is 'zero'"
+            )
+        else:
+            df_cov = _set_cov_to_zero(df_cov, test_cols=test_cols)
     elif set_cov_to_zero_or_var == "var":
-        df_cov = _fill_cov_with_var(df_cov, test_cols=test_cols, fill_all=True)
+        if use_cov_when_available:
+            df_cov = _fill_cov_with_var(df_cov, test_cols=test_cols, fill_all=False)
+        else:
+            df_cov = _fill_cov_with_var(df_cov, test_cols=test_cols, fill_all=True)
     else:
         raise ValueError(
             f"set_cov_to_zero_or_var must be 'zero' or 'var', not {set_cov_to_zero_or_var}"
         )
 
-    with pl.Config(tbl_cols=-1):
-        print(df_cov)
+    # with pl.Config(tbl_cols=-1):
+    #     print(df_cov)
 
     # # If filling missing covariance terms, do it now
     # if fill_cov_with_var:
@@ -352,8 +358,8 @@ def _total_uplift_expected_power_with_standard_error(
         bin_cols_without_df_name=bin_cols_without_df_name,
     )
 
-    with pl.Config(tbl_cols=-1):
-        print(df_cov)
+    # with pl.Config(tbl_cols=-1):
+    #     print(df_cov)
 
     # Bin and group the dataframe
     df_bin = _bin_and_group_dataframe_expected_power(
@@ -362,14 +368,17 @@ def _total_uplift_expected_power_with_standard_error(
         bin_cols_without_df_name=bin_cols_without_df_name,
     )
 
-    with pl.Config(tbl_cols=-1):
-        print(df_bin)
+    # with pl.Config(tbl_cols=-1):
+    #     print(df_bin)
 
     # Join the covariance dataframe to df_bin
     df_bin = df_bin.join(df_cov, on=bin_cols_with_df_name, how="left")
 
-    # Synchronize any null values in the covariance back to df_bin
-    df_bin = _synchronize_cov_nulls_back_to_mean(df_bin=df_bin, test_cols=test_cols)
+    # Dropped as redundant with copying null variances back to the mean:
+    # # Synchronize any null values in the covariance back to df_bin
+    # df_bin = _synchronize_cov_nulls_back_to_mean(df_bin=df_bin, test_cols=test_cols)
+
+    df_bin = _synchronize_var_nulls_back_to_mean(df_bin=df_bin, test_cols=test_cols)
 
     with pl.Config(tbl_cols=-1):
         print(df_bin)
diff --git a/flasc/analysis/expected_power_analysis_utilities.py b/flasc/analysis/expected_power_analysis_utilities.py
@@ -405,14 +405,15 @@ def _set_cov_to_zero(
     return df_cov
 
 
-def _synchronize_cov_nulls_back_to_mean(
+def _synchronize_var_nulls_back_to_mean(
     df_bin: pl.DataFrame,
     test_cols: List[str],
 ) -> pl.DataFrame:
-    """For each row, for any turbine with a null var or cov, null mean power.
+    """For each row, null the mean power of any turbine whose variance is null.
 
-    For each row, if there are any turbines in df_cov with undefined variances or covariances
-      (because count < 2), then the mean power for those turbines would get set to Null as well.
+    For each row, if any turbine has an undefined (null) variance
+      (because count < 2), then the mean power for
+      that turbine is set to null as well.
 
     Args:
         df_bin (pl.DataFrame): A polars dataframe with the mean and variance of the test
@@ -423,25 +424,63 @@ def _synchronize_cov_nulls_back_to_mean(
         pl.DataFrame: Update df_bin dataframe
     """
     n_test_cols = len(test_cols)
-    all_cov_cols = [f"cov_{t1}_{t2}" for t1, t2 in product(test_cols, test_cols)]
+    # all_cov_cols = [f"cov_{t1}_{t2}" for t1, t2 in product(test_cols, test_cols)]
 
     # Loop over all combinations of test columns for the mean column
     for t1_idx in range(n_test_cols):
         t1 = test_cols[t1_idx]
         t1_mean_col = f"{t1}_mean"
-
-        # Get a list of cov_cols that include the present turbine
-        cov_cols = [c for c in all_cov_cols if t1 in c]
-
-        # Get a mask for the rows where any of the cov_cols are null
-        # using the horizontal or operator
-        mask = df_bin.select(
-            pl.any_horizontal([pl.col(c).is_null() for c in cov_cols]).alias("mask")
-        )
+        t1_var_col = f"cov_{t1}_{t1}"
 
         # Set the mean power to null for the rows where the mask is true
         df_bin = df_bin.with_columns(
-            pl.when(mask["mask"]).then(None).otherwise(pl.col(t1_mean_col)).alias(t1_mean_col)
+            pl.when(pl.col(t1_var_col).is_null())
+            .then(None)
+            .otherwise(pl.col(t1_mean_col))
+            .alias(t1_mean_col)
         )
 
     return df_bin
+
+
+# Removed as too strict; replaced by _synchronize_var_nulls_back_to_mean above.
+# def _synchronize_cov_nulls_back_to_mean(
+#     df_bin: pl.DataFrame,
+#     test_cols: List[str],
+# ) -> pl.DataFrame:
+#     """For each row, for any turbine with a null var or cov, null mean power.
+
+#     For each row, if there are any turbines in df_cov with undefined variances or covariances
+#       (because count < 2), then the mean power for those turbines would get set to Null as well.
+
+#     Args:
+#         df_bin (pl.DataFrame): A polars dataframe with the mean and variance of the test
+#             columns grouped by bin columns.
+#         test_cols (List[str]): A list of column names to calculate the covariance of
+
+#     Returns:
+#         pl.DataFrame: Update df_bin dataframe
+#     """
+#     n_test_cols = len(test_cols)
+#     all_cov_cols = [f"cov_{t1}_{t2}" for t1, t2 in product(test_cols, test_cols)]
+
+#     # Loop over all combinations of test columns for the mean column
+#     for t1_idx in range(n_test_cols):
+#         t1 = test_cols[t1_idx]
+#         t1_mean_col = f"{t1}_mean"
+
+#         # Get a list of cov_cols that include the present turbine
+#         cov_cols = [c for c in all_cov_cols if t1 in c]
+
+#         # Get a mask for the rows where any of the cov_cols are null
+#         # using the horizontal or operator
+#         mask = df_bin.select(
+#             pl.any_horizontal([pl.col(c).is_null() for c in cov_cols]).alias("mask")
+#         )
+
+#         # Set the mean power to null for the rows where the mask is true
+#         df_bin = df_bin.with_columns(
+#             pl.when(mask["mask"]).then(None).otherwise(pl.col(t1_mean_col)).alias(t1_mean_col)
+#         )
+
+#     return df_bin
diff --git a/tests/expected_power_analysis_test.py b/tests/expected_power_analysis_test.py
@@ -18,8 +18,9 @@
     _get_num_points_pair,
     _null_and_sync_covariance,
     _set_cov_to_zero,
-    _synchronize_cov_nulls_back_to_mean,
     _synchronize_nulls,
+    # _synchronize_cov_nulls_back_to_mean,
+    _synchronize_var_nulls_back_to_mean,
 )
 
 
@@ -555,7 +556,7 @@ def test_set_cov_to_zero():
     assert_frame_equal(zero_cov_df, expected_df, check_row_order=False, check_dtypes=False)
 
 
-def test_synchronize_cov_nulls_back_to_mean():
+def test__synchronize_var_nulls_back_to_mean():
     test_df_bin = pl.DataFrame(
         {
             "wd_bin": [0, 0, 1, 1],
@@ -583,8 +584,8 @@ def test_synchronize_cov_nulls_back_to_mean():
             "wd_bin": [0, 0, 1, 1],
             "ws_bin": [0, 1, 0, 1],
             "df_name": ["baseline", "baseline", "baseline", "baseline"],
-            "pow_000_mean": [1, 2, None, None],
-            "pow_001_mean": [5, 6, None, 8],
+            "pow_000_mean": [1, 2, 3, None],
+            "pow_001_mean": [5, 6, 7, 8],
         }
     )
 
@@ -594,14 +595,61 @@ def test_synchronize_cov_nulls_back_to_mean():
         test_df_cov, on=["wd_bin", "ws_bin", "df_name"], how="left"
     )
 
-    df_res = _synchronize_cov_nulls_back_to_mean(
+    df_res = _synchronize_var_nulls_back_to_mean(
         df_bin=test_df_bin,
         test_cols=["pow_000", "pow_001"],
     )
 
     assert_frame_equal(df_res, expected_df_bin, check_row_order=False, check_dtypes=False)
 
 
+# def test_synchronize_cov_nulls_back_to_mean():
+#     test_df_bin = pl.DataFrame(
+#         {
+#             "wd_bin": [0, 0, 1, 1],
+#             "ws_bin": [0, 1, 0, 1],
+#             "df_name": ["baseline", "baseline", "baseline", "baseline"],
+#             "pow_000_mean": [1, 2, 3, 4],
+#             "pow_001_mean": [5, 6, 7, 8],
+#         }
+#     )
+
+#     test_df_cov = pl.DataFrame(
+#         {
+#             "wd_bin": [0, 0, 1, 1],
+#             "ws_bin": [0, 1, 0, 1],
+#             "df_name": ["baseline", "baseline", "baseline", "baseline"],
+#             "cov_pow_000_pow_000": [1, 2, 3, None],
+#             "cov_pow_000_pow_001": [5, 6, None, 8],
+#             "cov_pow_001_pow_000": [9, 10, 11, 12],
+#             "cov_pow_001_pow_001": [13, 14, 15, 16],
+#         }
+#     )
+
+#     expected_df_bin = pl.DataFrame(
+#         {
+#             "wd_bin": [0, 0, 1, 1],
+#             "ws_bin": [0, 1, 0, 1],
+#             "df_name": ["baseline", "baseline", "baseline", "baseline"],
+#             "pow_000_mean": [1, 2, None, None],
+#             "pow_001_mean": [5, 6, None, 8],
+#         }
+#     )
+
+#     # Join the covariance dataframe to df_bin
+#     test_df_bin = test_df_bin.join(test_df_cov, on=["wd_bin", "ws_bin", "df_name"], how="left")
+#     expected_df_bin = expected_df_bin.join(
+#         test_df_cov, on=["wd_bin", "ws_bin", "df_name"], how="left"
+#     )
+
+#     df_res = _synchronize_cov_nulls_back_to_mean(
+#         df_bin=test_df_bin,
+#         test_cols=["pow_000", "pow_001"],
+#     )
+
+#     assert_frame_equal(df_res, expected_df_bin, check_row_order=False, check_dtypes=False)
+
+
 def test_total_uplift_expected_power_with_standard_error():
     a_in = load_repeated_data()