release 0.4.38
MamadouSDiallo committed Feb 18, 2025
1 parent 6ae973b commit a765a7a
Showing 5 changed files with 159 additions and 45 deletions.
pyproject.toml (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
[project]
name = "samplics"
version = "0.4.37"
version = "0.4.38"
description = "Select, weight and analyze complex sample data"

authors = [{ name = "Mamadou S Diallo", email = "msdiallo@samplics.org" }]
src/samplics/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -114,4 +114,4 @@
"SinglePSUError",
]

__version__ = "0.4.37"
__version__ = "0.4.38"
src/samplics/categorical/tabulation.py (194 changes: 154 additions & 40 deletions)
@@ -7,6 +7,7 @@
from __future__ import annotations

import itertools
import warnings

from typing import Optional, Union

@@ -58,7 +59,9 @@ def __str__(self) -> str:
tbl_subhead1 = f" Number of strata: {self.design_info['nb_strata']}"
tbl_subhead2 = f" Number of PSUs: {self.design_info['nb_psus']}"
tbl_subhead3 = f" Number of observations: {self.design_info['nb_obs']}"
tbl_subhead4 = f" Degrees of freedom: {self.design_info['degrees_of_freedom']:.2f}"
tbl_subhead4 = (
f" Degrees of freedom: {self.design_info['degrees_of_freedom']:.2f}"
)

return f"\n{tbl_head}\n{tbl_subhead1}\n{tbl_subhead2}\n{tbl_subhead3}\n{tbl_subhead4}\n\n {self.to_dataframe().to_string(index=False)}\n"

@@ -73,7 +76,9 @@ def _estimate(
fpc: Union[dict, float] = 1,
deff: bool = False,
coef_var: bool = False,
single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error,
single_psu: Union[
SinglePSUEst, dict[StringNumber, SinglePSUEst]
] = SinglePSUEst.error,
strata_comb: Optional[dict[Array, Array]] = None,
remove_nan: bool = False,
) -> tuple[TaylorEstimator, list, int]:
@@ -87,10 +92,14 @@
ssu,
)
if var.ndim == 1: # Series
to_keep = to_keep & remove_nans(var.values.ravel().shape[0], var.values.ravel())
to_keep = to_keep & remove_nans(
var.values.ravel().shape[0], var.values.ravel()
)
elif var.ndim == 2: # DataFrame
for col in var.columns:
to_keep = to_keep & remove_nans(var.values.ravel().shape[0], var[col].values.ravel())
to_keep = to_keep & remove_nans(
var.values.ravel().shape[0], var[col].values.ravel()
)
else:
raise DimensionError("The dimension must be 1 or 2.")
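
Note: this hunk builds a single boolean keep-mask, AND-ing a non-missing mask for each analysis column onto the mask from the design variables. A minimal standalone sketch of the same pattern, with a hypothetical keep_mask helper standing in for samplics' remove_nans (whose full signature is not shown in this diff):

import numpy as np
import pandas as pd

def keep_mask(values: np.ndarray) -> np.ndarray:
    # True where an entry is usable (neither None nor NaN)
    return ~pd.isna(values)

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, np.nan]})
to_keep = np.ones(df.shape[0], dtype=bool)
for col in df.columns:  # column-wise AND, as in the DataFrame branch above
    to_keep = to_keep & keep_mask(df[col].to_numpy())
df = df.loc[to_keep]  # only the first (complete) row survives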

@@ -153,7 +162,9 @@ def tabulate(
fpc: Union[dict, float] = 1,
deff: bool = False,
coef_var: bool = False,
single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error,
single_psu: Union[
SinglePSUEst, dict[StringNumber, SinglePSUEst]
] = SinglePSUEst.error,
strata_comb: Optional[dict[Array, Array]] = None,
remove_nan: bool = False,
) -> None:
@@ -179,12 +190,22 @@
vars_names = set_variables_names(vars, varnames, prefix)

if len(vars_names) != nb_vars:
raise AssertionError("Length of varnames must be the same as the number of columns of vars")
raise AssertionError(
"Length of varnames must be the same as the number of columns of vars"
)

_samp_weight = numpy_array(samp_weight)

_samp_weight = np.ones(vars_df.shape[0]) if _samp_weight.shape in ((), (0,)) else _samp_weight
_samp_weight = np.repeat(_samp_weight, vars_df.shape[0]) if _samp_weight.shape[0] == 1 else _samp_weight
_samp_weight = (
np.ones(vars_df.shape[0])
if _samp_weight.shape in ((), (0,))
else _samp_weight
)
_samp_weight = (
np.repeat(_samp_weight, vars_df.shape[0])
if _samp_weight.shape[0] == 1
else _samp_weight
)
_stratum = numpy_array(stratum)
_psu = numpy_array(psu)
_ssu = numpy_array(ssu)
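
Note: the two _samp_weight assignments above normalize the weight input; an empty input means an unweighted design, and a single value is broadcast to every observation. A standalone sketch of that convention (the helper name is illustrative, not the samplics API):

import numpy as np

def normalize_weights(samp_weight, n_rows: int) -> np.ndarray:
    w = np.asarray(samp_weight)
    if w.shape in ((), (0,)):  # nothing supplied: unweighted design
        return np.ones(n_rows)
    if w.shape[0] == 1:  # one value supplied: same weight for every row
        return np.repeat(w, n_rows)
    return w  # already one weight per row

normalize_weights([], 4)     # array([1., 1., 1., 1.])
normalize_weights([2.5], 4)  # array([2.5, 2.5, 2.5, 2.5])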
@@ -272,7 +293,9 @@ def to_dataframe(
oneway_df = pd.DataFrame([])

for var in self.vars_names:
var_df = pd.DataFrame(np.repeat(var, len(self.vars_levels[var])), columns=["variable"])
var_df = pd.DataFrame(
np.repeat(var, len(self.vars_levels[var])), columns=["variable"]
)
var_df["category"] = self.vars_levels[var]
var_df[self.param] = list(self.point_est[var].values())
var_df["stderror"] = list(self.stderror[var].values())
@@ -331,11 +354,15 @@ def __str__(self) -> str:
if self.vars_names == []:
return "No categorical variables to tabulate"
else:
tbl_head = f"Cross-tabulation of {self.vars_names[0]} and {self.vars_names[1]}"
tbl_head = (
f"Cross-tabulation of {self.vars_names[0]} and {self.vars_names[1]}"
)
tbl_subhead1 = f" Number of strata: {self.design_info['nb_strata']}"
tbl_subhead2 = f" Number of PSUs: {self.design_info['nb_psus']}"
tbl_subhead3 = f" Number of observations: {self.design_info['nb_obs']}"
tbl_subhead4 = f" Degrees of freedom: {self.design_info['degrees_of_freedom']:.2f}"
tbl_subhead4 = (
f" Degrees of freedom: {self.design_info['degrees_of_freedom']:.2f}"
)

chisq_dist = f"chi2({self.stats['Pearson-Unadj']['df']})"
f_dist = f"F({self.stats['Pearson-Adj']['df_num']:.2f}, {self.stats['Pearson-Adj']['df_den']:.2f}"
@@ -351,7 +378,9 @@ def __str__(self) -> str:
return f"\n{tbl_head}\n{tbl_subhead1}\n{tbl_subhead2}\n{tbl_subhead3}\n{tbl_subhead4}\n\n {self.to_dataframe().to_string(index=False)}\n\n{pearson_test}\n\n {lr_test}\n"

# also mutates tbl_est
def _extract_estimates(self, tbl_est, vars_levels) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
def _extract_estimates(
self, tbl_est, vars_levels
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
levels = list(tbl_est.point_est.keys())
missing_levels = vars_levels[~np.isin(vars_levels, levels)]
if missing_levels.shape[0] > 0:
@@ -367,7 +396,9 @@ def _extract_estimates(self, tbl_est, vars_levels) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
else:
for ll in missing_levels:
tbl_est.covariance[level][ll] = 0.0
tbl_est.covariance[level] = dict(sorted(tbl_est.covariance[level].items()))
tbl_est.covariance[level] = dict(
sorted(tbl_est.covariance[level].items())
)

_tbl_est_point_est = dict(sorted(tbl_est.point_est.items()))
_tbl_est_covariance = dict(sorted(tbl_est.covariance.items()))
@@ -389,7 +420,9 @@
fpc: Union[dict, float] = 1,
deff: bool = False,
coef_var: bool = False,
single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error,
single_psu: Union[
SinglePSUEst, dict[StringNumber, SinglePSUEst]
] = SinglePSUEst.error,
strata_comb: Optional[dict[Array, Array]] = None,
remove_nan: bool = False,
) -> None:
@@ -428,8 +461,16 @@

df = vars.with_columns(
samp_weight=numpy_array(samp_weight),
stratum=(np.repeat("__none__", vars.shape[0]) if stratum is None else numpy_array(stratum)),
psu=(np.linspace(1, vars.shape[0], num=vars.shape[0]) if psu is None else numpy_array(psu)),
stratum=(
np.repeat("__none__", vars.shape[0])
if stratum is None
else numpy_array(stratum)
),
psu=(
np.linspace(1, vars.shape[0], num=vars.shape[0])
if psu is None
else numpy_array(psu)
),
ssu=np.repeat(1, vars.shape[0]) if ssu is None else numpy_array(ssu),
).filter(pl.col("samp_weight") > 0)
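
Note: the with_columns call above fills in design defaults when the caller omits them: one pseudo-stratum, each observation as its own PSU, a constant SSU, and only strictly positive weights are kept. A self-contained sketch on toy data (column names as in the diff):

import numpy as np
import polars as pl

vars = pl.DataFrame({"region": ["a", "b", "c"], "status": ["u", "u", "v"]})
n = vars.shape[0]

df = vars.with_columns(
    samp_weight=pl.Series(np.array([1.0, 0.0, 2.0])),
    stratum=pl.Series(np.repeat("__none__", n)),  # single pseudo-stratum
    psu=pl.Series(np.linspace(1, n, num=n)),      # each row is its own PSU
    ssu=pl.Series(np.repeat(1, n)),
).filter(pl.col("samp_weight") > 0)  # the zero-weight row is dropped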

@@ -447,9 +488,18 @@
if remove_nan:
df = (
df.filter(
(pl.col(vars_names[0]).is_not_null() & ~pl.col(vars_names[0]).eq("NaN"))
& (pl.col(vars_names[1]).is_not_null() & ~pl.col(vars_names[1]).eq("NaN"))
& (pl.col("samp_weight").is_not_null() & pl.col("samp_weight").is_not_nan())
(
pl.col(vars_names[0]).is_not_null()
& ~pl.col(vars_names[0]).eq("NaN")
)
& (
pl.col(vars_names[1]).is_not_null()
& ~pl.col(vars_names[1]).eq("NaN")
)
& (
pl.col("samp_weight").is_not_null()
& pl.col("samp_weight").is_not_nan()
)
& (pl.col("stratum").is_not_null() & ~pl.col("stratum").eq("NaN"))
& (pl.col("psu").is_not_null() & ~pl.col("psu").eq("NaN"))
& (pl.col("ssu").is_not_null() & ~pl.col("ssu").eq("NaN"))
@@ -482,9 +532,11 @@
if len(df.shape) == 2:
vars_for_oneway = (
df.select(vars_names)
.with_columns((pl.col(vars_names[0]) + "__by__" + pl.col(vars_names[1])).alias("__cross_vars__"))[
"__cross_vars__"
]
.with_columns(
(pl.col(vars_names[0]) + "__by__" + pl.col(vars_names[1])).alias(
"__cross_vars__"
)
)["__cross_vars__"]
.to_numpy()
)
else:
@@ -530,7 +582,9 @@
raise ValueError("parameter must be 'count' or 'proportion'")

cov_est_srs = (
np.diag(cell_est) - cell_est.reshape((cell_est.shape[0], 1)) @ cell_est.reshape((1, cell_est.shape[0]))
np.diag(cell_est)
- cell_est.reshape((cell_est.shape[0], 1))
@ cell_est.reshape((1, cell_est.shape[0]))
) / df.shape[0]
# cov_est_srs = cov_est_srs * ((df.shape[0] - 1) / df.shape[0])
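
Note: cov_est_srs above is the multinomial covariance of the estimated cell proportions under simple random sampling, Cov(p_hat) = (diag(p_hat) - p_hat p_hat^T) / n. The same quantity, restated compactly:

import numpy as np

p_hat = np.array([0.2, 0.3, 0.5])  # estimated cell proportions
n = 100                            # number of observations

cov_srs = (np.diag(p_hat) - np.outer(p_hat, p_hat)) / n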

@@ -558,11 +612,34 @@

try:
x1_t = np.transpose(x1)
x2_tilde = x2 - x1 @ np.linalg.inv(x1_t @ cov_est_srs @ x1) @ (x1_t @ cov_est_srs @ x2)
delta_est = np.linalg.inv(np.transpose(x2_tilde) @ cov_est_srs @ x2_tilde) @ (
np.transpose(x2_tilde) @ cov_est @ x2_tilde
x2_tilde = x2 - x1 @ np.linalg.inv(x1_t @ cov_est_srs @ x1) @ (
x1_t @ cov_est_srs @ x2
)
delta_est = np.linalg.inv(
np.transpose(x2_tilde) @ cov_est_srs @ x2_tilde
) @ (np.transpose(x2_tilde) @ cov_est @ x2_tilde)
except np.linalg.LinAlgError:
# eps = 1e-20
# x1_t = np.transpose(x1)
# x2_tilde = x2 - x1 @ np.linalg.inv(
# x1_t @ cov_est_srs @ x1 + eps * np.eye(x1.shape[1])
# ) @ (x1_t @ cov_est_srs @ x2)
# delta_est = np.linalg.inv(
# np.transpose(x2_tilde) @ cov_est_srs @ x2_tilde
# + eps * np.eye(x2_tilde.shape[1])
# ) @ (np.transpose(x2_tilde) @ cov_est @ x2_tilde)
x1_t = np.transpose(x1)
x2_tilde = x2 - x1 @ np.linalg.pinv(
x1_t @ cov_est_srs @ x1, rcond=1e-15, hermitian=True
) @ (x1_t @ cov_est_srs @ x2)
delta_est = np.linalg.pinv(
np.transpose(x2_tilde) @ cov_est_srs @ x2_tilde,
rcond=1e-15,
hermitian=True,
) @ (np.transpose(x2_tilde) @ cov_est @ x2_tilde)

warnings.warn("Matrix is computationally singular, results may be inaccurate")
except:
delta_est = np.zeros((nrows * ncols, nrows * ncols))

keys = list(tbl_est.point_est.keys())
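
Note: the functional change in this hunk is the new fallback path. When the SRS covariance matrix is numerically singular, np.linalg.inv raises LinAlgError; the code now substitutes the Moore-Penrose pseudo-inverse and warns instead of failing. The hermitian=True flag is safe here because the matrices being inverted are symmetric. A minimal reproduction of the pattern:

import warnings

import numpy as np

m = np.array([[1.0, 2.0], [2.0, 4.0]])  # rank 1, so not invertible

try:
    m_inv = np.linalg.inv(m)
except np.linalg.LinAlgError:
    # pinv is defined for any matrix and equals inv(m) whenever m is invertible
    m_inv = np.linalg.pinv(m, rcond=1e-15, hermitian=True)
    warnings.warn("Matrix is computationally singular, results may be inaccurate")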
@@ -592,10 +669,18 @@
.drop("key")
)

poin_est_dict = tbl_df.select(vars_names + ["point_est"]).rows_by_key(key=vars_names[0])
stderror_dict = tbl_df.select(vars_names + ["stderror"]).rows_by_key(key=vars_names[0])
lower_ci_dict = tbl_df.select(vars_names + ["lower_ci"]).rows_by_key(key=vars_names[0])
upper_ci_dict = tbl_df.select(vars_names + ["upper_ci"]).rows_by_key(key=vars_names[0])
poin_est_dict = tbl_df.select(vars_names + ["point_est"]).rows_by_key(
key=vars_names[0]
)
stderror_dict = tbl_df.select(vars_names + ["stderror"]).rows_by_key(
key=vars_names[0]
)
lower_ci_dict = tbl_df.select(vars_names + ["lower_ci"]).rows_by_key(
key=vars_names[0]
)
upper_ci_dict = tbl_df.select(vars_names + ["upper_ci"]).rows_by_key(
key=vars_names[0]
)

for var1 in poin_est_dict:
point_est = {}
@@ -613,31 +698,52 @@
self.upper_ci[var1] = upper_ci

if self.param == PopParam.count:
tbl_df = tbl_df.with_columns((pl.col("point_est") / pl.col("point_est").sum()).alias("est_prop"))
tbl_df = tbl_df.with_columns(
(pl.col("point_est") / pl.col("point_est").sum()).alias("est_prop")
)
elif self.param == PopParam.prop:
tbl_df = tbl_df.with_columns(est_prop=pl.col("point_est"))
else:
raise ValueError("parameter must be 'count' or 'proportion'")

tbl_df = (
tbl_df.join(
other=tbl_df.group_by(vars_names[0]).agg(pl.col("est_prop").sum().alias("est_sum_var1")),
other=tbl_df.group_by(vars_names[0]).agg(
pl.col("est_prop").sum().alias("est_sum_var1")
),
on=vars_names[0],
how="inner",
)
.join(
other=tbl_df.group_by(vars_names[1]).agg(pl.col("est_prop").sum().alias("est_sum_var2")),
other=tbl_df.group_by(vars_names[1]).agg(
pl.col("est_prop").sum().alias("est_sum_var2")
),
on=vars_names[1],
how="inner",
)
.with_columns(est_prop_null=pl.col("est_sum_var1") * pl.col("est_sum_var2") * pl.col("est_prop").sum())
.with_columns(
est_prop_null=pl.col("est_sum_var1")
* pl.col("est_sum_var2")
* pl.col("est_prop").sum()
)
)

chisq_p = df.shape[0] * ((tbl_df["est_prop"] - tbl_df["est_prop_null"]) ** 2 / tbl_df["est_prop_null"]).sum()
chisq_p = (
df.shape[0]
* (
(tbl_df["est_prop"] - tbl_df["est_prop_null"]) ** 2
/ tbl_df["est_prop_null"]
).sum()
)
chisq_lr = (
2
* df.shape[0]
* (tbl_df["est_prop"] * (tbl_df["est_prop"] / tbl_df["est_prop_null"]).log()).fill_nan(0).sum()
* (
tbl_df["est_prop"]
* (tbl_df["est_prop"] / tbl_df["est_prop_null"]).log()
)
.fill_nan(0)
.sum()
)

trace_delta = np.trace(delta_est)
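
Note: chisq_p and chisq_lr above are the unadjusted Pearson and likelihood-ratio statistics against the independence null, where est_prop_null is the product of the row and column margins; trace(delta_est) then drives the Rao-Scott design-effect correction. The two statistics in plain numpy (illustrative numbers):

import numpy as np

n = 100
p_hat = np.array([0.15, 0.35, 0.25, 0.25])   # estimated cell proportions
p_null = np.array([0.20, 0.30, 0.20, 0.30])  # row margin x column margin

chisq_pearson = n * np.sum((p_hat - p_null) ** 2 / p_null)
chisq_lr = 2 * n * np.sum(p_hat * np.log(p_hat / p_null))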
@@ -698,19 +804,27 @@ def to_dataframe(
for _ in range(len(self.row_levels)):
for _ in range(len(self.col_levels)):
twoway_df[self.param] = sum(
pd.DataFrame.from_dict(self.point_est, orient="index").values.tolist(),
pd.DataFrame.from_dict(
self.point_est, orient="index"
).values.tolist(),
[],
)
twoway_df["stderror"] = sum(
pd.DataFrame.from_dict(self.stderror, orient="index").values.tolist(),
pd.DataFrame.from_dict(
self.stderror, orient="index"
).values.tolist(),
[],
)
twoway_df["lower_ci"] = sum(
pd.DataFrame.from_dict(self.lower_ci, orient="index").values.tolist(),
pd.DataFrame.from_dict(
self.lower_ci, orient="index"
).values.tolist(),
[],
)
twoway_df["upper_ci"] = sum(
pd.DataFrame.from_dict(self.upper_ci, orient="index").values.tolist(),
pd.DataFrame.from_dict(
self.upper_ci, orient="index"
).values.tolist(),
[],
)
# twoway_df.sort_values(by=self.vars_names, inplace=True)
src/samplics/utils/__init__.py (4 changes: 2 additions & 2 deletions)
@@ -15,7 +15,7 @@
SelectMethod,
SinglePSUEst,
SizeMethod,
ModelType
ModelType,
)


@@ -34,5 +34,5 @@
"SinglePSUError",
"ProbError",
"MethodError",
"ModelType"
"ModelType",
]
(The diff for the fifth changed file did not load on the page.)
