release 0.4.38
MamadouSDiallo committed Feb 18, 2025
1 parent 6ae973b commit a765a7a
Showing 5 changed files with 159 additions and 45 deletions.
pyproject.toml (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
[project]
name = "samplics"
version = "0.4.37"
version = "0.4.38"
description = "Select, weight and analyze complex sample data"

authors = [{ name = "Mamadou S Diallo", email = "msdiallo@samplics.org" }]
src/samplics/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -114,4 +114,4 @@
"SinglePSUError",
]

__version__ = "0.4.37"
__version__ = "0.4.38"
src/samplics/categorical/tabulation.py (194 changes: 154 additions & 40 deletions)
@@ -7,6 +7,7 @@
from __future__ import annotations

import itertools
import warnings

from typing import Optional, Union

@@ -58,7 +59,9 @@ def __str__(self) -> str:
tbl_subhead1 = f" Number of strata: {self.design_info['nb_strata']}"
tbl_subhead2 = f" Number of PSUs: {self.design_info['nb_psus']}"
tbl_subhead3 = f" Number of observations: {self.design_info['nb_obs']}"
tbl_subhead4 = f" Degrees of freedom: {self.design_info['degrees_of_freedom']:.2f}"
tbl_subhead4 = (
f" Degrees of freedom: {self.design_info['degrees_of_freedom']:.2f}"
)

return f"\n{tbl_head}\n{tbl_subhead1}\n{tbl_subhead2}\n{tbl_subhead3}\n{tbl_subhead4}\n\n {self.to_dataframe().to_string(index=False)}\n"

@@ -73,7 +76,9 @@ def _estimate(
fpc: Union[dict, float] = 1,
deff: bool = False,
coef_var: bool = False,
single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error,
single_psu: Union[
SinglePSUEst, dict[StringNumber, SinglePSUEst]
] = SinglePSUEst.error,
strata_comb: Optional[dict[Array, Array]] = None,
remove_nan: bool = False,
) -> tuple[TaylorEstimator, list, int]:
@@ -87,10 +92,14 @@
ssu,
)
if var.ndim == 1: # Series
to_keep = to_keep & remove_nans(var.values.ravel().shape[0], var.values.ravel())
to_keep = to_keep & remove_nans(
var.values.ravel().shape[0], var.values.ravel()
)
elif var.ndim == 2: # DataFrame
for col in var.columns:
to_keep = to_keep & remove_nans(var.values.ravel().shape[0], var[col].values.ravel())
to_keep = to_keep & remove_nans(
var.values.ravel().shape[0], var[col].values.ravel()
)
else:
raise DimensionError("The dimension must be 1 or 2.")
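
Note: this hunk builds a single boolean keep-mask, AND-ing a non-missing mask for each analysis column onto the mask from the design variables. A minimal standalone sketch of the same pattern, with a hypothetical keep_mask helper standing in for samplics' remove_nans (whose full signature is not shown in this diff):

import numpy as np
import pandas as pd

def keep_mask(values: np.ndarray) -> np.ndarray:
    # True where an entry is usable (neither None nor NaN)
    return ~pd.isna(values)

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, np.nan]})
to_keep = np.ones(df.shape[0], dtype=bool)
for col in df.columns:  # column-wise AND, as in the DataFrame branch above
    to_keep = to_keep & keep_mask(df[col].to_numpy())
df = df.loc[to_keep]  # only the first (complete) row survives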

@@ -153,7 +162,9 @@ def tabulate(
fpc: Union[dict, float] = 1,
deff: bool = False,
coef_var: bool = False,
single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error,
single_psu: Union[
SinglePSUEst, dict[StringNumber, SinglePSUEst]
] = SinglePSUEst.error,
strata_comb: Optional[dict[Array, Array]] = None,
remove_nan: bool = False,
) -> None:
@@ -179,12 +190,22 @@
vars_names = set_variables_names(vars, varnames, prefix)

if len(vars_names) != nb_vars:
raise AssertionError("Length of varnames must be the same as the number of columns of vars")
raise AssertionError(
"Length of varnames must be the same as the number of columns of vars"
)

_samp_weight = numpy_array(samp_weight)

_samp_weight = np.ones(vars_df.shape[0]) if _samp_weight.shape in ((), (0,)) else _samp_weight
_samp_weight = np.repeat(_samp_weight, vars_df.shape[0]) if _samp_weight.shape[0] == 1 else _samp_weight
_samp_weight = (
np.ones(vars_df.shape[0])
if _samp_weight.shape in ((), (0,))
else _samp_weight
)
_samp_weight = (
np.repeat(_samp_weight, vars_df.shape[0])
if _samp_weight.shape[0] == 1
else _samp_weight
)
_stratum = numpy_array(stratum)
_psu = numpy_array(psu)
_ssu = numpy_array(ssu)
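
Note: the two _samp_weight assignments above normalize the weight input; an empty input means an unweighted design, and a single value is broadcast to every observation. A standalone sketch of that convention (the helper name is illustrative, not the samplics API):

import numpy as np

def normalize_weights(samp_weight, n_rows: int) -> np.ndarray:
    w = np.asarray(samp_weight)
    if w.shape in ((), (0,)):  # nothing supplied: unweighted design
        return np.ones(n_rows)
    if w.shape[0] == 1:  # one value supplied: same weight for every row
        return np.repeat(w, n_rows)
    return w  # already one weight per row

normalize_weights([], 4)     # array([1., 1., 1., 1.])
normalize_weights([2.5], 4)  # array([2.5, 2.5, 2.5, 2.5])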
@@ -272,7 +293,9 @@ def to_dataframe(
oneway_df = pd.DataFrame([])

for var in self.vars_names:
var_df = pd.DataFrame(np.repeat(var, len(self.vars_levels[var])), columns=["variable"])
var_df = pd.DataFrame(
np.repeat(var, len(self.vars_levels[var])), columns=["variable"]
)
var_df["category"] = self.vars_levels[var]
var_df[self.param] = list(self.point_est[var].values())
var_df["stderror"] = list(self.stderror[var].values())
@@ -331,11 +354,15 @@ def __str__(self) -> str:
if self.vars_names == []:
return "No categorical variables to tabulate"
else:
tbl_head = f"Cross-tabulation of {self.vars_names[0]} and {self.vars_names[1]}"
tbl_head = (
f"Cross-tabulation of {self.vars_names[0]} and {self.vars_names[1]}"
)
tbl_subhead1 = f" Number of strata: {self.design_info['nb_strata']}"
tbl_subhead2 = f" Number of PSUs: {self.design_info['nb_psus']}"
tbl_subhead3 = f" Number of observations: {self.design_info['nb_obs']}"
tbl_subhead4 = f" Degrees of freedom: {self.design_info['degrees_of_freedom']:.2f}"
tbl_subhead4 = (
f" Degrees of freedom: {self.design_info['degrees_of_freedom']:.2f}"
)

chisq_dist = f"chi2({self.stats['Pearson-Unadj']['df']})"
f_dist = f"F({self.stats['Pearson-Adj']['df_num']:.2f}, {self.stats['Pearson-Adj']['df_den']:.2f}"
@@ -351,7 +378,9 @@ def __str__(self) -> str:
return f"\n{tbl_head}\n{tbl_subhead1}\n{tbl_subhead2}\n{tbl_subhead3}\n{tbl_subhead4}\n\n {self.to_dataframe().to_string(index=False)}\n\n{pearson_test}\n\n {lr_test}\n"

# also mutates tbl_est
def _extract_estimates(self, tbl_est, vars_levels) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
def _extract_estimates(
self, tbl_est, vars_levels
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
levels = list(tbl_est.point_est.keys())
missing_levels = vars_levels[~np.isin(vars_levels, levels)]
if missing_levels.shape[0] > 0:
@@ -367,7 +396,9 @@ def _extract_estimates(self, tbl_est, vars_levels) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
else:
for ll in missing_levels:
tbl_est.covariance[level][ll] = 0.0
tbl_est.covariance[level] = dict(sorted(tbl_est.covariance[level].items()))
tbl_est.covariance[level] = dict(
sorted(tbl_est.covariance[level].items())
)

_tbl_est_point_est = dict(sorted(tbl_est.point_est.items()))
_tbl_est_covariance = dict(sorted(tbl_est.covariance.items()))
@@ -389,7 +420,9 @@
fpc: Union[dict, float] = 1,
deff: bool = False,
coef_var: bool = False,
single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error,
single_psu: Union[
SinglePSUEst, dict[StringNumber, SinglePSUEst]
] = SinglePSUEst.error,
strata_comb: Optional[dict[Array, Array]] = None,
remove_nan: bool = False,
) -> None:
@@ -428,8 +461,16 @@

df = vars.with_columns(
samp_weight=numpy_array(samp_weight),
stratum=(np.repeat("__none__", vars.shape[0]) if stratum is None else numpy_array(stratum)),
psu=(np.linspace(1, vars.shape[0], num=vars.shape[0]) if psu is None else numpy_array(psu)),
stratum=(
np.repeat("__none__", vars.shape[0])
if stratum is None
else numpy_array(stratum)
),
psu=(
np.linspace(1, vars.shape[0], num=vars.shape[0])
if psu is None
else numpy_array(psu)
),
ssu=np.repeat(1, vars.shape[0]) if ssu is None else numpy_array(ssu),
).filter(pl.col("samp_weight") > 0)
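
Note: the with_columns call above fills in design defaults when the caller omits them: one pseudo-stratum, each observation as its own PSU, a constant SSU, and only strictly positive weights are kept. A self-contained sketch on toy data (column names as in the diff):

import numpy as np
import polars as pl

vars = pl.DataFrame({"region": ["a", "b", "c"], "status": ["u", "u", "v"]})
n = vars.shape[0]

df = vars.with_columns(
    samp_weight=pl.Series(np.array([1.0, 0.0, 2.0])),
    stratum=pl.Series(np.repeat("__none__", n)),  # single pseudo-stratum
    psu=pl.Series(np.linspace(1, n, num=n)),      # each row is its own PSU
    ssu=pl.Series(np.repeat(1, n)),
).filter(pl.col("samp_weight") > 0)  # the zero-weight row is dropped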

@@ -447,9 +488,18 @@
if remove_nan:
df = (
df.filter(
(pl.col(vars_names[0]).is_not_null() & ~pl.col(vars_names[0]).eq("NaN"))
& (pl.col(vars_names[1]).is_not_null() & ~pl.col(vars_names[1]).eq("NaN"))
& (pl.col("samp_weight").is_not_null() & pl.col("samp_weight").is_not_nan())
(
pl.col(vars_names[0]).is_not_null()
& ~pl.col(vars_names[0]).eq("NaN")
)
& (
pl.col(vars_names[1]).is_not_null()
& ~pl.col(vars_names[1]).eq("NaN")
)
& (
pl.col("samp_weight").is_not_null()
& pl.col("samp_weight").is_not_nan()
)
& (pl.col("stratum").is_not_null() & ~pl.col("stratum").eq("NaN"))
& (pl.col("psu").is_not_null() & ~pl.col("psu").eq("NaN"))
& (pl.col("ssu").is_not_null() & ~pl.col("ssu").eq("NaN"))
@@ -482,9 +532,11 @@
if len(df.shape) == 2:
vars_for_oneway = (
df.select(vars_names)
.with_columns((pl.col(vars_names[0]) + "__by__" + pl.col(vars_names[1])).alias("__cross_vars__"))[
"__cross_vars__"
]
.with_columns(
(pl.col(vars_names[0]) + "__by__" + pl.col(vars_names[1])).alias(
"__cross_vars__"
)
)["__cross_vars__"]
.to_numpy()
)
else:
@@ -530,7 +582,9 @@
raise ValueError("parameter must be 'count' or 'proportion'")

cov_est_srs = (
np.diag(cell_est) - cell_est.reshape((cell_est.shape[0], 1)) @ cell_est.reshape((1, cell_est.shape[0]))
np.diag(cell_est)
- cell_est.reshape((cell_est.shape[0], 1))
@ cell_est.reshape((1, cell_est.shape[0]))
) / df.shape[0]
# cov_est_srs = cov_est_srs * ((df.shape[0] - 1) / df.shape[0])
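
Note: cov_est_srs above is the multinomial covariance of the estimated cell proportions under simple random sampling, Cov(p_hat) = (diag(p_hat) - p_hat p_hat^T) / n. The same quantity, restated compactly:

import numpy as np

p_hat = np.array([0.2, 0.3, 0.5])  # estimated cell proportions
n = 100                            # number of observations

cov_srs = (np.diag(p_hat) - np.outer(p_hat, p_hat)) / n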

@@ -558,11 +612,34 @@

try:
x1_t = np.transpose(x1)
x2_tilde = x2 - x1 @ np.linalg.inv(x1_t @ cov_est_srs @ x1) @ (x1_t @ cov_est_srs @ x2)
delta_est = np.linalg.inv(np.transpose(x2_tilde) @ cov_est_srs @ x2_tilde) @ (
np.transpose(x2_tilde) @ cov_est @ x2_tilde
x2_tilde = x2 - x1 @ np.linalg.inv(x1_t @ cov_est_srs @ x1) @ (
x1_t @ cov_est_srs @ x2
)
delta_est = np.linalg.inv(
np.transpose(x2_tilde) @ cov_est_srs @ x2_tilde
) @ (np.transpose(x2_tilde) @ cov_est @ x2_tilde)
except np.linalg.LinAlgError:
# eps = 1e-20
# x1_t = np.transpose(x1)
# x2_tilde = x2 - x1 @ np.linalg.inv(
# x1_t @ cov_est_srs @ x1 + eps * np.eye(x1.shape[1])
# ) @ (x1_t @ cov_est_srs @ x2)
# delta_est = np.linalg.inv(
# np.transpose(x2_tilde) @ cov_est_srs @ x2_tilde
# + eps * np.eye(x2_tilde.shape[1])
# ) @ (np.transpose(x2_tilde) @ cov_est @ x2_tilde)
x1_t = np.transpose(x1)
x2_tilde = x2 - x1 @ np.linalg.pinv(
x1_t @ cov_est_srs @ x1, rcond=1e-15, hermitian=True
) @ (x1_t @ cov_est_srs @ x2)
delta_est = np.linalg.pinv(
np.transpose(x2_tilde) @ cov_est_srs @ x2_tilde,
rcond=1e-15,
hermitian=True,
) @ (np.transpose(x2_tilde) @ cov_est @ x2_tilde)

warnings.warn("Matrix is computationally singular, results may be inaccurate")
except:
delta_est = np.zeros((nrows * ncols, nrows * ncols))

keys = list(tbl_est.point_est.keys())
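
Note: the functional change in this hunk is the new fallback path. When the SRS covariance matrix is numerically singular, np.linalg.inv raises LinAlgError; the code now substitutes the Moore-Penrose pseudo-inverse and warns instead of failing. The hermitian=True flag is safe here because the matrices being inverted are symmetric. A minimal reproduction of the pattern:

import warnings

import numpy as np

m = np.array([[1.0, 2.0], [2.0, 4.0]])  # rank 1, so not invertible

try:
    m_inv = np.linalg.inv(m)
except np.linalg.LinAlgError:
    # pinv is defined for any matrix and equals inv(m) whenever m is invertible
    m_inv = np.linalg.pinv(m, rcond=1e-15, hermitian=True)
    warnings.warn("Matrix is computationally singular, results may be inaccurate")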
@@ -592,10 +669,18 @@
.drop("key")
)

poin_est_dict = tbl_df.select(vars_names + ["point_est"]).rows_by_key(key=vars_names[0])
stderror_dict = tbl_df.select(vars_names + ["stderror"]).rows_by_key(key=vars_names[0])
lower_ci_dict = tbl_df.select(vars_names + ["lower_ci"]).rows_by_key(key=vars_names[0])
upper_ci_dict = tbl_df.select(vars_names + ["upper_ci"]).rows_by_key(key=vars_names[0])
poin_est_dict = tbl_df.select(vars_names + ["point_est"]).rows_by_key(
key=vars_names[0]
)
stderror_dict = tbl_df.select(vars_names + ["stderror"]).rows_by_key(
key=vars_names[0]
)
lower_ci_dict = tbl_df.select(vars_names + ["lower_ci"]).rows_by_key(
key=vars_names[0]
)
upper_ci_dict = tbl_df.select(vars_names + ["upper_ci"]).rows_by_key(
key=vars_names[0]
)

for var1 in poin_est_dict:
point_est = {}
@@ -613,31 +698,52 @@
self.upper_ci[var1] = upper_ci

if self.param == PopParam.count:
tbl_df = tbl_df.with_columns((pl.col("point_est") / pl.col("point_est").sum()).alias("est_prop"))
tbl_df = tbl_df.with_columns(
(pl.col("point_est") / pl.col("point_est").sum()).alias("est_prop")
)
elif self.param == PopParam.prop:
tbl_df = tbl_df.with_columns(est_prop=pl.col("point_est"))
else:
raise ValueError("parameter must be 'count' or 'proportion'")

tbl_df = (
tbl_df.join(
other=tbl_df.group_by(vars_names[0]).agg(pl.col("est_prop").sum().alias("est_sum_var1")),
other=tbl_df.group_by(vars_names[0]).agg(
pl.col("est_prop").sum().alias("est_sum_var1")
),
on=vars_names[0],
how="inner",
)
.join(
other=tbl_df.group_by(vars_names[1]).agg(pl.col("est_prop").sum().alias("est_sum_var2")),
other=tbl_df.group_by(vars_names[1]).agg(
pl.col("est_prop").sum().alias("est_sum_var2")
),
on=vars_names[1],
how="inner",
)
.with_columns(est_prop_null=pl.col("est_sum_var1") * pl.col("est_sum_var2") * pl.col("est_prop").sum())
.with_columns(
est_prop_null=pl.col("est_sum_var1")
* pl.col("est_sum_var2")
* pl.col("est_prop").sum()
)
)

chisq_p = df.shape[0] * ((tbl_df["est_prop"] - tbl_df["est_prop_null"]) ** 2 / tbl_df["est_prop_null"]).sum()
chisq_p = (
df.shape[0]
* (
(tbl_df["est_prop"] - tbl_df["est_prop_null"]) ** 2
/ tbl_df["est_prop_null"]
).sum()
)
chisq_lr = (
2
* df.shape[0]
* (tbl_df["est_prop"] * (tbl_df["est_prop"] / tbl_df["est_prop_null"]).log()).fill_nan(0).sum()
* (
tbl_df["est_prop"]
* (tbl_df["est_prop"] / tbl_df["est_prop_null"]).log()
)
.fill_nan(0)
.sum()
)

trace_delta = np.trace(delta_est)
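
Note: chisq_p and chisq_lr above are the unadjusted Pearson and likelihood-ratio statistics against the independence null, where est_prop_null is the product of the row and column margins; trace(delta_est) then drives the Rao-Scott design-effect correction. The two statistics in plain numpy (illustrative numbers):

import numpy as np

n = 100
p_hat = np.array([0.15, 0.35, 0.25, 0.25])   # estimated cell proportions
p_null = np.array([0.20, 0.30, 0.20, 0.30])  # row margin x column margin

chisq_pearson = n * np.sum((p_hat - p_null) ** 2 / p_null)
chisq_lr = 2 * n * np.sum(p_hat * np.log(p_hat / p_null))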
@@ -698,19 +804,27 @@ def to_dataframe(
for _ in range(len(self.row_levels)):
for _ in range(len(self.col_levels)):
twoway_df[self.param] = sum(
pd.DataFrame.from_dict(self.point_est, orient="index").values.tolist(),
pd.DataFrame.from_dict(
self.point_est, orient="index"
).values.tolist(),
[],
)
twoway_df["stderror"] = sum(
pd.DataFrame.from_dict(self.stderror, orient="index").values.tolist(),
pd.DataFrame.from_dict(
self.stderror, orient="index"
).values.tolist(),
[],
)
twoway_df["lower_ci"] = sum(
pd.DataFrame.from_dict(self.lower_ci, orient="index").values.tolist(),
pd.DataFrame.from_dict(
self.lower_ci, orient="index"
).values.tolist(),
[],
)
twoway_df["upper_ci"] = sum(
pd.DataFrame.from_dict(self.upper_ci, orient="index").values.tolist(),
pd.DataFrame.from_dict(
self.upper_ci, orient="index"
).values.tolist(),
[],
)
# twoway_df.sort_values(by=self.vars_names, inplace=True)
src/samplics/utils/__init__.py (4 changes: 2 additions & 2 deletions)
@@ -15,7 +15,7 @@
SelectMethod,
SinglePSUEst,
SizeMethod,
ModelType
ModelType,
)


@@ -34,5 +34,5 @@
"SinglePSUError",
"ProbError",
"MethodError",
"ModelType"
"ModelType",
]
(The diff for the fifth changed file did not load on the page.)
