Commit 990102c

check for zero variance
1 parent d798687 commit 990102c

File tree

1 file changed: +146 -113 lines
  • pgscatalog.calc/src/pgscatalog/calc/lib/_ancestry


pgscatalog.calc/src/pgscatalog/calc/lib/_ancestry/tools.py

```diff
@@ -280,7 +280,8 @@ def pgs_adjust(
     :param norm2_2step: boolean (default=False) whether to use the two-step model vs. the full-fit
     :param ref_train_col: column name with true/false labels of samples that should be included in training PGS methods
     :param n_pcs: number of genetic PCs that will be used for PGS-adjustment
-    :return: [results_ref:df , results_target:df , results_models: dict] adjusted dfs for reference and target populations, and a dictionary with model fit/parameters.
+    :return: [results_ref:df , results_target:df , results_models: dict] adjusted dfs for reference and target
+        populations, and a dictionary with model fit/parameters.
     """
     # Check that datasets have the correct columns
     ## Check that score is in both dfs
```
```diff
@@ -330,13 +331,21 @@ def pgs_adjust(
     results_ref = {}
     results_target = {}
     results_models = {}  # used to store regression information
+    scorecols_drop = set()
     for c_pgs in scorecols:
         # Makes melting easier later
         sum_col = "SUM|{}".format(c_pgs)
         results_ref[sum_col] = ref_df[c_pgs]
         results_target[sum_col] = target_df[c_pgs]
         results_models = {}

+        # Check that PGS has variance (e.g. not all 0)
+        if 0 in [np.var(results_ref[sum_col]), np.var(results_target[sum_col])]:
+            scorecols_drop.add(c_pgs)
+            logger.warning(
+                "Skipping adjustment: {} has 0 variance in PGS SUM".format(c_pgs)
+            )
+
     # Report PGS values with respect to distribution of PGS in the most similar reference population
     if "empirical" in use_method:
         logger.debug(
```
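The guard works because `np.var` of a constant vector is exactly `0.0`, and `0 == 0.0` in Python, so the membership test catches a constant score in either the reference or the target data. A minimal sketch of the same check in isolation (the accession and values here are invented for illustration):

```python
import logging

import numpy as np
import pandas as pd

logging.basicConfig()
logger = logging.getLogger(__name__)

c_pgs = "PGS000001"  # hypothetical score accession
ref_sum = pd.Series([0.0, 0.0, 0.0])  # constant column: zero variance
target_sum = pd.Series([0.12, 0.34, 0.56])

scorecols_drop = set()
# np.var of a constant series is exactly 0.0, so the membership test
# flags the score if either dataset has no spread
if 0 in [np.var(ref_sum), np.var(target_sum)]:
    scorecols_drop.add(c_pgs)
    logger.warning("Skipping adjustment: {} has 0 variance in PGS SUM".format(c_pgs))

print(scorecols_drop)  # {'PGS000001'}
```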
```diff
@@ -355,44 +364,49 @@ def pgs_adjust(
             results_ref[z_col] = pd.Series(index=ref_df.index, dtype="float64")
             results_target[z_col] = pd.Series(index=target_df.index, dtype="float64")

-            r_model = {}
+            if c_pgs not in scorecols_drop:
+                r_model = {}

-            # Adjust for each population
-            for pop in ref_populations:
-                r_pop = {}
-                i_ref_pop = ref_df[ref_pop_col] == pop
-                i_target_pop = target_df[target_pop_col] == pop
+                # Adjust for each population
+                for pop in ref_populations:
+                    r_pop = {}
+                    i_ref_pop = ref_df[ref_pop_col] == pop
+                    i_target_pop = target_df[target_pop_col] == pop

-                # Reference Score Distribution
-                c_pgs_pop_dist = ref_train_df.loc[
-                    ref_train_df[ref_pop_col] == pop, c_pgs
-                ]
+                    # Reference Score Distribution
+                    c_pgs_pop_dist = ref_train_df.loc[
+                        ref_train_df[ref_pop_col] == pop, c_pgs
+                    ]

-                # Calculate Percentile
-                results_ref[percentile_col].loc[i_ref_pop] = percentileofscore(
-                    c_pgs_pop_dist, ref_df.loc[i_ref_pop, c_pgs]
-                )
-                results_target[percentile_col].loc[i_target_pop] = percentileofscore(
-                    c_pgs_pop_dist, target_df.loc[i_target_pop, c_pgs]
-                )
-                r_pop["percentiles"] = np.percentile(c_pgs_pop_dist, range(0, 101, 1))
+                    # Calculate Percentile
+                    results_ref[percentile_col].loc[i_ref_pop] = percentileofscore(
+                        c_pgs_pop_dist, ref_df.loc[i_ref_pop, c_pgs]
+                    )
+                    results_target[percentile_col].loc[
+                        i_target_pop
+                    ] = percentileofscore(
+                        c_pgs_pop_dist, target_df.loc[i_target_pop, c_pgs]
+                    )
+                    r_pop["percentiles"] = np.percentile(
+                        c_pgs_pop_dist, range(0, 101, 1)
+                    )

-                # Calculate Z
-                r_pop["mean"] = c_pgs_pop_dist.mean()
-                r_pop["std"] = c_pgs_pop_dist.std(ddof=0)
+                    # Calculate Z
+                    r_pop["mean"] = c_pgs_pop_dist.mean()
+                    r_pop["std"] = c_pgs_pop_dist.std(ddof=0)

-                results_ref[z_col].loc[i_ref_pop] = (
-                    ref_df.loc[i_ref_pop, c_pgs] - r_pop["mean"]
-                ) / r_pop["std"]
-                results_target[z_col].loc[i_target_pop] = (
-                    target_df.loc[i_target_pop, c_pgs] - r_pop["mean"]
-                ) / r_pop["std"]
+                    results_ref[z_col].loc[i_ref_pop] = (
+                        ref_df.loc[i_ref_pop, c_pgs] - r_pop["mean"]
+                    ) / r_pop["std"]
+                    results_target[z_col].loc[i_target_pop] = (
+                        target_df.loc[i_target_pop, c_pgs] - r_pop["mean"]
+                    ) / r_pop["std"]

-                r_model[pop] = r_pop
+                    r_model[pop] = r_pop

-            results_models["dist_empirical"][c_pgs] = r_model
-            # ToDo: explore handling of individuals who have low-confidence population labels
-            # -> Possible Soln: weighted average based on probabilities? Small Mahalanobis P-values will complicate this
+                results_models["dist_empirical"][c_pgs] = r_model
+                # ToDo: explore handling of individuals who have low-confidence population labels
+                # -> Possible Soln: weighted average based on probabilities? Small Mahalanobis P-values will complicate this
     # PCA-based adjustment
     if any([x in use_method for x in ["mean", "mean+var"]]):
         logger.debug("Adjusting PGS using PCA projections")
```
```diff
@@ -418,92 +432,111 @@ def pgs_adjust(
             target_norm[pc_col] = (target_norm[pc_col] - pc_mean) / pc_std

         for c_pgs in scorecols:
-            results_models["adjust_pcs"]["PGS"][c_pgs] = {}
-            if norm_centerpgs:
-                pgs_mean = ref_train_df[c_pgs].mean()
-                ref_train_df[c_pgs] = ref_train_df[c_pgs] - pgs_mean
-                ref_norm[c_pgs] = ref_norm[c_pgs] - pgs_mean
-                target_norm[c_pgs] = target_norm[c_pgs] - pgs_mean
-                results_models["adjust_pcs"]["PGS"][c_pgs]["pgs_offset"] = pgs_mean
-
-            # Method 1 (Khera et al. Circulation (2019): normalize mean (doi:10.1161/CIRCULATIONAHA.118.035658)
-            adj_col = "Z_norm1|{}".format(c_pgs)
-            # Fit to Reference Data
-            pcs2pgs_fit = LinearRegression().fit(
-                ref_train_df[cols_pcs], ref_train_df[c_pgs]
-            )
-            ref_train_pgs_pred = pcs2pgs_fit.predict(ref_train_df[cols_pcs])
-            ref_train_pgs_resid = ref_train_df[c_pgs] - ref_train_pgs_pred
-            ref_train_pgs_resid_mean = ref_train_pgs_resid.mean()
-            ref_train_pgs_resid_std = ref_train_pgs_resid.std(ddof=0)
-
-            ref_pgs_resid = ref_norm[c_pgs] - pcs2pgs_fit.predict(ref_norm[cols_pcs])
-            results_ref[adj_col] = ref_pgs_resid / ref_train_pgs_resid_std
-            # Apply to Target Data
-            target_pgs_pred = pcs2pgs_fit.predict(target_norm[cols_pcs])
-            target_pgs_resid = target_norm[c_pgs] - target_pgs_pred
-            results_target[adj_col] = target_pgs_resid / ref_train_pgs_resid_std
-            results_models["adjust_pcs"]["PGS"][c_pgs][
-                "Z_norm1"
-            ] = package_skl_regression(pcs2pgs_fit)
-
-            if "mean+var" in use_method:
-                # Method 2 (Khan et al. Nature Medicine (2022)): normalize variance (doi:10.1038/s41591-022-01869-1)
-                # Normalize based on residual deviation from mean of the distribution [equalize population sds]
-                # (e.g. reduce the correlation between genetic ancestry and how far away you are from the mean)
-                # USE gamma distribution for predicted variance to constrain it to be positive (b/c using linear
-                # regression we can get negative predictions for the sd)
-                adj_col = "Z_norm2|{}".format(c_pgs)
-                pcs2var_fit_gamma = GammaRegressor(max_iter=1000).fit(
-                    ref_train_df[cols_pcs],
-                    (ref_train_pgs_resid - ref_train_pgs_resid_mean) ** 2,
+            if c_pgs in scorecols_drop:
+                # fill the output with NAs
+                adj_cols = ["Z_norm1|{}".format(c_pgs)]
+                if "mean+var" in use_method:
+                    adj_cols.append("Z_norm2|{}".format(c_pgs))
+                for adj_col in adj_cols:
+                    results_ref[adj_col] = pd.Series(
+                        index=ref_df.index, dtype="float64"
+                    )  # fill na
+                    results_target[adj_col] = pd.Series(
+                        index=target_df.index, dtype="float64"
+                    )  # fill na
+            else:
+                results_models["adjust_pcs"]["PGS"][c_pgs] = {}
+                if norm_centerpgs:
+                    pgs_mean = ref_train_df[c_pgs].mean()
+                    ref_train_df[c_pgs] = ref_train_df[c_pgs] - pgs_mean
+                    ref_norm[c_pgs] = ref_norm[c_pgs] - pgs_mean
+                    target_norm[c_pgs] = target_norm[c_pgs] - pgs_mean
+                    results_models["adjust_pcs"]["PGS"][c_pgs]["pgs_offset"] = pgs_mean
+
+                # Method 1 (Khera et al. Circulation (2019): normalize mean (doi:10.1161/CIRCULATIONAHA.118.035658)
+                adj_col = "Z_norm1|{}".format(c_pgs)
+                # Fit to Reference Data
+                pcs2pgs_fit = LinearRegression().fit(
+                    ref_train_df[cols_pcs], ref_train_df[c_pgs]
                 )
-                if norm2_2step:
-                    # Return 2-step adjustment
-                    results_ref[adj_col] = ref_pgs_resid / np.sqrt(
-                        pcs2var_fit_gamma.predict(ref_norm[cols_pcs])
-                    )
-                    results_target[adj_col] = target_pgs_resid / np.sqrt(
-                        pcs2var_fit_gamma.predict(target_norm[cols_pcs])
-                    )
-                    results_models["adjust_pcs"]["PGS"][c_pgs][
-                        "Z_norm2"
-                    ] = package_skl_regression(pcs2var_fit_gamma)
-                else:
-                    # Return full-likelihood adjustment model
-                    # This jointly re-fits the regression parameters from the mean and variance prediction to better
-                    # fit the observed PGS distribution. It seems to mostly change the intercepts. This implementation is
-                    # adapted from https://github.com/broadinstitute/palantir-workflows/blob/v0.14/ImputationPipeline/ScoringTasks.wdl,
-                    # which is distributed under a BDS-3 license.
-                    params_initial = np.concatenate(
-                        [
-                            [pcs2pgs_fit.intercept_],
-                            pcs2pgs_fit.coef_,
-                            [pcs2var_fit_gamma.intercept_],
-                            pcs2var_fit_gamma.coef_,
-                        ]
-                    )
-                    pcs2full_fit = fullLL_fit(
-                        df_score=ref_train_df,
-                        scorecol=c_pgs,
-                        predictors=cols_pcs,
-                        initial_params=params_initial,
-                    )
+                ref_train_pgs_pred = pcs2pgs_fit.predict(ref_train_df[cols_pcs])
+                ref_train_pgs_resid = ref_train_df[c_pgs] - ref_train_pgs_pred
+                ref_train_pgs_resid_mean = ref_train_pgs_resid.mean()
+                ref_train_pgs_resid_std = ref_train_pgs_resid.std(ddof=0)

-                    results_ref[adj_col] = fullLL_adjust(pcs2full_fit, ref_norm, c_pgs)
-                    results_target[adj_col] = fullLL_adjust(
-                        pcs2full_fit, target_norm, c_pgs
+                ref_pgs_resid = ref_norm[c_pgs] - pcs2pgs_fit.predict(
+                    ref_norm[cols_pcs]
+                )
+                results_ref[adj_col] = ref_pgs_resid / ref_train_pgs_resid_std
+                # Apply to Target Data
+                target_pgs_pred = pcs2pgs_fit.predict(target_norm[cols_pcs])
+                target_pgs_resid = target_norm[c_pgs] - target_pgs_pred
+                results_target[adj_col] = target_pgs_resid / ref_train_pgs_resid_std
+                results_models["adjust_pcs"]["PGS"][c_pgs][
+                    "Z_norm1"
+                ] = package_skl_regression(pcs2pgs_fit)
+
+                if "mean+var" in use_method:
+                    # Method 2 (Khan et al. Nature Medicine (2022)): normalize variance (doi:10.1038/s41591-022-01869-1)
+                    # Normalize based on residual deviation from mean of the distribution [equalize population sds]
+                    # (e.g. reduce the correlation between genetic ancestry and how far away you are from the mean)
+                    # USE gamma distribution for predicted variance to constrain it to be positive (b/c using linear
+                    # regression we can get negative predictions for the sd)
+                    adj_col = "Z_norm2|{}".format(c_pgs)
+                    pcs2var_fit_gamma = GammaRegressor(max_iter=1000).fit(
+                        ref_train_df[cols_pcs],
+                        (ref_train_pgs_resid - ref_train_pgs_resid_mean) ** 2,
                     )
+                    if norm2_2step:
+                        # Return 2-step adjustment
+                        results_ref[adj_col] = ref_pgs_resid / np.sqrt(
+                            pcs2var_fit_gamma.predict(ref_norm[cols_pcs])
+                        )
+                        results_target[adj_col] = target_pgs_resid / np.sqrt(
+                            pcs2var_fit_gamma.predict(target_norm[cols_pcs])
+                        )
+                        results_models["adjust_pcs"]["PGS"][c_pgs][
+                            "Z_norm2"
+                        ] = package_skl_regression(pcs2var_fit_gamma)
+                    else:
+                        # Return full-likelihood adjustment model
+                        # This jointly re-fits the regression parameters from the mean and variance prediction to better
+                        # fit the observed PGS distribution. It seems to mostly change the intercepts. This implementation is
+                        # adapted from https://github.com/broadinstitute/palantir-workflows/blob/v0.14/ImputationPipeline/ScoringTasks.wdl,
+                        # which is distributed under a BDS-3 license.
+                        params_initial = np.concatenate(
+                            [
+                                [pcs2pgs_fit.intercept_],
+                                pcs2pgs_fit.coef_,
+                                [pcs2var_fit_gamma.intercept_],
+                                pcs2var_fit_gamma.coef_,
+                            ]
+                        )
+                        pcs2full_fit = fullLL_fit(
+                            df_score=ref_train_df,
+                            scorecol=c_pgs,
+                            predictors=cols_pcs,
+                            initial_params=params_initial,
+                        )

-                    if pcs2full_fit["params"]["success"] is False:
-                        logger.warning(
-                            "{} full-likelihood: {} {}".format(
-                                c_pgs,
-                                pcs2full_fit["params"]["status"],
-                                pcs2full_fit["params"]["message"],
-                            )
+                        results_ref[adj_col] = fullLL_adjust(
+                            pcs2full_fit, ref_norm, c_pgs
+                        )
+                        results_target[adj_col] = fullLL_adjust(
+                            pcs2full_fit, target_norm, c_pgs
                         )
-                    results_models["adjust_pcs"]["PGS"][c_pgs]["Z_norm2"] = pcs2full_fit
+
+                        if pcs2full_fit["params"]["success"] is False:
+                            logger.warning(
+                                "{} full-likelihood: {} {}".format(
+                                    c_pgs,
+                                    pcs2full_fit["params"]["status"],
+                                    pcs2full_fit["params"]["message"],
+                                )
+                            )
+                        results_models["adjust_pcs"]["PGS"][c_pgs][
+                            "Z_norm2"
+                        ] = pcs2full_fit
     # Only return results
     logger.debug("Outputting adjusted PGS & models")
     results_ref = pd.DataFrame(results_ref)
```
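The `else:` branch re-indents the existing PCA-based machinery rather than changing it: Method 1 residualizes the PGS on the reference PCs with `LinearRegression` and scales by the training-residual SD, and Method 2 fits a `GammaRegressor` to the squared residuals so the predicted variance stays positive, then scales by the predicted SD. A condensed, self-contained sketch of both steps on synthetic data (the drift model and sample size are invented; variable names mirror the diff):

```python
import numpy as np
from sklearn.linear_model import GammaRegressor, LinearRegression

rng = np.random.default_rng(0)

# Synthetic stand-ins for the reference training data: 4 PCs, and a PGS
# whose mean and spread both drift along PC1 (ancestry-correlated score)
pcs = rng.normal(size=(2000, 4))
pgs = 0.8 * pcs[:, 0] + rng.normal(scale=1.0 + 0.3 * np.abs(pcs[:, 0]))

# Method 1 (Khera et al. 2019): residualize the PGS on the PCs and scale
# by the SD of the training residuals
pcs2pgs_fit = LinearRegression().fit(pcs, pgs)
resid = pgs - pcs2pgs_fit.predict(pcs)
z_norm1 = resid / resid.std(ddof=0)

# Method 2 (Khan et al. 2022): model the squared residuals as a function
# of the PCs with a gamma GLM (log link keeps predicted variance positive),
# then scale each residual by its predicted SD
pcs2var_fit_gamma = GammaRegressor(max_iter=1000).fit(pcs, (resid - resid.mean()) ** 2)
z_norm2 = resid / np.sqrt(pcs2var_fit_gamma.predict(pcs))

print(z_norm1.std(ddof=0), z_norm2.std(ddof=0))
```

In the full-fit path (`norm2_2step=False`), `params_initial` concatenates the intercepts and coefficients of the two fits above to seed `fullLL_fit`, which jointly re-optimizes the mean and variance parameters against the observed PGS distribution. A sketch of that kind of joint normal-likelihood fit is below; it assumes a linear model for both the mean and the variance and uses `scipy.optimize.minimize`, so it illustrates the idea rather than `fullLL_fit`'s exact parameterization:

```python
import numpy as np
from scipy.optimize import minimize


def normal_nll(params, pcs, pgs):
    """Negative log-likelihood of PGS_i ~ Normal(mu_i, sqrt(var_i)), where
    mu and var are each linear in the PCs; params packs
    [mean intercept, mean coefs..., var intercept, var coefs...]."""
    n = pcs.shape[1]
    mu = params[0] + pcs @ params[1 : n + 1]
    var = params[n + 1] + pcs @ params[n + 2 :]
    if np.any(var <= 0):  # reject non-positive variance predictions
        return np.inf
    return 0.5 * np.sum(np.log(2 * np.pi * var) + (pgs - mu) ** 2 / var)


rng = np.random.default_rng(1)
pcs = rng.normal(size=(1000, 2))
pgs = 0.5 * pcs[:, 0] + rng.normal(scale=np.sqrt(1.0 + 0.2 * pcs[:, 0] ** 2))

# pgs_adjust seeds the optimizer with the LinearRegression/GammaRegressor
# fits; a neutral starting point is used here instead
start = np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0])
fit = minimize(normal_nll, start, args=(pcs, pgs), method="Nelder-Mead")
print(fit.success, fit.x.round(2))
```

Seeding from the two single-stage fits matters in practice: the joint likelihood surface is not convex, and a reasonable starting point keeps the optimizer near the regime where both regressions already describe the data, which matches the diff's comment that the re-fit mostly moves the intercepts.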
