@@ -280,7 +280,8 @@ def pgs_adjust(
     :param norm2_2step: boolean (default=False) whether to use the two-step model vs. the full-fit
     :param ref_train_col: column name with true/false labels of samples that should be included in training PGS methods
     :param n_pcs: number of genetic PCs that will be used for PGS-adjustment
-    :return: [results_ref:df, results_target:df, results_models: dict] adjusted dfs for reference and target populations, and a dictionary with model fit/parameters.
+    :return: [results_ref:df, results_target:df, results_models: dict] adjusted dfs for reference and target
+    populations, and a dictionary with model fit/parameters.
     """
     # Check that datasets have the correct columns
     ## Check that score is in both dfs
@@ -330,13 +331,21 @@ def pgs_adjust(
     results_ref = {}
     results_target = {}
     results_models = {}  # used to store regression information
+    scorecols_drop = set()
     for c_pgs in scorecols:
         # Makes melting easier later
         sum_col = "SUM|{}".format(c_pgs)
         results_ref[sum_col] = ref_df[c_pgs]
         results_target[sum_col] = target_df[c_pgs]
         results_models = {}
 
+        # Check that PGS has variance (e.g. not all 0)
+        if 0 in [np.var(results_ref[sum_col]), np.var(results_target[sum_col])]:
+            scorecols_drop.add(c_pgs)
+            logger.warning(
+                "Skipping adjustment: {} has 0 variance in PGS SUM".format(c_pgs)
+            )
+
         # Report PGS values with respect to distribution of PGS in the most similar reference population
         if "empirical" in use_method:
             logger.debug(
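
A minimal standalone sketch (not part of the patch, assuming only numpy and pandas) of why the zero-variance guard added above is needed: a constant score column has std(ddof=0) == 0, so the downstream Z = (x - mean) / std division would produce inf/NaN instead of usable values.

import numpy as np
import pandas as pd

pgs = pd.Series([0.0, 0.0, 0.0, 0.0])  # constant PGS, e.g. no variants matched
if np.var(pgs) == 0:                   # the condition the new guard checks
    print("Skipping adjustment: 0 variance in PGS SUM")
else:
    z = (pgs - pgs.mean()) / pgs.std(ddof=0)  # would divide by zero here
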
@@ -355,44 +364,49 @@ def pgs_adjust(
             results_ref[z_col] = pd.Series(index=ref_df.index, dtype="float64")
             results_target[z_col] = pd.Series(index=target_df.index, dtype="float64")
 
-            r_model = {}
+            if c_pgs not in scorecols_drop:
+                r_model = {}
 
-            # Adjust for each population
-            for pop in ref_populations:
-                r_pop = {}
-                i_ref_pop = ref_df[ref_pop_col] == pop
-                i_target_pop = target_df[target_pop_col] == pop
+                # Adjust for each population
+                for pop in ref_populations:
+                    r_pop = {}
+                    i_ref_pop = ref_df[ref_pop_col] == pop
+                    i_target_pop = target_df[target_pop_col] == pop
 
-                # Reference Score Distribution
-                c_pgs_pop_dist = ref_train_df.loc[
-                    ref_train_df[ref_pop_col] == pop, c_pgs
-                ]
+                    # Reference Score Distribution
+                    c_pgs_pop_dist = ref_train_df.loc[
+                        ref_train_df[ref_pop_col] == pop, c_pgs
+                    ]
 
-                # Calculate Percentile
-                results_ref[percentile_col].loc[i_ref_pop] = percentileofscore(
-                    c_pgs_pop_dist, ref_df.loc[i_ref_pop, c_pgs]
-                )
-                results_target[percentile_col].loc[i_target_pop] = percentileofscore(
-                    c_pgs_pop_dist, target_df.loc[i_target_pop, c_pgs]
-                )
-                r_pop["percentiles"] = np.percentile(c_pgs_pop_dist, range(0, 101, 1))
+                    # Calculate Percentile
+                    results_ref[percentile_col].loc[i_ref_pop] = percentileofscore(
+                        c_pgs_pop_dist, ref_df.loc[i_ref_pop, c_pgs]
+                    )
+                    results_target[percentile_col].loc[
+                        i_target_pop
+                    ] = percentileofscore(
+                        c_pgs_pop_dist, target_df.loc[i_target_pop, c_pgs]
+                    )
+                    r_pop["percentiles"] = np.percentile(
+                        c_pgs_pop_dist, range(0, 101, 1)
+                    )
 
-                # Calculate Z
-                r_pop["mean"] = c_pgs_pop_dist.mean()
-                r_pop["std"] = c_pgs_pop_dist.std(ddof=0)
+                    # Calculate Z
+                    r_pop["mean"] = c_pgs_pop_dist.mean()
+                    r_pop["std"] = c_pgs_pop_dist.std(ddof=0)
 
-                results_ref[z_col].loc[i_ref_pop] = (
-                    ref_df.loc[i_ref_pop, c_pgs] - r_pop["mean"]
-                ) / r_pop["std"]
-                results_target[z_col].loc[i_target_pop] = (
-                    target_df.loc[i_target_pop, c_pgs] - r_pop["mean"]
-                ) / r_pop["std"]
+                    results_ref[z_col].loc[i_ref_pop] = (
+                        ref_df.loc[i_ref_pop, c_pgs] - r_pop["mean"]
+                    ) / r_pop["std"]
+                    results_target[z_col].loc[i_target_pop] = (
+                        target_df.loc[i_target_pop, c_pgs] - r_pop["mean"]
+                    ) / r_pop["std"]
 
-                r_model[pop] = r_pop
+                    r_model[pop] = r_pop
 
-            results_models["dist_empirical"][c_pgs] = r_model
-            # ToDo: explore handling of individuals who have low-confidence population labels
-            # -> Possible Soln: weighted average based on probabilities? Small Mahalanobis P-values will complicate this
+                results_models["dist_empirical"][c_pgs] = r_model
+                # ToDo: explore handling of individuals who have low-confidence population labels
+                # -> Possible Soln: weighted average based on probabilities? Small Mahalanobis P-values will complicate this
     # PCA-based adjustment
     if any([x in use_method for x in ["mean", "mean+var"]]):
         logger.debug("Adjusting PGS using PCA projections")
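
A standalone sketch of the empirical adjustment above (assuming only numpy and scipy; names are illustrative): a sample's PGS is reported against the score distribution of the most similar reference population, both as an empirical percentile and as a population-standardized Z-score.

import numpy as np
from scipy.stats import percentileofscore

rng = np.random.default_rng(42)
ref_dist = rng.normal(loc=0.1, scale=0.9, size=1000)  # reference PGS for one population

sample_pgs = 1.2
pct = percentileofscore(ref_dist, sample_pgs)              # empirical percentile
z = (sample_pgs - ref_dist.mean()) / ref_dist.std(ddof=0)  # same ddof=0 convention as the patch
print(f"percentile={pct:.1f}, Z={z:+.2f}")
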
@@ -418,92 +432,111 @@ def pgs_adjust(
                 target_norm[pc_col] = (target_norm[pc_col] - pc_mean) / pc_std
 
         for c_pgs in scorecols:
-            results_models["adjust_pcs"]["PGS"][c_pgs] = {}
-            if norm_centerpgs:
-                pgs_mean = ref_train_df[c_pgs].mean()
-                ref_train_df[c_pgs] = ref_train_df[c_pgs] - pgs_mean
-                ref_norm[c_pgs] = ref_norm[c_pgs] - pgs_mean
-                target_norm[c_pgs] = target_norm[c_pgs] - pgs_mean
-                results_models["adjust_pcs"]["PGS"][c_pgs]["pgs_offset"] = pgs_mean
-
-            # Method 1 (Khera et al. Circulation (2019)): normalize mean (doi:10.1161/CIRCULATIONAHA.118.035658)
-            adj_col = "Z_norm1|{}".format(c_pgs)
-            # Fit to Reference Data
-            pcs2pgs_fit = LinearRegression().fit(
-                ref_train_df[cols_pcs], ref_train_df[c_pgs]
-            )
-            ref_train_pgs_pred = pcs2pgs_fit.predict(ref_train_df[cols_pcs])
-            ref_train_pgs_resid = ref_train_df[c_pgs] - ref_train_pgs_pred
-            ref_train_pgs_resid_mean = ref_train_pgs_resid.mean()
-            ref_train_pgs_resid_std = ref_train_pgs_resid.std(ddof=0)
-
-            ref_pgs_resid = ref_norm[c_pgs] - pcs2pgs_fit.predict(ref_norm[cols_pcs])
-            results_ref[adj_col] = ref_pgs_resid / ref_train_pgs_resid_std
-            # Apply to Target Data
-            target_pgs_pred = pcs2pgs_fit.predict(target_norm[cols_pcs])
-            target_pgs_resid = target_norm[c_pgs] - target_pgs_pred
-            results_target[adj_col] = target_pgs_resid / ref_train_pgs_resid_std
-            results_models["adjust_pcs"]["PGS"][c_pgs][
-                "Z_norm1"
-            ] = package_skl_regression(pcs2pgs_fit)
-
-            if "mean+var" in use_method:
-                # Method 2 (Khan et al. Nature Medicine (2022)): normalize variance (doi:10.1038/s41591-022-01869-1)
-                # Normalize based on residual deviation from mean of the distribution [equalize population sds]
-                # (e.g. reduce the correlation between genetic ancestry and how far away you are from the mean)
-                # USE gamma distribution for predicted variance to constrain it to be positive (b/c using linear
-                # regression we can get negative predictions for the sd)
-                adj_col = "Z_norm2|{}".format(c_pgs)
-                pcs2var_fit_gamma = GammaRegressor(max_iter=1000).fit(
-                    ref_train_df[cols_pcs],
-                    (ref_train_pgs_resid - ref_train_pgs_resid_mean) ** 2,
+            if c_pgs in scorecols_drop:
+                # fill the output with NAs
+                adj_cols = ["Z_norm1|{}".format(c_pgs)]
+                if "mean+var" in use_method:
+                    adj_cols.append("Z_norm2|{}".format(c_pgs))
+                for adj_col in adj_cols:
+                    results_ref[adj_col] = pd.Series(
+                        index=ref_df.index, dtype="float64"
+                    )  # fill na
+                    results_target[adj_col] = pd.Series(
+                        index=target_df.index, dtype="float64"
+                    )  # fill na
+            else:
+                results_models["adjust_pcs"]["PGS"][c_pgs] = {}
+                if norm_centerpgs:
+                    pgs_mean = ref_train_df[c_pgs].mean()
+                    ref_train_df[c_pgs] = ref_train_df[c_pgs] - pgs_mean
+                    ref_norm[c_pgs] = ref_norm[c_pgs] - pgs_mean
+                    target_norm[c_pgs] = target_norm[c_pgs] - pgs_mean
+                    results_models["adjust_pcs"]["PGS"][c_pgs]["pgs_offset"] = pgs_mean
+
+                # Method 1 (Khera et al. Circulation (2019)): normalize mean (doi:10.1161/CIRCULATIONAHA.118.035658)
+                adj_col = "Z_norm1|{}".format(c_pgs)
+                # Fit to Reference Data
+                pcs2pgs_fit = LinearRegression().fit(
+                    ref_train_df[cols_pcs], ref_train_df[c_pgs]
             )
-            if norm2_2step:
-                # Return 2-step adjustment
-                results_ref[adj_col] = ref_pgs_resid / np.sqrt(
-                    pcs2var_fit_gamma.predict(ref_norm[cols_pcs])
-                )
-                results_target[adj_col] = target_pgs_resid / np.sqrt(
-                    pcs2var_fit_gamma.predict(target_norm[cols_pcs])
-                )
-                results_models["adjust_pcs"]["PGS"][c_pgs][
-                    "Z_norm2"
-                ] = package_skl_regression(pcs2var_fit_gamma)
-            else:
-                # Return full-likelihood adjustment model
-                # This jointly re-fits the regression parameters from the mean and variance prediction to better
-                # fit the observed PGS distribution. It seems to mostly change the intercepts. This implementation is
-                # adapted from https://github.com/broadinstitute/palantir-workflows/blob/v0.14/ImputationPipeline/ScoringTasks.wdl,
-                # which is distributed under a BSD-3 license.
-                params_initial = np.concatenate(
-                    [
-                        [pcs2pgs_fit.intercept_],
-                        pcs2pgs_fit.coef_,
-                        [pcs2var_fit_gamma.intercept_],
-                        pcs2var_fit_gamma.coef_,
-                    ]
-                )
-                pcs2full_fit = fullLL_fit(
-                    df_score=ref_train_df,
-                    scorecol=c_pgs,
-                    predictors=cols_pcs,
-                    initial_params=params_initial,
-                )
+                ref_train_pgs_pred = pcs2pgs_fit.predict(ref_train_df[cols_pcs])
+                ref_train_pgs_resid = ref_train_df[c_pgs] - ref_train_pgs_pred
+                ref_train_pgs_resid_mean = ref_train_pgs_resid.mean()
+                ref_train_pgs_resid_std = ref_train_pgs_resid.std(ddof=0)
 
-                results_ref[adj_col] = fullLL_adjust(pcs2full_fit, ref_norm, c_pgs)
-                results_target[adj_col] = fullLL_adjust(
-                    pcs2full_fit, target_norm, c_pgs
+                ref_pgs_resid = ref_norm[c_pgs] - pcs2pgs_fit.predict(
+                    ref_norm[cols_pcs]
+                )
+                results_ref[adj_col] = ref_pgs_resid / ref_train_pgs_resid_std
+                # Apply to Target Data
+                target_pgs_pred = pcs2pgs_fit.predict(target_norm[cols_pcs])
+                target_pgs_resid = target_norm[c_pgs] - target_pgs_pred
+                results_target[adj_col] = target_pgs_resid / ref_train_pgs_resid_std
+                results_models["adjust_pcs"]["PGS"][c_pgs][
+                    "Z_norm1"
+                ] = package_skl_regression(pcs2pgs_fit)
+
+                if "mean+var" in use_method:
+                    # Method 2 (Khan et al. Nature Medicine (2022)): normalize variance (doi:10.1038/s41591-022-01869-1)
+                    # Normalize based on residual deviation from mean of the distribution [equalize population sds]
+                    # (e.g. reduce the correlation between genetic ancestry and how far away you are from the mean)
+                    # USE gamma distribution for predicted variance to constrain it to be positive (b/c using linear
+                    # regression we can get negative predictions for the sd)
+                    adj_col = "Z_norm2|{}".format(c_pgs)
+                    pcs2var_fit_gamma = GammaRegressor(max_iter=1000).fit(
+                        ref_train_df[cols_pcs],
+                        (ref_train_pgs_resid - ref_train_pgs_resid_mean) ** 2,
                 )
-            results_models["adjust_pcs"]["PGS"][c_pgs]["Z_norm2"] = pcs2full_fit
+                    if norm2_2step:
+                        # Return 2-step adjustment
+                        results_ref[adj_col] = ref_pgs_resid / np.sqrt(
+                            pcs2var_fit_gamma.predict(ref_norm[cols_pcs])
+                        )
+                        results_target[adj_col] = target_pgs_resid / np.sqrt(
+                            pcs2var_fit_gamma.predict(target_norm[cols_pcs])
+                        )
+                        results_models["adjust_pcs"]["PGS"][c_pgs][
+                            "Z_norm2"
+                        ] = package_skl_regression(pcs2var_fit_gamma)
+                    else:
+                        # Return full-likelihood adjustment model
+                        # This jointly re-fits the regression parameters from the mean and variance prediction to better
+                        # fit the observed PGS distribution. It seems to mostly change the intercepts. This implementation is
+                        # adapted from https://github.com/broadinstitute/palantir-workflows/blob/v0.14/ImputationPipeline/ScoringTasks.wdl,
+                        # which is distributed under a BSD-3 license.
+                        params_initial = np.concatenate(
+                            [
+                                [pcs2pgs_fit.intercept_],
+                                pcs2pgs_fit.coef_,
+                                [pcs2var_fit_gamma.intercept_],
+                                pcs2var_fit_gamma.coef_,
+                            ]
+                        )
+                        pcs2full_fit = fullLL_fit(
+                            df_score=ref_train_df,
+                            scorecol=c_pgs,
+                            predictors=cols_pcs,
+                            initial_params=params_initial,
+                        )
 
-            if pcs2full_fit["params"]["success"] is False:
-                logger.warning(
-                    "{} full-likelihood: {} {}".format(
-                        c_pgs,
-                        pcs2full_fit["params"]["status"],
-                        pcs2full_fit["params"]["message"],
-                    )
+                        results_ref[adj_col] = fullLL_adjust(
+                            pcs2full_fit, ref_norm, c_pgs
+                        )
+                        results_target[adj_col] = fullLL_adjust(
+                            pcs2full_fit, target_norm, c_pgs
                 )
-            results_models["adjust_pcs"]["PGS"][c_pgs]["Z_norm2"] = pcs2full_fit
+
+                        if pcs2full_fit["params"]["success"] is False:
+                            logger.warning(
+                                "{} full-likelihood: {} {}".format(
+                                    c_pgs,
+                                    pcs2full_fit["params"]["status"],
+                                    pcs2full_fit["params"]["message"],
+                                )
+                            )
+                        results_models["adjust_pcs"]["PGS"][c_pgs][
+                            "Z_norm2"
+                        ] = pcs2full_fit
     # Only return results
     logger.debug("Outputting adjusted PGS & models")
     results_ref = pd.DataFrame(results_ref)
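
A standalone sketch on synthetic data of the two PCA-based normalizations this hunk restructures (variable names are illustrative, not the package's API): Z_norm1 residualizes the PGS on PCs (Khera et al. 2019), and the two-step Z_norm2 rescales residuals by a Gamma-GLM variance prediction so the predicted variance stays positive (Khan et al. 2022).

import numpy as np
from sklearn.linear_model import GammaRegressor, LinearRegression

rng = np.random.default_rng(0)
pcs = rng.normal(size=(500, 4))           # stand-in genetic PCs
noise_sd = 1.0 + 0.3 * np.abs(pcs[:, 0])  # spread correlated with ancestry (PC1)
pgs = pcs @ np.array([0.5, -0.2, 0.1, 0.0]) + rng.normal(scale=noise_sd)

# Z_norm1: remove the PC-predicted mean, then standardize the residual
mean_fit = LinearRegression().fit(pcs, pgs)
resid = pgs - mean_fit.predict(pcs)
z_norm1 = resid / resid.std(ddof=0)

# Z_norm2 (two-step form): model squared residuals with a Gamma GLM, then rescale
var_fit = GammaRegressor(max_iter=1000).fit(pcs, resid**2)  # targets must be > 0
z_norm2 = resid / np.sqrt(var_fit.predict(pcs))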