1 | | -import collections |
| 1 | +import dataclasses |
| 2 | +import gzip |
2 | 3 | import logging
3 | 4 | import pathlib
| 5 | +import typing |
4 | 6 |
5 | 7 | import pandas as pd
6 | 8 |
7 | | -from .ancestry.tools import compare_ancestry, choose_pval_threshold, pgs_adjust |
| 9 | +from .ancestry.tools import ( |
| 10 | +    compare_ancestry, |
| 11 | +    choose_pval_threshold, |
| 12 | +    pgs_adjust, |
| 13 | +    write_model, |
| 14 | +) |
8 | 15 | from .principalcomponents import PopulationType
9 | 16 | from .ancestry import read
10 | 17 |
11 | 18 |
12 | 19 | logger = logging.getLogger(__name__)
13 | 20 |
14 | | -AdjustArguments = collections.namedtuple( |
15 | | -    typename="AdjustArguments", |
16 | | -    field_names=["method_compare", "pThreshold", "method_normalization"], |
17 | | -    defaults=("RandomForest", None, ["empirical", "mean", "mean+var"]), |
18 | | -) |
| 21 | + |
| 22 | +@dataclasses.dataclass(frozen=True) |
| 23 | +class AdjustArguments: |
| 24 | + """Arguments that control genetic similarity estimation and PGS adjustment |
| 25 | + |
| 26 | + >>> AdjustArguments(method_compare="Mahalanobis", pThreshold=None, method_normalization=("empirical", "mean")) |
| 27 | + AdjustArguments(method_compare='Mahalanobis', pThreshold=None, method_normalization=('empirical', 'mean')) |
| 28 | + """ |
| 29 | + |
| 30 | + method_compare: str = "RandomForest" |
| 31 | + pThreshold: typing.Optional[float] = None |
| 32 | + method_normalization: tuple[str, ...] = ("empirical", "mean", "mean+var") |
| 33 | + |
| 34 | + def __post_init__(self): |
| 35 | + if self.method_compare not in {"Mahalanobis", "RandomForest"}: |
| 36 | + raise ValueError(f"Bad method: {self.method_compare}") |
| 37 | + |
| 38 | + if not set(self.method_normalization) <= {"empirical", "mean", "mean+var"}: |
| 39 | + raise ValueError(f"Bad normalisation: {self.method_normalization}") |
| 40 | + |
| 41 | + |
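
Editor's sketch (not part of the committed diff): the frozen dataclass above replaces the old namedtuple and now validates its fields at construction time. A minimal illustration of that behaviour, with an assumed import path because the defining module is not named in this diff:

    # assumed import path, for illustration only
    from pgscatalog.calclib.polygenicscore import AdjustArguments

    args = AdjustArguments()  # defaults: RandomForest comparison, no p-value threshold, all normalisation methods
    try:
        AdjustArguments(method_compare="PCA")  # not in {"Mahalanobis", "RandomForest"}
    except ValueError as err:
        print(err)  # Bad method: PCA
    # frozen=True also makes instances immutable, so fields cannot be reassigned after construction
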
| 42 | +@dataclasses.dataclass(frozen=True) |
| 43 | +class AdjustResults: |
| 44 | +    """Results returned by the adjust method of an AggregatedPGS""" |
| 45 | + |
| 46 | + target_label: str |
| 47 | + models: pd.DataFrame |
| 48 | + model_meta: dict |
| 49 | + pca: pd.DataFrame |
| 50 | + pgs: pd.DataFrame |
| 51 | + scorecols: list[str] |
| 52 | + |
| 53 | + def write(self, directory): |
| 54 | + self._write_model(directory) |
| 55 | + self._write_pgs(directory) |
| 56 | + self._write_pca(directory) |
| 57 | + |
| 58 | + def _write_pca(self, directory): |
| 59 | + directory = pathlib.Path(directory) |
| 60 | + loc_popsim_out = directory / f"{self.target_label}_popsimilarity.txt.gz" |
| 61 | + logger.debug(f"Writing PCA and popsim results to: {loc_popsim_out}") |
| 62 | + self.pca.drop(self.scorecols, axis=1).to_csv(loc_popsim_out, sep="\t") |
| 63 | + |
| 64 | + def _write_model(self, directory): |
| 65 | +        """Write adjustment models and comparison metadata to a gzipped JSON file""" |
| 66 | + directory = pathlib.Path(directory) |
| 67 | + write_model( |
| 68 | + {"pgs": self.models, "compare_pcs": self.model_meta}, |
| 69 | + str(directory / f"{self.target_label}_info.json.gz"), |
| 70 | + ) |
| 71 | + |
| 72 | + def _write_pgs(self, directory): |
| 73 | + scorecols = self.scorecols |
| 74 | + directory = pathlib.Path(directory) |
| 75 | + loc_pgs_out = str(directory / f"{self.target_label}_pgs.txt.gz") |
| 76 | + with gzip.open(loc_pgs_out, "wt") as outf: |
| 77 | + logger.debug( |
| 78 | + "Writing adjusted PGS values (long format) to: {}".format(loc_pgs_out) |
| 79 | + ) |
| 80 | + for i, pgs_id in enumerate(scorecols): |
| 81 | + df_pgs = self.pgs.loc[:, self.pgs.columns.str.endswith(pgs_id)].melt( |
| 82 | + ignore_index=False |
| 83 | + ) # filter to 1 PGS |
| 84 | + df_pgs[["method", "PGS"]] = df_pgs.variable.str.split("|", expand=True) |
| 85 | + df_pgs = ( |
| 86 | + df_pgs.drop("variable", axis=1) |
| 87 | + .reset_index() |
| 88 | + .pivot( |
| 89 | + index=["sampleset", "IID", "PGS"], |
| 90 | + columns="method", |
| 91 | + values="value", |
| 92 | + ) |
| 93 | + ) |
| 94 | + if i == 0: |
| 95 | + logger.debug( |
| 96 | + "{}/{}: Writing {}".format(i + 1, len(scorecols), pgs_id) |
| 97 | + ) |
| 98 | + colorder = list(df_pgs.columns) # to ensure sort order |
| 99 | + df_pgs.to_csv(outf, sep="\t") |
| 100 | + else: |
| 101 | + logger.debug( |
| 102 | + "{}/{}: Appending {}".format(i + 1, len(scorecols), pgs_id) |
| 103 | + ) |
| 104 | + df_pgs[colorder].to_csv(outf, sep="\t", header=False) |
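
Editor's sketch (not part of the diff): _write_pgs above melts each score into long format and appends everything to a single gzipped, tab-separated file, indexed by sampleset/IID/PGS with one column per adjustment method. A minimal way to read that output back, assuming the "target" label used in the doctests below:

    import pandas as pd

    # the file name follows the f"{target_label}_pgs.txt.gz" pattern used by _write_pgs;
    # pandas infers gzip compression from the .gz suffix
    adjusted = pd.read_csv("target_pgs.txt.gz", sep="\t")
    adjusted = adjusted.set_index(["sampleset", "IID", "PGS"])
    # the remaining columns are the adjustment methods produced by pgs_adjust,
    # e.g. SUM, percentile_MostSimilarPop, Z_MostSimilarPop, Z_norm1, Z_norm2
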
19 | 105 |
20 | 106 |
21 | 107 | class AggregatedPGS:
22 | | -    """A PGS that's been aggregated and melted, and may contain multiple samplesets |
| 108 | +    """A PGS that's been aggregated and melted, and may contain a reference panel and a target set |
23 | 109 |
24 | 110 |     >>> from ._config import Config
25 | 111 |     >>> score_path = Config.ROOT_DIR / "tests" / "aggregated_scores.txt.gz"
26 | 112 |     >>> AggregatedPGS(path=score_path, target_name="hgdp")
27 | 113 |     AggregatedPGS(path=PosixPath('.../pgscatalog.calclib/tests/aggregated_scores.txt.gz'))
28 | | - |
29 | 114 |     """
30 | 115 |
31 | 116 |     def __init__(self, *, target_name, df=None, path=None):
@@ -71,19 +156,30 @@ def _check_overlap(self, ref_pc, target_pc):
71 | 156 |             raise ValueError
72 | 157 |
73 | 158 |     def adjust(self, *, ref_pc, target_pc, adjust_arguments=None):
74 | | -        """ |
| 159 | + """Adjust a PGS based on genetic ancestry similarity. |
| 160 | + |
| 161 | +        Adjusting a PGS returns AdjustResults: |
| 162 | + |
75 | 163 |         >>> from ._config import Config
76 | 164 |         >>> from .principalcomponents import PrincipalComponents
77 | 165 |         >>> related_path = Config.ROOT_DIR / "tests" / "ref.king.cutoff.id"
78 | 166 |         >>> ref_pc = PrincipalComponents(pcs_path=[Config.ROOT_DIR / "tests" / "ref.pcs"], dataset="reference", psam_path=Config.ROOT_DIR / "tests" / "ref.psam", pop_type=PopulationType.REFERENCE, related_path=related_path)
79 | 167 |         >>> target_pcs = PrincipalComponents(pcs_path=Config.ROOT_DIR / "tests" / "target.pcs", dataset="target", pop_type=PopulationType.TARGET)
80 | 168 |         >>> score_path = Config.ROOT_DIR / "tests" / "aggregated_scores.txt.gz"
81 | | -        >>> models, adjpgs = AggregatedPGS(path=score_path, target_name="hgdp").adjust(ref_pc=ref_pc, target_pc=target_pcs) |
82 | | -        >>> adjpgs.to_dict().keys() |
83 | | -        dict_keys(['SUM|PGS001229_hmPOS_GRCh38_SUM', 'SUM|PGS001229_hmPOS_GRCh38_AVG', 'percentile_MostSimilarPop|PGS001229_hmPOS_GRCh38_SUM', 'Z_MostSimilarPop|PGS001229_hmPOS_GRCh38_SUM', 'percentile_MostSimilarPop|PGS001229_hmPOS_GRCh38_AVG', 'Z_MostSimilarPop|PGS001229_hmPOS_GRCh38_AVG', 'Z_norm1|PGS001229_hmPOS_GRCh38_SUM', 'Z_norm2|PGS001229_hmPOS_GRCh38_SUM', 'Z_norm1|PGS001229_hmPOS_GRCh38_AVG', 'Z_norm2|PGS001229_hmPOS_GRCh38_AVG']) |
| 169 | +        >>> results = AggregatedPGS(path=score_path, target_name="hgdp").adjust(ref_pc=ref_pc, target_pc=target_pcs) |
| 170 | +        >>> results.pgs.to_dict().keys() |
| 171 | +        dict_keys(['SUM|PGS001229_hmPOS_GRCh38_SUM', 'SUM|PGS001229_hmPOS_GRCh38_AVG', 'percentile_MostSimilarPop|PGS001229_hmPOS_GRCh38_SUM', ... |
84 | 172 |
85 | | -        >>> models |
86 | | -        {'dist_empirical': {'PGS001229_hmPOS_GRCh38_SUM': {'EUR': {'percentiles': array([-1.04069000e+01, -7.94665080e+00, -6.71345760e+00, ... |
| 173 | +        >>> results.models |
| 174 | +        {'dist_empirical': {'PGS001229_hmPOS_GRCh38_SUM': {'EUR': {'percentiles': array([-1.04069000e+01, -7.94665080e+00, ... |
| 175 | + |
| 176 | +        Write the adjusted results to a directory: |
| 177 | + |
| 178 | +        >>> import tempfile, os |
| 179 | +        >>> dout = tempfile.mkdtemp() |
| 180 | +        >>> results.write(directory=dout) |
| 181 | +        >>> sorted(os.listdir(dout)) |
| 182 | +        ['target_info.json.gz', 'target_pgs.txt.gz', 'target_popsimilarity.txt.gz'] |
87 | 183 |         """
88 | 184 |         if adjust_arguments is None:
89 | 185 |             adjust_arguments = AdjustArguments()  # uses default values
@@ -138,12 +234,24 @@ def adjust(self, *, ref_pc, target_pc, adjust_arguments=None):
138 | 234 |             scorecols,
139 | 235 |             ref_pc.poplabel,
140 | 236 |             "MostSimilarPop",
141 | | -            use_method=adjust_arguments.method_normalization, |
| 237 | +            use_method=list(adjust_arguments.method_normalization), |
142 | 238 |             ref_train_col="Unrelated",
143 | 239 |             n_pcs=npcs_norm,
144 | 240 |         )
145 | 241 |         adjpgs = pd.concat([adjpgs_ref, adjpgs_target], axis=0)
146 | | -        return adjpgs_models, adjpgs |
| 242 | + |
| 243 | + reference_df["REFERENCE"] = True |
| 244 | + target_df["REFERENCE"] = False |
| 245 | + final_df = pd.concat([target_df, reference_df], axis=0) |
| 246 | + |
| 247 | + return AdjustResults( |
| 248 | + target_label=target_pc.dataset, |
| 249 | + models=adjpgs_models, |
| 250 | + model_meta=compare_info, |
| 251 | + pgs=adjpgs, |
| 252 | + pca=final_df, |
| 253 | + scorecols=scorecols, |
| 254 | + ) |
147 | 255 |
148 | 256 |
149 | 257 | class PolygenicScore:
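
Editor's sketch (not part of the diff): an end-to-end illustration of the new return type with non-default AdjustArguments. It reuses the ref_pc, target_pcs and score_path objects built in the adjust() doctest above; the threshold value and output directory name are hypothetical:

    import pathlib

    custom = AdjustArguments(
        method_compare="Mahalanobis",                # alternative to the RandomForest default
        pThreshold=0.95,                             # hypothetical cut-off; its handling is in code not shown in this diff
        method_normalization=("empirical", "mean"),  # any subset of the supported normalisation methods
    )
    results = AggregatedPGS(path=score_path, target_name="hgdp").adjust(
        ref_pc=ref_pc, target_pc=target_pcs, adjust_arguments=custom
    )

    outdir = pathlib.Path("adjusted_output")  # hypothetical directory; write() expects it to exist
    outdir.mkdir(exist_ok=True)
    results.write(directory=outdir)  # produces *_info.json.gz, *_pgs.txt.gz and *_popsimilarity.txt.gz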