Commit 1fb949f

add writing adjusted scores
1 parent 24dd370

File tree

1 file changed: +125 −17

pgscatalog.calclib/src/pgscatalog/calclib/polygenicscore.py (+125 −17)

@@ -1,31 +1,116 @@
-import collections
+import dataclasses
+import gzip
 import logging
 import pathlib
+import typing
 
 import pandas as pd
 
-from .ancestry.tools import compare_ancestry, choose_pval_threshold, pgs_adjust
+from .ancestry.tools import (
+    compare_ancestry,
+    choose_pval_threshold,
+    pgs_adjust,
+    write_model,
+)
 from .principalcomponents import PopulationType
 from .ancestry import read
 
 
 logger = logging.getLogger(__name__)
 
-AdjustArguments = collections.namedtuple(
-    typename="AdjustArguments",
-    field_names=["method_compare", "pThreshold", "method_normalization"],
-    defaults=("RandomForest", None, ["empirical", "mean", "mean+var"]),
-)
+
+@dataclasses.dataclass(frozen=True)
+class AdjustArguments:
+    """Arguments that control genetic similarity estimation and PGS adjustment
+
+    >>> AdjustArguments(method_compare="Mahalanobis", pThreshold=None, method_normalization=("empirical", "mean"))
+    AdjustArguments(method_compare='Mahalanobis', pThreshold=None, method_normalization=('empirical', 'mean'))
+    """
+
+    method_compare: str = "RandomForest"
+    pThreshold: typing.Optional[float] = None
+    method_normalization: tuple[str, ...] = ("empirical", "mean", "mean+var")
+
+    def __post_init__(self):
+        if self.method_compare not in {"Mahalanobis", "RandomForest"}:
+            raise ValueError(f"Bad method: {self.method_compare}")
+
+        if not set(self.method_normalization) <= {"empirical", "mean", "mean+var"}:
+            raise ValueError(f"Bad normalisation: {self.method_normalization}")
+
+
+@dataclasses.dataclass(frozen=True)
+class AdjustResults:
+    """Results returned by the adjust method of a PolygenicScore"""
+
+    target_label: str
+    models: pd.DataFrame
+    model_meta: dict
+    pca: pd.DataFrame
+    pgs: pd.DataFrame
+    scorecols: list[str]
+
+    def write(self, directory):
+        self._write_model(directory)
+        self._write_pgs(directory)
+        self._write_pca(directory)
+
+    def _write_pca(self, directory):
+        directory = pathlib.Path(directory)
+        loc_popsim_out = directory / f"{self.target_label}_popsimilarity.txt.gz"
+        logger.debug(f"Writing PCA and popsim results to: {loc_popsim_out}")
+        self.pca.drop(self.scorecols, axis=1).to_csv(loc_popsim_out, sep="\t")
+
+    def _write_model(self, directory):
+        """Write results to a directory"""
+        directory = pathlib.Path(directory)
+        write_model(
+            {"pgs": self.models, "compare_pcs": self.model_meta},
+            str(directory / f"{self.target_label}_info.json.gz"),
+        )
+
+    def _write_pgs(self, directory):
+        scorecols = self.scorecols
+        directory = pathlib.Path(directory)
+        loc_pgs_out = str(directory / f"{self.target_label}_pgs.txt.gz")
+        with gzip.open(loc_pgs_out, "wt") as outf:
+            logger.debug(
+                "Writing adjusted PGS values (long format) to: {}".format(loc_pgs_out)
+            )
+            for i, pgs_id in enumerate(scorecols):
+                df_pgs = self.pgs.loc[:, self.pgs.columns.str.endswith(pgs_id)].melt(
+                    ignore_index=False
+                )  # filter to 1 PGS
+                df_pgs[["method", "PGS"]] = df_pgs.variable.str.split("|", expand=True)
+                df_pgs = (
+                    df_pgs.drop("variable", axis=1)
+                    .reset_index()
+                    .pivot(
+                        index=["sampleset", "IID", "PGS"],
+                        columns="method",
+                        values="value",
+                    )
+                )
+                if i == 0:
+                    logger.debug(
+                        "{}/{}: Writing {}".format(i + 1, len(scorecols), pgs_id)
+                    )
+                    colorder = list(df_pgs.columns)  # to ensure sort order
+                    df_pgs.to_csv(outf, sep="\t")
+                else:
+                    logger.debug(
+                        "{}/{}: Appending {}".format(i + 1, len(scorecols), pgs_id)
+                    )
+                    df_pgs[colorder].to_csv(outf, sep="\t", header=False)
 
 
 class AggregatedPGS:
-    """A PGS that's been aggregated and melted, and may contain multiple samplesets
+    """A PGS that's been aggregated and melted, and may contain a reference panel and a target set
 
     >>> from ._config import Config
     >>> score_path = Config.ROOT_DIR / "tests" / "aggregated_scores.txt.gz"
     >>> AggregatedPGS(path=score_path, target_name="hgdp")
     AggregatedPGS(path=PosixPath('.../pgscatalog.calclib/tests/aggregated_scores.txt.gz'))
-
     """
 
     def __init__(self, *, target_name, df=None, path=None):
@@ -71,19 +156,30 @@ def _check_overlap(self, ref_pc, target_pc):
             raise ValueError
 
     def adjust(self, *, ref_pc, target_pc, adjust_arguments=None):
-        """
+        """Adjust a PGS based on genetic ancestry similarity.
+
+        Adjusting a PGS returns AdjustResults:
+
         >>> from ._config import Config
         >>> from .principalcomponents import PrincipalComponents
         >>> related_path = Config.ROOT_DIR / "tests" / "ref.king.cutoff.id"
         >>> ref_pc = PrincipalComponents(pcs_path=[Config.ROOT_DIR / "tests" / "ref.pcs"], dataset="reference", psam_path=Config.ROOT_DIR / "tests" / "ref.psam", pop_type=PopulationType.REFERENCE, related_path=related_path)
         >>> target_pcs = PrincipalComponents(pcs_path=Config.ROOT_DIR / "tests" / "target.pcs", dataset="target", pop_type=PopulationType.TARGET)
         >>> score_path = Config.ROOT_DIR / "tests" / "aggregated_scores.txt.gz"
-        >>> models, adjpgs = AggregatedPGS(path=score_path, target_name="hgdp").adjust(ref_pc=ref_pc, target_pc=target_pcs)
-        >>> adjpgs.to_dict().keys()
-        dict_keys(['SUM|PGS001229_hmPOS_GRCh38_SUM', 'SUM|PGS001229_hmPOS_GRCh38_AVG', 'percentile_MostSimilarPop|PGS001229_hmPOS_GRCh38_SUM', 'Z_MostSimilarPop|PGS001229_hmPOS_GRCh38_SUM', 'percentile_MostSimilarPop|PGS001229_hmPOS_GRCh38_AVG', 'Z_MostSimilarPop|PGS001229_hmPOS_GRCh38_AVG', 'Z_norm1|PGS001229_hmPOS_GRCh38_SUM', 'Z_norm2|PGS001229_hmPOS_GRCh38_SUM', 'Z_norm1|PGS001229_hmPOS_GRCh38_AVG', 'Z_norm2|PGS001229_hmPOS_GRCh38_AVG'])
+        >>> results = AggregatedPGS(path=score_path, target_name="hgdp").adjust(ref_pc=ref_pc, target_pc=target_pcs)
+        >>> results.pgs.to_dict().keys()
+        dict_keys(['SUM|PGS001229_hmPOS_GRCh38_SUM', 'SUM|PGS001229_hmPOS_GRCh38_AVG', 'percentile_MostSimilarPop|PGS001229_hmPOS_GRCh38_SUM', ...
 
-        >>> models
-        {'dist_empirical': {'PGS001229_hmPOS_GRCh38_SUM': {'EUR': {'percentiles': array([-1.04069000e+01, -7.94665080e+00, -6.71345760e+00, ...
+        >>> results.models
+        {'dist_empirical': {'PGS001229_hmPOS_GRCh38_SUM': {'EUR': {'percentiles': array([-1.04069000e+01, -7.94665080e+00, ...
+
+        Write the adjusted results to a directory:
+
+        >>> import tempfile, os
+        >>> dout = tempfile.mkdtemp()
+        >>> results.write(directory=dout)
+        >>> sorted(os.listdir(dout))
+        ['target_info.json.gz', 'target_pgs.txt.gz', 'target_popsimilarity.txt.gz']
         """
         if adjust_arguments is None:
             adjust_arguments = AdjustArguments()  # uses default values
@@ -138,12 +234,24 @@ def adjust(self, *, ref_pc, target_pc, adjust_arguments=None):
             scorecols,
             ref_pc.poplabel,
             "MostSimilarPop",
-            use_method=adjust_arguments.method_normalization,
+            use_method=list(adjust_arguments.method_normalization),
             ref_train_col="Unrelated",
            n_pcs=npcs_norm,
        )
        adjpgs = pd.concat([adjpgs_ref, adjpgs_target], axis=0)
-        return adjpgs_models, adjpgs
+
+        reference_df["REFERENCE"] = True
+        target_df["REFERENCE"] = False
+        final_df = pd.concat([target_df, reference_df], axis=0)
+
+        return AdjustResults(
+            target_label=target_pc.dataset,
+            models=adjpgs_models,
+            model_meta=compare_info,
+            pgs=adjpgs,
+            pca=final_df,
+            scorecols=scorecols,
+        )
 
 
 class PolygenicScore:

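Note on the API change: adjust() previously returned a (models, adjpgs) tuple; after this commit it returns a single AdjustResults object whose write() method produces all three output files. A minimal sketch of the updated workflow, assuming fixture files like the ones in the doctests above (the paths below are illustrative placeholders, not real files):

from pgscatalog.calclib.polygenicscore import AdjustArguments, AggregatedPGS
from pgscatalog.calclib.principalcomponents import PopulationType, PrincipalComponents

# illustrative paths; substitute your own PCA and aggregated score files
ref_pc = PrincipalComponents(
    pcs_path=["ref.pcs"],
    dataset="reference",
    psam_path="ref.psam",
    pop_type=PopulationType.REFERENCE,
    related_path="ref.king.cutoff.id",
)
target_pc = PrincipalComponents(
    pcs_path="target.pcs", dataset="target", pop_type=PopulationType.TARGET
)

# defaults match the dataclass; invalid values now raise ValueError in __post_init__
args = AdjustArguments(method_compare="RandomForest")

results = AggregatedPGS(path="aggregated_scores.txt.gz", target_name="hgdp").adjust(
    ref_pc=ref_pc, target_pc=target_pc, adjust_arguments=args
)

# one call writes {dataset}_info.json.gz, {dataset}_pgs.txt.gz,
# and {dataset}_popsimilarity.txt.gz to the directory
results.write(directory="outputs")

Because AdjustResults is a frozen dataclass, the models, adjusted PGS, and PCA frames stay reachable as attributes after writing, so nothing from the old tuple return is lost.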
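The _write_pgs method reshapes the wide adjusted-score table, one "method|PGS" column per combination, into a long table keyed by sampleset, IID, and PGS. A self-contained sketch of that melt/split/pivot round trip on toy data (the sample IDs and values below are made up; only the "method|PGS" column convention comes from the diff):

import pandas as pd

# toy wide-format adjusted scores, columns named "method|PGS"
idx = pd.MultiIndex.from_tuples(
    [("test", "ID1"), ("test", "ID2")], names=["sampleset", "IID"]
)
wide = pd.DataFrame(
    {"SUM|PGS000001_SUM": [1.0, 2.0], "Z_norm1|PGS000001_SUM": [0.5, -0.5]},
    index=idx,
)

long = wide.melt(ignore_index=False)  # keeps the (sampleset, IID) index
long[["method", "PGS"]] = long.variable.str.split("|", expand=True)
tidy = (
    long.drop("variable", axis=1)
    .reset_index()
    .pivot(index=["sampleset", "IID", "PGS"], columns="method", values="value")
)
print(tidy)  # one row per (sampleset, IID, PGS); one column per method

Writing the first score with a header and appending later scores with header=False, as in the if/else branch of _write_pgs, keeps the gzipped output a single valid TSV with one consistent column order.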