Skip to content

Commit 563a442

Browse files
committed
add polygenicscore to calclib
1 parent 5496494 commit 563a442

File tree

9 files changed

+843
-9
lines changed

9 files changed

+843
-9
lines changed

.github/workflows/calclib-pytest.yml

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
on:
2+
push:
3+
paths:
4+
- 'pgscatalog.calclib/**.py'
5+
pull_request:
6+
paths:
7+
- 'pgscatalog.calclib/**.py'
8+
9+
jobs:
10+
pytest-calclib:
11+
uses: ./.github/workflows/pytest.yaml
12+
with:
13+
package-directory: "pgscatalog.calclib"

pgscatalog.calclib/poetry.lock

+697-3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pgscatalog.calclib/poetry.toml

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[virtualenvs]
2+
create = true
3+
in-project = true

pgscatalog.calclib/pyproject.toml

+7
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,15 @@ packages = [
1010

1111
[tool.poetry.dependencies]
1212
python = "^3.11"
13+
"pgscatalog.corelib" = {path = "../pgscatalog.corelib", develop = true}
14+
numpy = "^1.26.4"
15+
pandas = "^2.2.0"
16+
pyarrow = "^15.0.0"
1317

1418

19+
[tool.poetry.group.dev.dependencies]
20+
pytest = "^8.0.0"
21+
1522
[build-system]
1623
requires = ["poetry-core"]
1724
build-backend = "poetry.core.masonry.api"
Original file line numberDiff line numberDiff line change
@@ -1,4 +0,0 @@
1-
from pgscatalog.calclib.testclass import TestClass
2-
3-
# be explicit about public interfaces
4-
__all__ = ["TestClass"]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
import pathlib
2+
3+
4+
class Config:
    """Holds path constants derived from this module's location on disk."""

    # Four directory levels above the resolved location of this file.
    ROOT_DIR = (
        pathlib.Path(__file__)
        .resolve()
        .parent.parent.parent.parent
    )
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
import pathlib
2+
3+
import pandas as pd
4+
import reprlib
5+
6+
7+
class PolygenicScore:
    """Represents the output of plink2 --score written to a file

    Exactly one of ``path`` (a score file to be read lazily) or ``df`` (an
    already loaded table indexed by ``sampleset`` and ``IID``) must be given.

    >>> from ._config import Config
    >>> score1 = Config.ROOT_DIR / "tests" / "cineca_22_additive_0.sscore.zst"
    >>> pgs1 = PolygenicScore(sampleset="test", path=score1)  # doctest: +ELLIPSIS
    >>> pgs1
    PolygenicScore(sampleset='test', path=PosixPath('.../cineca_22_additive_0.sscore.zst'), df=None)
    >>> pgs2 = PolygenicScore(sampleset="test", path=score1)
    >>> pgs1.read().to_dict()  # doctest: +ELLIPSIS
    {'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 0.54502, ('test', 'HG00097'): 0.674401, ('test', 'HG00099'): 0.63727, ('test', 'HG00100'): 0.863944, ...}}

    It's often helpful to combine PGS that were split per chromosome or by effect type:

    >>> aggregated_score = pgs1 + pgs2
    >>> aggregated_score  # doctest: +ELLIPSIS
    PolygenicScore(sampleset='test', path=None, df={'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 1.09004, ('test', 'HG00097'): 1.348802, ('test', 'HG00099'): 1.27454, ('test', 'HG00100'): 1.727888, ...}})

    Once a score has been fully aggregated it can be helpful to recalculate an average:

    >>> aggregated_score.average().to_dict()  # doctest: +ELLIPSIS
    {'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 1.09004, ...}, 'PGS001229_22_AVG': {('test', 'HG00096'): 0.000348...

    Scores can be written to a TSV file:

    >>> import tempfile, os
    >>> outd = tempfile.mkdtemp()
    >>> aggregated_score.write(str(outd))
    >>> os.listdir(outd)
    ['aggregated_scores.txt.gz']

    With support for splitting output files by sampleset:

    >>> splitoutd = tempfile.mkdtemp()
    >>> aggregated_score.write(splitoutd, split=True)
    >>> sorted(os.listdir(splitoutd), key = lambda x: x.split("_")[0])
    ['test_pgs.txt.gz']
    """

    def __init__(self, *, sampleset, path=None, df=None):
        """Create a score from either a file path or a loaded dataframe.

        Raises:
            ValueError: if neither or both of ``path`` and ``df`` are given.
        """
        if path is None and df is None:
            raise ValueError("init with path or df")
        if path is not None and df is not None:
            raise ValueError("choose one to init: path or df")

        self.path = path
        self.df = df
        self.sampleset = sampleset

    def __repr__(self):
        # reprlib truncates the (potentially huge) dict form of the dataframe
        df = reprlib.repr(None if self.df is None else self.df.to_dict())
        return (
            f"{type(self).__name__}(sampleset={self.sampleset!r}, "
            f"path={self.path!r}, df={df})"
        )

    def read(self):
        """Return the score dataframe, loading it from ``path`` on first use.

        When loading, the table is indexed by (sampleset, IID) and reduced to
        the columns chosen by ``_select_agg_cols``. The result is cached on
        ``self.df`` so later calls return the same object.
        """
        if self.df is None:
            df = (
                pd.read_table(self.path)
                .assign(sampleset=self.sampleset)
                .set_index(["sampleset", "#IID"])
            )

            df.index.names = ["sampleset", "IID"]
            df = df[_select_agg_cols(df.columns)]
            self.df = df
        return self.df

    def average(self):
        """Add an ``_AVG`` column for every ``_SUM`` column (sum / DENOM).

        The dataframe is loaded first if necessary. Calling this more than
        once recomputes the averages instead of appending duplicate columns.

        Returns:
            The updated dataframe, which is also stored on ``self.df``.
        """
        # read() so that instances created from a path work without an
        # explicit read() call beforehand
        df = self.read()
        sums = df.loc[:, df.columns.str.endswith("_SUM")]
        avgs = sums.divide(df["DENOM"], axis=0)
        # anchor the pattern: only the trailing _SUM suffix is renamed, not
        # any earlier "_SUM" that happens to occur inside a score name
        avgs.columns = avgs.columns.str.replace(r"_SUM$", "_AVG", regex=True)
        # discard stale averages so repeated calls stay idempotent
        df = df.drop(columns=avgs.columns, errors="ignore")
        self.df = pd.concat([df, avgs], axis=1)
        return self.df

    def write(self, outdir, split=False):
        """Write scores as gzip-compressed, tab-separated text into ``outdir``.

        Args:
            outdir: existing directory to write into.
            split: if true, write one ``<sampleset>_pgs.txt.gz`` file per
                sampleset; otherwise a single ``aggregated_scores.txt.gz``.
        """
        outdir = pathlib.Path(outdir)
        # read() so that instances created from a path can be written directly
        df = self.read()
        if split:
            for sampleset, group in df.groupby("sampleset"):
                fout = outdir / f"{sampleset}_pgs.txt.gz"
                group.to_csv(fout, sep="\t", compression="gzip")
        else:
            fout = outdir / "aggregated_scores.txt.gz"
            df.to_csv(fout, sep="\t", compression="gzip")

    def __add__(self, other):
        """Sum two scores of the same sampleset element-wise.

        Values missing from one side are treated as 0.

        Raises:
            ValueError: if the samplesets differ.
        """
        if not isinstance(other, PolygenicScore):
            return NotImplemented

        if self.sampleset != other.sampleset:
            raise ValueError("Can't add PolygenicScore with different samplesets")

        sumdf = self.read().add(other.read(), fill_value=0)
        return PolygenicScore(sampleset=self.sampleset, df=sumdf)
109+
110+
111+
def _select_agg_cols(cols):
112+
"""Select aggregatable columns"""
113+
keep_cols = ["DENOM"]
114+
return [
115+
x
116+
for x in cols
117+
if (x.endswith("_SUM") and (x != "NAMED_ALLELE_DOSAGE_SUM")) or (x in keep_cols)
118+
]

pgscatalog.calclib/src/pgscatalog/calclib/testclass.py

-2
This file was deleted.
Binary file not shown.

0 commit comments

Comments
 (0)