@@ -1,31 +1,36 @@
 import pathlib
 
 import pandas as pd
-import reprlib
 
 
 class PolygenicScore:
     """Represents the output of plink2 --score written to a file
 
     >>> from ._config import Config
     >>> score1 = Config.ROOT_DIR / "tests" / "cineca_22_additive_0.sscore.zst"
-    >>> pgs1 = PolygenicScore(sampleset="test", path=score1) # doctest: +ELLIPSIS
+    >>> pgs1 = PolygenicScore(path=score1) # doctest: +ELLIPSIS
     >>> pgs1
-    PolygenicScore(sampleset='test', path=PosixPath('.../cineca_22_additive_0.sscore.zst'), df=None)
-    >>> pgs2 = PolygenicScore(sampleset="test", path=score1)
-    >>> pgs1.read().to_dict() # doctest: +ELLIPSIS
-    {'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 0.54502, ('test', 'HG00097'): 0.674401, ('test', 'HG00099'): 0.63727, ('test', 'HG00100'): 0.863944, ...}}
+    PolygenicScore(sampleset='cineca', path=PosixPath('.../cineca_22_additive_0.sscore.zst'))
+    >>> pgs2 = PolygenicScore(path=score1)
 
     It's often helpful to combine PGS that were split per chromosome or by effect type:
 
     >>> aggregated_score = pgs1 + pgs2
     >>> aggregated_score # doctest: +ELLIPSIS
-    PolygenicScore(sampleset='test', path=None, df={'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 1.09004, ('test', 'HG00097'): 1.348802, ('test', 'HG00099'): 1.27454, ('test', 'HG00100'): 1.727888, ...}})
+    PolygenicScore(sampleset='cineca', path=None)
+
+    The backing dataframe is loaded lazily in chunks:
+
+    >>> for chunk in aggregated_score:
+    ...     chunk.to_dict()
+    ...     break
+    {'DENOM': {('cineca', 'HG00096'): 3128, ...}, 'PGS001229_22_SUM': {('cineca', 'HG00096'): 1.09004, ...}}
+
 
     Once a score has been fully aggregated it can be helpful to recalculate an average:
 
     >>> aggregated_score.average().to_dict() # doctest: +ELLIPSIS
-    {'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 1.09004, ...}, 'PGS001229_22_AVG': {('test', 'HG00096'): 0.000348...
+    {'DENOM': ...}, 'PGS001229_22_SUM': {('cineca', 'HG00096'): 1.09004, ...}, 'PGS001229_22_AVG': {('cineca', 'HG00096'): 0.000348...
 
     Scores can be written to a TSV file:
 
@@ -40,10 +45,10 @@ class PolygenicScore:
     >>> splitoutd = tempfile.mkdtemp()
     >>> aggregated_score.write(splitoutd, split=True)
     >>> sorted(os.listdir(splitoutd), key=lambda x: x.split("_")[0])
-    ['test_pgs.txt.gz']
+    ['cineca_pgs.txt.gz']
     """
 
-    def __init__(self, *, sampleset, path=None, df=None):
+    def __init__(self, *, path=None, df=None, sampleset=None):
         match (path, df):
             case (None, None):
                 raise ValueError("init with path or df")
@@ -52,62 +57,102 @@ def __init__(self, *, sampleset, path=None, df=None):
             case _:
                 pass
 
-        self.path = path
-        self.df = df
-        self.sampleset = sampleset
+        try:
+            self.path = pathlib.Path(path)
+        except TypeError:
+            self.path = None
+
+        if sampleset is None:
+            self.sampleset = self.path.stem.split("_")[0]
+        else:
+            self.sampleset = sampleset
+
+        self._chunksize = 50000
+
+        if df is not None:
+            # big df is an in-memory pandas df
+            self._bigdf = df
+        else:
+            self._bigdf = None
+        self._df = None
 
     def __repr__(self):
-        if self.df is not None:
-            df = reprlib.repr(self.df.to_dict())
+        return f"{type(self).__name__}(sampleset={repr(self.sampleset)}, path={repr(self.path)})"
+
+    def __iter__(self):
+        yield from self.df
+
+    def __add__(self, other):
+        if isinstance(other, PolygenicScore):
+            dfs = []
+            for df1, df2 in zip(self, other, strict=True):
+                sumdf = df1.add(df2, fill_value=0)
+                dfs.append(sumdf)
+            return PolygenicScore(sampleset=self.sampleset, df=pd.concat(dfs, axis=0))
         else:
-            df = reprlib.repr(None)
+            return NotImplemented
 
-        return f"{type(self).__name__}(sampleset={repr(self.sampleset)}, path={repr(self.path)}, df={df})"
+    @property
+    def df(self):
+        if self.path is not None:
+            self._df = self.lazy_read()
+        elif self._bigdf is not None:
+            # a fake generator
+            self._df = (x for x in [self._bigdf])
+        return self._df
 
-    def read(self):
-        """Read a PGS file as a pandas dataframe"""
-        if self.df is None:
-            df = (
-                pd.read_table(self.path)
-                .assign(sampleset=self.sampleset)
-                .set_index(["sampleset", "#IID"])
-            )
+    def lazy_read(self):
+        """Lazily read a PGS in chunks"""
+        if self.path is None:
+            raise ValueError("Missing path")
+
+        for chunk in pd.read_csv(self.path, chunksize=self._chunksize, sep="\t"):
+            df = chunk.assign(sampleset=self.sampleset).set_index(["sampleset", "#IID"])
 
             df.index.names = ["sampleset", "IID"]
             df = df[_select_agg_cols(df.columns)]
-            self.df = df
-        return self.df
+            yield df
 
-    def average(self):
-        avgs = self.df.loc[:, self.df.columns.str.endswith("_SUM")].divide(
-            self.df["DENOM"], axis=0
+    def read(self):
+        """Eagerly load a PGS into a pandas dataframe"""
+        if self.path is None:
+            raise ValueError("Missing path")
+
+        df = (
+            pd.read_csv(self.path, sep="\t")
+            .assign(sampleset=self.sampleset)
+            .set_index(["sampleset", "#IID"])
         )
-        avgs.columns = avgs.columns.str.replace("_SUM", "_AVG")
-        self.df = pd.concat([self.df, avgs], axis=1)
-        return self.df
+
+        df.index.names = ["sampleset", "IID"]
+        df = df[_select_agg_cols(df.columns)]
+        return df
 
     def write(self, outdir, split=False):
-        """Write a PGS to a compressed TSV"""
+        """Write PGS to a compressed TSV"""
         outdir = pathlib.Path(outdir)
-        if split:
-            for sampleset, group in self.df.groupby("sampleset"):
-                fout = outdir / f"{sampleset}_pgs.txt.gz"
-                group.to_csv(fout, sep="\t", compression="gzip")
-        else:
-            fout = outdir / "aggregated_scores.txt.gz"
-            self.df.to_csv(fout, sep="\t", compression="gzip")
+        for chunk in self:
+            if split:
+                for sampleset, group in chunk.groupby("sampleset"):
+                    fout = outdir / f"{sampleset}_pgs.txt.gz"
+                    group.to_csv(fout, sep="\t", compression="gzip", mode="a")
+            else:
+                fout = outdir / "aggregated_scores.txt.gz"
+                chunk.to_csv(fout, sep="\t", compression="gzip", mode="a")
 
-    def __add__(self, other):
-        if isinstance(other, PolygenicScore):
-            if self.sampleset != other.sampleset:
-                raise ValueError("Can't add PolygenicScore with different samplesets")
-
-            df1 = self.read()
-            df2 = other.read()
-            sumdf = df1.add(df2, fill_value=0)
-            return PolygenicScore(sampleset=self.sampleset, df=sumdf)
-        else:
-            return NotImplemented
+    def average(self):
+        """Recalculate average.
+
+        This is an eager operation, and immediately returns a dataframe
+        """
+        chunk_list = []
+        for chunk in self:
+            avgs = chunk.loc[:, chunk.columns.str.endswith("_SUM")].divide(
+                chunk["DENOM"], axis=0
+            )
+            avgs.columns = avgs.columns.str.replace("_SUM", "_AVG")
+            chunk_list.append(pd.concat([chunk, avgs], axis=1))
+        return pd.concat(chunk_list, axis=0)
 
 
 def _select_agg_cols(cols):
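
A minimal usage sketch of the reworked API, for review context only (not part of the diff). The filename mirrors the doctest fixture; how `PolygenicScore` is imported depends on the module layout, which isn't shown here:

```python
import pathlib
import tempfile

# assumption: PolygenicScore is imported from this module
score = pathlib.Path("cineca_22_additive_0.sscore.zst")

pgs1 = PolygenicScore(path=score)  # sampleset inferred from the filename: "cineca"
pgs2 = PolygenicScore(path=score)

combined = pgs1 + pgs2  # sums matching chunks pairwise, returns an in-memory score

for chunk in combined:  # iteration yields pandas DataFrame chunks
    print(chunk.head())

averaged = combined.average()  # eager: appends *_AVG columns, returns one DataFrame
combined.write(tempfile.mkdtemp(), split=True)  # one compressed TSV per sampleset
```

Note that `write()` appends chunk by chunk (`mode="a"`), so it should target a directory without leftover output from a previous run.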