Skip to content

Commit daf2d3f

Browse files
committed
add support for writing --split and --combined
1 parent 59948e7 commit daf2d3f

File tree

5 files changed

+57
-8
lines changed

5 files changed

+57
-8
lines changed

pgscatalog.matchapp/src/pgscatalog/matchapp/_write.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
It expects Config class attributes to be set up before being called
44
"""
55
import gzip
6+
import itertools
7+
8+
from pgscatalog.matchlib.plinkscorefiles import PlinkScoreFiles
69

710
from ._config import Config
811

@@ -11,11 +14,19 @@ def write_matches(matchresults, score_df):
1114
"""Write matchresults out to scoring files and logs"""
1215
match (Config.SPLIT, Config.COMBINED):
1316
case (True, True):
14-
# requires extra work: first write split, then re-combine without recomputing matches
15-
raise NotImplementedError
17+
# requires extra work: first write split
18+
outfs = matchresults.write_scorefiles(
19+
directory=Config.OUTDIR,
20+
split=True,
21+
score_df=score_df,
22+
min_overlap=Config.MIN_OVERLAP,
23+
**Config.MATCH_PARAMS,
24+
)
25+
# now re-combine without recomputing matches
26+
PlinkScoreFiles(*list(itertools.chain(*outfs))).merge(Config.OUTDIR)
1627
case (True, False) | (False, True):
1728
# split parameter can handle this case OK
18-
matchresults.write_scorefiles(
29+
_ = matchresults.write_scorefiles(
1930
directory=Config.OUTDIR,
2031
split=Config.SPLIT,
2132
score_df=score_df,

pgscatalog.matchapp/tests/test_merge_cli.py

+32
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,38 @@ def test_split_output(tmp_path_factory, good_scorefile, match_ipc):
8787
assert sum("recessive" in x for x in splitf) == 1
8888

8989

90+
def test_splitcombine_output(tmp_path_factory, good_scorefile, match_ipc):
    """Run the merge CLI with both --split and --combined and count the outputs.

    The command is expected to finish without raising, and the output
    directory should then contain the split scoring files plus the
    combined ones.
    """
    outdir = tmp_path_factory.mktemp("outdir")

    # Command line as it would be typed in a shell, one token per element.
    argv = [
        "pgscatalog-matchmerge",
        "-d",
        "test",
        "-s",
        str(good_scorefile),
        "--matches",
        str(match_ipc),
        "--outdir",
        str(outdir),
        "--min_overlap",
        str(0.75),
        "--split",
        "--combined",
    ]

    with patch("sys.argv", argv):
        run_merge()

    written = glob(str(outdir / "*scorefile.gz"))

    def count_paths_containing(label):
        # Substring match against each globbed path, as a simple tally.
        return sum(label in path for path in written)

    # Each expected total is written as "split + combined".
    assert len(written) == 21 + 3
    assert count_paths_containing("additive") == 19 + 1
    assert count_paths_containing("dominant") == 1 + 1
    assert count_paths_containing("recessive") == 1 + 1
120+
121+
90122
def test_strict_merge(tmp_path_factory, good_scorefile, match_ipc):
91123
"""Test merging with extremely strict overlap to trigger a MatchRateError"""
92124
outdir = tmp_path_factory.mktemp("outdir")

pgscatalog.matchlib/src/pgscatalog/matchlib/_plinkframe.py

+4
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def pivot_wide(self):
3737

3838
def write(self, directory, dataset, split=False):
3939
"""Write a plink2 --score compatible file, optionally splitting"""
40+
fouts = []
4041
if split:
4142
dfs = self.split_pivot()
4243
for chrom, df in dfs.items():
@@ -46,6 +47,7 @@ def write(self, directory, dataset, split=False):
4647
)
4748
with gzip.open(fout, "wb") as f:
4849
df.write_csv(f, separator="\t")
50+
fouts.append(fout)
4951
else:
5052
chrom = "ALL"
5153
fout = (
@@ -55,6 +57,8 @@ def write(self, directory, dataset, split=False):
5557
df = self.pivot_wide()
5658
with gzip.open(fout, "wb") as f:
5759
df.write_csv(f, separator="\t")
60+
fouts.append(fout)
61+
return fouts
5862

5963

6064
class PlinkFrames(collections.abc.Sequence):

pgscatalog.matchlib/src/pgscatalog/matchlib/matchresult.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,8 @@ class MatchResults(collections.abc.Sequence):
122122
123123
>>> with scorefile as score_df:
124124
... x = MatchResult.from_ipc(fout.name, dataset="goodmatch")
125-
... MatchResults(x).write_scorefiles(directory=foutdir, score_df=score_df)
126-
... MatchResults(x).write_scorefiles(directory=splitfoutdir, split=True, score_df=score_df)
125+
... _ = MatchResults(x).write_scorefiles(directory=foutdir, score_df=score_df)
126+
... _ = MatchResults(x).write_scorefiles(directory=splitfoutdir, split=True, score_df=score_df)
127127
>>> MatchResults(x) # doctest: +ELLIPSIS
128128
MatchResults([MatchResult(dataset=goodmatch, matchresult=None, ipc_path=...])
129129
@@ -275,12 +275,14 @@ def write_scorefiles(
275275
self._log_OK = check_log_count(scorefile=score_df, summary_log=self.summary_log)
276276

277277
plink = PlinkFrames.from_matchresult(self.df)
278-
278+
outfs = []
279279
for frame in plink:
280-
frame.write(directory=directory, split=split, dataset=self.dataset)
280+
f = frame.write(directory=directory, split=split, dataset=self.dataset)
281+
outfs.append(f)
281282

282283
# collect after joining in check_log_count (can't join df and lazy df)
283284
self.summary_log = self.summary_log.collect()
285+
return outfs
284286

285287
def full_variant_log(self, score_df, **kwargs):
286288
"""Generate a log for each variant in a scoring file

pgscatalog.matchlib/src/pgscatalog/matchlib/plinkscorefiles.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def merge(self, directory):
4545
>>> x = MatchResult.from_ipc(fout.name, dataset="goodmatch")
4646
>>> foutdir = tempfile.mkdtemp()
4747
>>> with scorefile as score_df:
48-
... MatchResults(x).write_scorefiles(directory=foutdir, split=True, score_df=score_df) # doctest: +ELLIPSIS
48+
... _ = MatchResults(x).write_scorefiles(directory=foutdir, split=True, score_df=score_df) # doctest: +ELLIPSIS
4949
>>> plink_files = (pathlib.Path(foutdir) / x for x in os.listdir(foutdir))
5050
>>> psf = PlinkScoreFiles(*plink_files)
5151
>>> psf # doctest: +ELLIPSIS

0 commit comments

Comments
 (0)