Skip to content

Commit d700be2

Browse files
committed
check log counts
1 parent 850b297 commit d700be2

File tree

3 files changed

+97
-41
lines changed

3 files changed

+97
-41
lines changed

pgscatalog.matchlib/src/pgscatalog/matchlib/_match/label.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ def _label_flips(df: pl.LazyFrame, skip_flip: bool) -> pl.LazyFrame:
302302
)
303303
else:
304304
logger.debug("Not excluding flipped matches")
305-
return df
305+
return df.with_columns(match_IDs=pl.lit("NA"))
306306

307307

308308
def _label_filter(df: pl.LazyFrame, filter_IDs: list) -> pl.LazyFrame:

pgscatalog.matchlib/src/pgscatalog/matchlib/_match/log.py

+33-35
Original file line numberDiff line numberDiff line change
@@ -21,53 +21,53 @@ def make_summary_log(
2121
filter_summary: pl.LazyFrame,
2222
dataset: str,
2323
) -> pl.LazyFrame:
24-
"""Make an aggregated table"""
24+
"""Make an aggregated table that contains the best match candidate for each row in the original scoring file"""
2525
logger.debug("Aggregating best matches into a summary table")
26+
cols = [
27+
"dataset",
28+
"accession",
29+
"ambiguous",
30+
"is_multiallelic",
31+
"match_flipped",
32+
"duplicate_best_match",
33+
"duplicate_ID",
34+
"match_IDs",
35+
"match_status",
36+
]
37+
2638
best_matches: pl.LazyFrame = match_candidates.filter(pl.col("best_match"))
39+
2740
return (
2841
scorefile.join(best_matches, on=["row_nr", "accession"], how="outer")
29-
.select(pl.exclude("^.*_right$"))
3042
.with_columns(
31-
[
32-
pl.col("match_status").fill_null(value="unmatched"),
33-
pl.lit(dataset).alias("dataset"),
34-
]
43+
pl.col("match_status").fill_null(value="unmatched"), dataset=pl.lit(dataset)
3544
) # fill in unmatched variants
36-
.groupby(
37-
[
38-
"dataset",
39-
"accession",
40-
"ambiguous",
41-
"is_multiallelic",
42-
"match_flipped",
43-
"duplicate_best_match",
44-
"duplicate_ID",
45-
"match_IDs",
46-
"match_status",
47-
]
48-
)
49-
.agg(pl.all().len())
45+
.group_by(cols)
46+
.count()
5047
.join(filter_summary, how="left", on="accession")
5148
.pipe(_prettify_summary)
5249
)
5350

5451

55-
def check_log_count(scorefile: pl.LazyFrame, summary_log: pl.LazyFrame) -> bool:
    """Check aggregated counts in the summary log against the scoring file.

    The number of rows per accession in ``scorefile`` is compared with the
    sum of the ``count`` column per accession in ``summary_log``.

    Args:
        scorefile: Scoring file variants; must contain an ``accession`` column.
        summary_log: Summary log; must contain ``accession`` and ``count``
            columns.

    Returns:
        True if the counts agree for every accession present in both frames.

    Raises:
        ValueError: If the scoring file count and the summary log count
            differ for any accession.
    """
    summary_count: pl.LazyFrame = summary_log.group_by("accession").agg(
        pl.sum("count")
    )

    # Both sides carry a "count" column, so after the join the columns are
    # (accession, count, count_right): scoring file count first, then the
    # summary log count.
    counts: pl.DataFrame = (
        scorefile.group_by("accession")
        .count()
        .join(summary_count, on="accession")
        .collect()
    )

    # Use a distinct name for each value so the DataFrame is never shadowed
    # and the labels in the error message describe the right numbers.
    for accession, score_count, log_count in counts.iter_rows():
        if score_count != log_count:
            logger.critical("Variant log doesn't match input scoring file counts")
            raise ValueError(
                f"{accession} match failure {log_count=} doesn't match {score_count=}"
            )

    return True
7171

7272

7373
def _prettify_summary(df: pl.LazyFrame) -> pl.LazyFrame:
@@ -118,10 +118,8 @@ def _prettify_log(df: pl.LazyFrame) -> pl.LazyFrame:
118118
"match_status",
119119
"dataset",
120120
]
121-
pretty_df = (
122-
df.select(keep_cols)
123-
.select(pl.exclude("^.*_right"))
124-
.sort(["accession", "row_nr", "chr_name", "chr_position", "match_status"])
121+
pretty_df = df.select(keep_cols).sort(
122+
["accession", "row_nr", "chr_name", "chr_position", "match_status"]
125123
)
126124
return pretty_df
127125

pgscatalog.matchlib/src/pgscatalog/matchlib/matchresult.py

+63-5
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from ._plinkframe import PlinkFrames
99
from ._match.label import label_matches
1010
from ._match.filter import filter_scores
11+
from ._match.log import make_logs, check_log_count, make_summary_log
1112

1213
logger = logging.getLogger(__name__)
1314

@@ -136,6 +137,22 @@ class MatchResults(collections.abc.Sequence):
136137
>>> sum("additive" in f for f in scorefiles)
137138
19
138139
>>> assert len(scorefiles) == 21
140+
141+
An important part of matching variants is reporting a log:
142+
>>> with scorefile as score_df:
143+
... MatchResults(x).full_variant_log(score_df).fetch(2)
144+
shape: (3, 23)
145+
┌────────┬───────────┬──────────┬────────────┬───┬────────────┬───────────┬────────────┬───────────┐
146+
│ row_nr ┆ accession ┆ chr_name ┆ chr_positi ┆ … ┆ duplicate_ ┆ match_IDs ┆ match_stat ┆ dataset │
147+
│ --- ┆ --- ┆ --- ┆ on ┆ ┆ ID ┆ --- ┆ us ┆ --- │
148+
│ u64 ┆ cat ┆ cat ┆ --- ┆ ┆ --- ┆ null ┆ --- ┆ cat │
149+
│ ┆ ┆ ┆ u64 ┆ ┆ bool ┆ ┆ cat ┆ │
150+
╞════════╪═══════════╪══════════╪════════════╪═══╪════════════╪═══════════╪════════════╪═══════════╡
151+
│ null ┆ null ┆ null ┆ null ┆ … ┆ false ┆ null ┆ matched ┆ goodmatch │
152+
│ 0 ┆ PGS000002 ┆ 11 ┆ 69331418 ┆ … ┆ false ┆ null ┆ matched ┆ goodmatch │
153+
│ 77 ┆ PGS000002 ┆ 11 ┆ 69331418 ┆ … ┆ null ┆ null ┆ unmatched ┆ goodmatch │
154+
└────────┴───────────┴──────────┴────────────┴───┴────────────┴───────────┴────────────┴───────────┘
155+
139156
"""
140157

141158
def __init__(self, *elements):
@@ -149,7 +166,14 @@ def __init__(self, *elements):
149166
self.dataset = self._elements[0].dataset
150167
# a df composed of all match result elements
151168
self.df = pl.scan_ipc(x.ipc_path for x in self._elements)
169+
# a summary table containing match rates
170+
self.filter_summary = None
171+
# have match candidates in df been labelled?
152172
self._labelled = False
173+
# have match candidates in df been filtered?
174+
self._filtered = False
175+
# does the input scoring file count match the variant log count?
176+
self._log_OK = None
153177

154178
def __len__(self):
155179
return len(self._elements)
@@ -189,24 +213,58 @@ def write_scorefiles(self, directory, score_df, split=False, **kwargs):
189213
if not self._labelled:
190214
self.df = self.label(**kwargs)
191215

192-
self.df, self.score_summary = filter_scores(
216+
self.df, self.filter_summary = filter_scores(
193217
scorefile=score_df,
194218
matches=self.df,
195219
min_overlap=kwargs.get("min_overlap", 0.75),
196220
dataset=self.dataset,
197221
)
198-
self.filtered = True
222+
self._filtered = True
199223

200-
if self.score_summary.is_empty():
224+
# a filter summary contains match rates for each accession, and a column
225+
# indicating if the score passes the minimum matching threshold
226+
if self.filter_summary.is_empty():
201227
# can happen if min_overlap = 0
202228
raise ZeroMatchesError(
203229
"Error: no target variants match any variants in scoring files"
204230
)
205231

232+
# a summary log contains up to one variant (the best match) for each variant
233+
# in the scoring file
234+
self.summary_log = make_summary_log(
235+
match_candidates=self.df,
236+
dataset=self.dataset,
237+
filter_summary=self.filter_summary.lazy(),
238+
scorefile=score_df,
239+
)
240+
241+
# double check log count vs scoring file variant count
242+
self._log_OK = check_log_count(scorefile=score_df, summary_log=self.summary_log)
243+
206244
plink = PlinkFrames.from_matchresult(self.df)
207245

208246
for frame in plink:
209247
frame.write(directory=directory, split=split, dataset=self.dataset)
210248

211-
def full_variant_log(self):
212-
raise NotImplementedError
249+
def full_variant_log(self, score_df, **kwargs):
    """Build the full variant log for a scoring file.

    A variant in the original scoring file may have several match
    candidates. Match candidates are labelled and filtered first, unless
    that has already been done on this instance, and are then handed to
    ``make_logs`` together with the scoring file and the dataset name.

    ``kwargs`` are forwarded to ``self.label``; ``min_overlap`` (default
    0.75) is also read from them for filtering.
    """
    needs_labels = not self._labelled
    if needs_labels:
        self.df = self.label(**kwargs)
        self._labelled = True

    needs_filter = not self._filtered
    if needs_filter:
        overlap_threshold = kwargs.get("min_overlap", 0.75)
        filtered_matches, match_rates = filter_scores(
            scorefile=score_df,
            matches=self.df,
            min_overlap=overlap_threshold,
            dataset=self.dataset,
        )
        self.df = filtered_matches
        self.filter_summary = match_rates
        self._filtered = True

    variant_log = make_logs(
        scorefile=score_df,
        dataset=self.dataset,
        match_candidates=self.df,
    )
    return variant_log

0 commit comments

Comments
 (0)