Skip to content

Commit d386775

Browse files
committed
fix missing log information
1 parent 96a495b commit d386775

File tree

2 files changed

+71
-54
lines changed

2 files changed

+71
-54
lines changed

pgscatalog.match/src/pgscatalog/match/cli/_write.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ def write_matches(matchresults, score_df):
4747
write_log(matchresults=matchresults, score_df=score_df)
4848

4949
# returns labelled and filtered data for checking after merging
50-
return matchresults.df
50+
return matchresults.filtered_matches
5151

5252

5353
def write_log(matchresults, score_df):

pgscatalog.match/src/pgscatalog/match/lib/matchresult.py

+70-53
Original file line number | Diff line number | Diff line change
@@ -152,20 +152,20 @@ class MatchResults(collections.abc.Sequence):
152152
153153
>>> with pl.Config(tbl_formatting="ASCII_MARKDOWN", tbl_hide_column_data_types=True, tbl_width_chars=120), scorefile as score_df:
154154
... MatchResults(x).full_variant_log(score_df).fetch() # +ELLIPSIS
155-
shape: (155, 23)
155+
shape: (169, 23)
156156
| row_nr | accession | chr_name | chr_position | … | duplicate_ID | match_IDs | match_status | dataset |
157157
|--------|-----------|----------|--------------|---|--------------|-----------|--------------|-----------|
158-
| 0 | PGS000002 | 11 | 69331418 | … | null | null | unmatched | goodmatch |
158+
| 0 | PGS000002 | 11 | 69331418 | … | true | NA | excluded | goodmatch |
159159
| 1 | PGS000002 | 11 | 69379161 | … | false | NA | matched | goodmatch |
160-
| 2 | PGS000002 | 11 | 69331642 | … | null | null | unmatched | goodmatch |
160+
| 2 | PGS000002 | 11 | 69331642 | … | false | NA | excluded | goodmatch |
161+
| 2 | PGS000002 | 11 | 69331642 | … | false | NA | not_best | goodmatch |
161162
| 3 | PGS000002 | 5 | 1282319 | … | false | NA | matched | goodmatch |
162-
| 4 | PGS000002 | 5 | 1279790 | … | false | NA | matched | goodmatch |
163163
| … | … | … | … | … | … | … | … | … |
164-
| 72 | PGS000001 | 22 | 40876234 | … | false | NA | matched | goodmatch |
165164
| 73 | PGS000001 | 1 | 204518842 | … | false | NA | matched | goodmatch |
166165
| 74 | PGS000001 | 1 | 202187176 | … | false | NA | matched | goodmatch |
167166
| 75 | PGS000001 | 2 | 19320803 | … | false | NA | matched | goodmatch |
168-
| 76 | PGS000001 | 16 | 53855291 | … | null | null | unmatched | goodmatch |
167+
| 76 | PGS000001 | 16 | 53855291 | … | false | NA | excluded | goodmatch |
168+
| 76 | PGS000001 | 16 | 53855291 | … | false | NA | not_best | goodmatch |
169169
"""
170170

171171
def __init__(self, *elements):
@@ -177,19 +177,18 @@ def __init__(self, *elements):
177177
)
178178

179179
self.dataset = self._elements[0].dataset
180+
180181
# a df composed of all match result elements
181-
self.df = pl.scan_ipc(x.ipc_path for x in self._elements)
182+
self._df = pl.scan_ipc(x.ipc_path for x in self._elements)
182183
if self.df.select("row_nr").collect().is_empty():
183184
raise ZeroMatchesError("No match candidates found for any scoring files")
184185

185-
# a table containing up to one row per variant (the best possible match)
186-
self.filter_summary = None
187-
# a summary log containing match rates for variants
188-
self.summary_log = None
189-
# have match candidates in df been labelled?
190-
self._labelled = False
191-
# have match candidates in df been filtered?
192-
self._filtered = False
186+
# internal dataframes
187+
self._match_candidates = None
188+
self._filtered_matches = None
189+
self._filter_summary = None
190+
self._summary_log = None
191+
193192
# does the input scoring file count match the variant log count?
194193
self._log_OK = None
195194

@@ -202,6 +201,31 @@ def __getitem__(self, item):
202201
def __repr__(self):
203202
return f"{type(self).__name__}({self._elements!r})"
204203

204+
@property
205+
def df(self) -> pl.LazyFrame:
206+
"""A df containing raw match results"""
207+
return self._df
208+
209+
@property
210+
def match_candidates(self) -> pl.LazyFrame:
211+
"""A df containing all possible matches for each input score variant"""
212+
return self._match_candidates
213+
214+
@property
215+
def filtered_matches(self) -> pl.LazyFrame:
216+
"""A df containing up to one row per variant (the best possible match)"""
217+
return self._filtered_matches
218+
219+
@property
220+
def filter_summary(self) -> pl.DataFrame:
221+
"""A log that summarises the impact of filtering"""
222+
return self._filter_summary
223+
224+
@property
225+
def summary_log(self) -> pl.DataFrame:
226+
"""A summary log containing match rates for variants"""
227+
return self._summary_log
228+
205229
def label(
206230
self,
207231
keep_first_match=False,
@@ -220,33 +244,28 @@ def label(
220244
* ``remove_multiallelic`` remove multiallelic variants before matching (default: ``True``)
221245
* ``filter_IDs``: constrain variants to this list of IDs (default, don't constrain)
222246
"""
223-
df = self.df.pipe(
247+
return self.df.pipe(
224248
label_matches,
225249
keep_first_match=keep_first_match,
226250
remove_ambiguous=remove_ambiguous,
227251
skip_flip=skip_flip,
228252
remove_multiallelic=remove_multiallelic,
229253
filter_IDs=filter_IDs,
230254
)
231-
self._labelled = True
232-
self.df = df
233-
return self.df
234255

235256
def filter(self, score_df, min_overlap=0.75, **kwargs):
236257
"""Filter match candidates after labelling according to user parameters"""
237-
if not self._labelled:
238-
self.df = self.label(**kwargs)
258+
if self._match_candidates is None:
259+
self._match_candidates = self.label(**kwargs)
239260

240261
df, filter_summary = filter_scores(
241262
scorefile=score_df,
242-
matches=self.df,
263+
matches=self._match_candidates,
243264
min_overlap=min_overlap,
244265
dataset=self.dataset,
245266
)
246-
self.filter_summary = filter_summary
247-
self.df = df
248-
self._filtered = True
249-
return self.df
267+
268+
return filter_summary, df
250269

251270
def write_scorefiles(
252271
self, directory, score_df, split=False, min_overlap=0.75, **kwargs
@@ -263,35 +282,36 @@ def write_scorefiles(
263282
* Sets up parallel score calculation (pivots data to wide column format)
264283
* Writes scores to a directory, splitting based on chromosome and effect type
265284
"""
266-
if not self._labelled:
267-
_ = self.label(**kwargs) # self.df gets updated
285+
if self.match_candidates is None:
286+
self._match_candidates = self.label(**kwargs)
268287

269-
if not self._filtered:
270-
# score_df = original scoring file variants
271-
_ = self.filter(score_df=score_df, min_overlap=min_overlap)
288+
if self.filtered_matches is None:
289+
self._filter_summary, self._filtered_matches = self.filter(
290+
score_df=score_df, min_overlap=min_overlap
291+
)
272292

273293
# a summary log contains up to one variant (the best match) for each variant
274294
# in the scoring file
275-
self.summary_log = make_summary_log(
276-
match_candidates=self.df,
277-
dataset=self.dataset,
278-
filter_summary=self.filter_summary.lazy(),
279-
scorefile=score_df,
280-
)
295+
if self.summary_log is None:
296+
self._summary_log = make_summary_log(
297+
match_candidates=self._match_candidates,
298+
dataset=self.dataset,
299+
filter_summary=self.filter_summary.lazy(),
300+
scorefile=score_df,
301+
).collect()
281302

282303
# double check log count vs scoring file variant count
283-
self._log_OK = check_log_count(scorefile=score_df, summary_log=self.summary_log)
304+
self._log_OK = check_log_count(
305+
scorefile=score_df, summary_log=self.summary_log.lazy()
306+
)
284307

285308
# will be empty if no scores pass match threshold, so nothing gets written
286-
plink = PlinkFrames.from_matchresult(self.df)
309+
plink = PlinkFrames.from_matchresult(self._filtered_matches)
287310
outfs = []
288311
for frame in plink:
289312
f = frame.write(directory=directory, split=split, dataset=self.dataset)
290313
outfs.append(f)
291314

292-
# collect after joining in check_log_count (can't join df and lazy df)
293-
self.summary_log = self.summary_log.collect()
294-
295315
# error at the end, to allow logs to be generated
296316
for x in self.filter_summary.iter_rows():
297317
try:
@@ -318,19 +338,16 @@ def full_variant_log(self, score_df, **kwargs):
318338
Multiple match candidates may exist for each variant in the original file.
319339
Describe each variant (one variant per row) with match metadata
320340
"""
321-
if not self._labelled:
322-
self.df = self.label(**kwargs)
323-
self._labelled = True
341+
if self.match_candidates is None:
342+
self._match_candidates = self.label(**kwargs)
324343

325-
if not self._filtered:
326-
self.df, self.filter_summary = filter_scores(
327-
scorefile=score_df,
328-
matches=self.df,
329-
min_overlap=kwargs.get("min_overlap", 0.75),
330-
dataset=self.dataset,
344+
if self.filtered_matches is None:
345+
self._filter_summary, self._filtered_matches = self.filter(
346+
score_df=score_df, min_overlap=kwargs.get("min_overlap", 0.75)
331347
)
332-
self._filtered = True
333348

334349
return make_logs(
335-
scorefile=score_df, dataset=self.dataset, match_candidates=self.df
350+
scorefile=score_df,
351+
dataset=self.dataset,
352+
match_candidates=self.match_candidates,
336353
)

0 commit comments

Comments
 (0)