@@ -152,20 +152,20 @@ class MatchResults(collections.abc.Sequence):
152
152
153
153
>>> with pl.Config(tbl_formatting="ASCII_MARKDOWN", tbl_hide_column_data_types=True, tbl_width_chars=120), scorefile as score_df:
154
154
... MatchResults(x).full_variant_log(score_df).fetch() # +ELLIPSIS
155
- shape: (155 , 23)
155
+ shape: (169 , 23)
156
156
| row_nr | accession | chr_name | chr_position | … | duplicate_ID | match_IDs | match_status | dataset |
157
157
|--------|-----------|----------|--------------|---|--------------|-----------|--------------|-----------|
158
- | 0 | PGS000002 | 11 | 69331418 | … | null | null | unmatched | goodmatch |
158
+ | 0 | PGS000002 | 11 | 69331418 | … | true | NA | excluded | goodmatch |
159
159
| 1 | PGS000002 | 11 | 69379161 | … | false | NA | matched | goodmatch |
160
- | 2 | PGS000002 | 11 | 69331642 | … | null | null | unmatched | goodmatch |
160
+ | 2 | PGS000002 | 11 | 69331642 | … | false | NA | excluded | goodmatch |
161
+ | 2 | PGS000002 | 11 | 69331642 | … | false | NA | not_best | goodmatch |
161
162
| 3 | PGS000002 | 5 | 1282319 | … | false | NA | matched | goodmatch |
162
- | 4 | PGS000002 | 5 | 1279790 | … | false | NA | matched | goodmatch |
163
163
| … | … | … | … | … | … | … | … | … |
164
- | 72 | PGS000001 | 22 | 40876234 | … | false | NA | matched | goodmatch |
165
164
| 73 | PGS000001 | 1 | 204518842 | … | false | NA | matched | goodmatch |
166
165
| 74 | PGS000001 | 1 | 202187176 | … | false | NA | matched | goodmatch |
167
166
| 75 | PGS000001 | 2 | 19320803 | … | false | NA | matched | goodmatch |
168
- | 76 | PGS000001 | 16 | 53855291 | … | null | null | unmatched | goodmatch |
167
+ | 76 | PGS000001 | 16 | 53855291 | … | false | NA | excluded | goodmatch |
168
+ | 76 | PGS000001 | 16 | 53855291 | … | false | NA | not_best | goodmatch |
169
169
"""
170
170
171
171
def __init__ (self , * elements ):
@@ -177,19 +177,18 @@ def __init__(self, *elements):
177
177
)
178
178
179
179
self .dataset = self ._elements [0 ].dataset
180
+
180
181
# a df composed of all match result elements
181
- self .df = pl .scan_ipc (x .ipc_path for x in self ._elements )
182
+ self ._df = pl .scan_ipc (x .ipc_path for x in self ._elements )
182
183
if self .df .select ("row_nr" ).collect ().is_empty ():
183
184
raise ZeroMatchesError ("No match candidates found for any scoring files" )
184
185
185
- # a table containing up to one row per variant (the best possible match)
186
- self .filter_summary = None
187
- # a summary log containing match rates for variants
188
- self .summary_log = None
189
- # have match candidates in df been labelled?
190
- self ._labelled = False
191
- # have match candidates in df been filtered?
192
- self ._filtered = False
186
+ # internal dataframes
187
+ self ._match_candidates = None
188
+ self ._filtered_matches = None
189
+ self ._filter_summary = None
190
+ self ._summary_log = None
191
+
193
192
# does the input scoring file count match the variant log count?
194
193
self ._log_OK = None
195
194
@@ -202,6 +201,31 @@ def __getitem__(self, item):
202
201
def __repr__ (self ):
203
202
return f"{ type (self ).__name__ } ({ self ._elements !r} )"
204
203
204
+ @property
205
+ def df (self ) -> pl .LazyFrame :
206
+ """A df containing raw match results"""
207
+ return self ._df
208
+
209
+ @property
210
+ def match_candidates (self ) -> pl .LazyFrame :
211
+ """A df containing all possible matches for each input score variant"""
212
+ return self ._match_candidates
213
+
214
+ @property
215
+ def filtered_matches (self ) -> pl .LazyFrame :
216
+ """A df containing up to one row per variant (the best possible match)"""
217
+ return self ._filtered_matches
218
+
219
+ @property
220
+ def filter_summary (self ) -> pl .DataFrame :
221
+ """A log that summarises the impact of filtering"""
222
+ return self ._filter_summary
223
+
224
+ @property
225
+ def summary_log (self ) -> pl .DataFrame :
226
+ """A summary log containing match rates for variants"""
227
+ return self ._summary_log
228
+
205
229
def label (
206
230
self ,
207
231
keep_first_match = False ,
@@ -220,33 +244,28 @@ def label(
220
244
* ``remove_multiallelic`` remove multiallelic variants before matching (default: ``True``)
221
245
* ``filter_IDs``: constrain variants to this list of IDs (default, don't constrain)
222
246
"""
223
- df = self .df .pipe (
247
+ return self .df .pipe (
224
248
label_matches ,
225
249
keep_first_match = keep_first_match ,
226
250
remove_ambiguous = remove_ambiguous ,
227
251
skip_flip = skip_flip ,
228
252
remove_multiallelic = remove_multiallelic ,
229
253
filter_IDs = filter_IDs ,
230
254
)
231
- self ._labelled = True
232
- self .df = df
233
- return self .df
234
255
235
256
def filter (self , score_df , min_overlap = 0.75 , ** kwargs ):
236
257
"""Filter match candidates after labelling according to user parameters"""
237
- if not self ._labelled :
238
- self .df = self .label (** kwargs )
258
+ if self ._match_candidates is None :
259
+ self ._match_candidates = self .label (** kwargs )
239
260
240
261
df , filter_summary = filter_scores (
241
262
scorefile = score_df ,
242
- matches = self .df ,
263
+ matches = self ._match_candidates ,
243
264
min_overlap = min_overlap ,
244
265
dataset = self .dataset ,
245
266
)
246
- self .filter_summary = filter_summary
247
- self .df = df
248
- self ._filtered = True
249
- return self .df
267
+
268
+ return filter_summary , df
250
269
251
270
def write_scorefiles (
252
271
self , directory , score_df , split = False , min_overlap = 0.75 , ** kwargs
@@ -263,35 +282,36 @@ def write_scorefiles(
263
282
* Sets up parallel score calculation (pivots data to wide column format)
264
283
* Writes scores to a directory, splitting based on chromosome and effect type
265
284
"""
266
- if not self ._labelled :
267
- _ = self .label (** kwargs ) # self.df gets updated
285
+ if self .match_candidates is None :
286
+ self . _match_candidates = self .label (** kwargs )
268
287
269
- if not self ._filtered :
270
- # score_df = original scoring file variants
271
- _ = self .filter (score_df = score_df , min_overlap = min_overlap )
288
+ if self .filtered_matches is None :
289
+ self ._filter_summary , self ._filtered_matches = self .filter (
290
+ score_df = score_df , min_overlap = min_overlap
291
+ )
272
292
273
293
# a summary log contains up to one variant (the best match) for each variant
274
294
# in the scoring file
275
- self .summary_log = make_summary_log (
276
- match_candidates = self .df ,
277
- dataset = self .dataset ,
278
- filter_summary = self .filter_summary .lazy (),
279
- scorefile = score_df ,
280
- )
295
+ if self .summary_log is None :
296
+ self ._summary_log = make_summary_log (
297
+ match_candidates = self ._match_candidates ,
298
+ dataset = self .dataset ,
299
+ filter_summary = self .filter_summary .lazy (),
300
+ scorefile = score_df ,
301
+ ).collect ()
281
302
282
303
# double check log count vs scoring file variant count
283
- self ._log_OK = check_log_count (scorefile = score_df , summary_log = self .summary_log )
304
+ self ._log_OK = check_log_count (
305
+ scorefile = score_df , summary_log = self .summary_log .lazy ()
306
+ )
284
307
285
308
# will be empty if no scores pass match threshold, so nothing gets written
286
- plink = PlinkFrames .from_matchresult (self .df )
309
+ plink = PlinkFrames .from_matchresult (self ._filtered_matches )
287
310
outfs = []
288
311
for frame in plink :
289
312
f = frame .write (directory = directory , split = split , dataset = self .dataset )
290
313
outfs .append (f )
291
314
292
- # collect after joining in check_log_count (can't join df and lazy df)
293
- self .summary_log = self .summary_log .collect ()
294
-
295
315
# error at the end, to allow logs to be generated
296
316
for x in self .filter_summary .iter_rows ():
297
317
try :
@@ -318,19 +338,16 @@ def full_variant_log(self, score_df, **kwargs):
318
338
Multiple match candidates may exist for each variant in the original file.
319
339
Describe each variant (one variant per row) with match metadata
320
340
"""
321
- if not self ._labelled :
322
- self .df = self .label (** kwargs )
323
- self ._labelled = True
341
+ if self .match_candidates is None :
342
+ self ._match_candidates = self .label (** kwargs )
324
343
325
- if not self ._filtered :
326
- self .df , self .filter_summary = filter_scores (
327
- scorefile = score_df ,
328
- matches = self .df ,
329
- min_overlap = kwargs .get ("min_overlap" , 0.75 ),
330
- dataset = self .dataset ,
344
+ if self .filtered_matches is None :
345
+ self ._filter_summary , self ._filtered_matches = self .filter (
346
+ score_df = score_df , min_overlap = kwargs .get ("min_overlap" , 0.75 )
331
347
)
332
- self ._filtered = True
333
348
334
349
return make_logs (
335
- scorefile = score_df , dataset = self .dataset , match_candidates = self .df
350
+ scorefile = score_df ,
351
+ dataset = self .dataset ,
352
+ match_candidates = self .match_candidates ,
336
353
)
0 commit comments