8
8
from ._plinkframe import PlinkFrames
9
9
from ._match .label import label_matches
10
10
from ._match .filter import filter_scores
11
+ from ._match .log import make_logs , check_log_count , make_summary_log
11
12
12
13
logger = logging .getLogger (__name__ )
13
14
@@ -136,6 +137,22 @@ class MatchResults(collections.abc.Sequence):
136
137
>>> sum("additive" in f for f in scorefiles)
137
138
19
138
139
>>> assert len(scorefiles) == 21
140
+
141
+ An important part of matching variants is reporting a log:
142
+ >>> with scorefile as score_df:
143
+ ... MatchResults(x).full_variant_log(score_df).fetch(2)
144
+ shape: (3, 23)
145
+ ┌────────┬───────────┬──────────┬────────────┬───┬────────────┬───────────┬────────────┬───────────┐
146
+ │ row_nr ┆ accession ┆ chr_name ┆ chr_positi ┆ … ┆ duplicate_ ┆ match_IDs ┆ match_stat ┆ dataset │
147
+ │ --- ┆ --- ┆ --- ┆ on ┆ ┆ ID ┆ --- ┆ us ┆ --- │
148
+ │ u64 ┆ cat ┆ cat ┆ --- ┆ ┆ --- ┆ null ┆ --- ┆ cat │
149
+ │ ┆ ┆ ┆ u64 ┆ ┆ bool ┆ ┆ cat ┆ │
150
+ ╞════════╪═══════════╪══════════╪════════════╪═══╪════════════╪═══════════╪════════════╪═══════════╡
151
+ │ null ┆ null ┆ null ┆ null ┆ … ┆ false ┆ null ┆ matched ┆ goodmatch │
152
+ │ 0 ┆ PGS000002 ┆ 11 ┆ 69331418 ┆ … ┆ false ┆ null ┆ matched ┆ goodmatch │
153
+ │ 77 ┆ PGS000002 ┆ 11 ┆ 69331418 ┆ … ┆ null ┆ null ┆ unmatched ┆ goodmatch │
154
+ └────────┴───────────┴──────────┴────────────┴───┴────────────┴───────────┴────────────┴───────────┘
155
+
139
156
"""
140
157
141
158
def __init__ (self , * elements ):
@@ -149,7 +166,14 @@ def __init__(self, *elements):
149
166
self .dataset = self ._elements [0 ].dataset
150
167
# a df composed of all match result elements
151
168
self .df = pl .scan_ipc (x .ipc_path for x in self ._elements )
169
+ # a summary table containing match rates
170
+ self .filter_summary = None
171
+ # have match candidates in df been labelled?
152
172
self ._labelled = False
173
+ # have match candidates in df been filtered?
174
+ self ._filtered = False
175
+ # does the input scoring file count match the variant log count?
176
+ self ._log_OK = None
153
177
154
178
def __len__ (self ):
155
179
return len (self ._elements )
@@ -189,24 +213,58 @@ def write_scorefiles(self, directory, score_df, split=False, **kwargs):
189
213
if not self ._labelled :
190
214
self .df = self .label (** kwargs )
191
215
192
- self .df , self .score_summary = filter_scores (
216
+ self .df , self .filter_summary = filter_scores (
193
217
scorefile = score_df ,
194
218
matches = self .df ,
195
219
min_overlap = kwargs .get ("min_overlap" , 0.75 ),
196
220
dataset = self .dataset ,
197
221
)
198
- self .filtered = True
222
+ self ._filtered = True
199
223
200
- if self .score_summary .is_empty ():
224
+ # a filter summary contains match rates for each accession, and a column
225
+ # indicating if the score passes the minimum matching threshold
226
+ if self .filter_summary .is_empty ():
201
227
# can happen if min_overlap = 0
202
228
raise ZeroMatchesError (
203
229
"Error: no target variants match any variants in scoring files"
204
230
)
205
231
232
+ # a summary log contains up to one variant (the best match) for each variant
233
+ # in the scoring file
234
+ self .summary_log = make_summary_log (
235
+ match_candidates = self .df ,
236
+ dataset = self .dataset ,
237
+ filter_summary = self .filter_summary .lazy (),
238
+ scorefile = score_df ,
239
+ )
240
+
241
+ # double check log count vs scoring file variant count
242
+ self ._log_OK = check_log_count (scorefile = score_df , summary_log = self .summary_log )
243
+
206
244
plink = PlinkFrames .from_matchresult (self .df )
207
245
208
246
for frame in plink :
209
247
frame .write (directory = directory , split = split , dataset = self .dataset )
210
248
211
- def full_variant_log (self ):
212
- raise NotImplementedError
249
def full_variant_log(self, score_df, **kwargs):
    """Build the full variant log for a scoring file

    A variant in the original scoring file can have several match
    candidates. The log describes the variants with match metadata
    (one variant per row).

    Match candidates are labelled and filtered first when that has not
    happened yet. All keyword arguments are forwarded to label(); the
    "min_overlap" keyword (default 0.75) is also handed to filter_scores().

    Returns whatever make_logs() produces for the scoring file, the dataset
    and the match candidates.
    """
    # label match candidates, unless an earlier call already did so
    if not self._labelled:
        labelled = self.label(**kwargs)
        self.df = labelled
        self._labelled = True

    # filter match candidates, which also produces the filter summary
    if not self._filtered:
        overlap = kwargs.get("min_overlap", 0.75)
        filtered, summary = filter_scores(
            scorefile=score_df,
            matches=self.df,
            min_overlap=overlap,
            dataset=self.dataset,
        )
        self.df, self.filter_summary = filtered, summary
        self._filtered = True

    variant_log = make_logs(
        match_candidates=self.df,
        dataset=self.dataset,
        scorefile=score_df,
    )
    return variant_log
0 commit comments