@@ -19,29 +19,29 @@ def plinkify(df):
19
19
20
20
>>> data = {'row_nr': [1, 1, 1], 'chr_name': ['11', '11', '11'], 'chr_position': [69331418, 69331418, 69331419], 'effect_allele': ['A', 'C', 'T'], 'other_allele': ['C', 'C', 'T'], 'effect_weight': ['1', '1.5', '2'], 'effect_type': ['additive', 'additive', 'additive'], 'accession': ['pgs1', 'pgs2', 'pgs3'],'ID': ['11:69331418:A:C', '11:69331418:A:C', '11:69331419:T:T'],'matched_effect_allele': ['A', 'C', 'T']}
21
21
>>> plinked = plinkify(pl.DataFrame(data).lazy())
22
- >>> dfs = sorted(( x.collect() for x in plinked[EffectType.ADDITIVE]), key= lambda x: len(x))
22
+ >>> dfs = [ x.collect() for x in plinked[EffectType.ADDITIVE]]
23
23
>>> len(dfs)
24
24
2
25
25
26
26
>>> dfs[0].select(["row_nr", "accession", "ID", "matched_effect_allele"])
27
- shape: (1 , 4)
27
+ shape: (2 , 4)
28
28
┌────────┬───────────┬─────────────────┬───────────────────────┐
29
29
│ row_nr ┆ accession ┆ ID ┆ matched_effect_allele │
30
30
│ --- ┆ --- ┆ --- ┆ --- │
31
31
│ i64 ┆ str ┆ str ┆ str │
32
32
╞════════╪═══════════╪═════════════════╪═══════════════════════╡
33
- │ 1 ┆ pgs2 ┆ 11:69331418:A:C ┆ C │
33
+ │ 1 ┆ pgs1 ┆ 11:69331418:A:C ┆ A │
34
+ │ 1 ┆ pgs3 ┆ 11:69331419:T:T ┆ T │
34
35
└────────┴───────────┴─────────────────┴───────────────────────┘
35
36
36
37
>>> dfs[1].select(["row_nr", "accession", "ID", "matched_effect_allele"])
37
- shape: (2 , 4)
38
+ shape: (1 , 4)
38
39
┌────────┬───────────┬─────────────────┬───────────────────────┐
39
40
│ row_nr ┆ accession ┆ ID ┆ matched_effect_allele │
40
41
│ --- ┆ --- ┆ --- ┆ --- │
41
42
│ i64 ┆ str ┆ str ┆ str │
42
43
╞════════╪═══════════╪═════════════════╪═══════════════════════╡
43
- │ 1 ┆ pgs1 ┆ 11:69331418:A:C ┆ A │
44
- │ 1 ┆ pgs3 ┆ 11:69331419:T:T ┆ T │
44
+ │ 1 ┆ pgs2 ┆ 11:69331418:A:C ┆ C │
45
45
└────────┴───────────┴─────────────────┴───────────────────────┘
46
46
47
47
When merging a lot of scoring files, sometimes a variant might be duplicated
@@ -201,4 +201,6 @@ def _deduplicate_variants(
201
201
)
202
202
203
203
logger .info ("Split dataframe lengths are consistent with input after deduplicating" )
204
- return ldf_lst
204
+ return sorted (
205
+ ldf_lst , key = lambda x : x .select (pl .len ()).collect ().item (), reverse = True
206
+ )
0 commit comments