Skip to content

Commit 2dc348a

Browse files
committed
sort _deduplicate_variants output by df length
1 parent 9ee5293 commit 2dc348a

File tree

1 file changed

+9
-7
lines changed
  • pgscatalog.match/src/pgscatalog/match/lib/_match

1 file changed

+9
-7
lines changed

pgscatalog.match/src/pgscatalog/match/lib/_match/plink.py

+9 -7
Original file line number | Diff line number | Diff line change
@@ -19,29 +19,29 @@ def plinkify(df):
 19  19
 20  20       >>> data = {'row_nr': [1, 1, 1], 'chr_name': ['11', '11', '11'], 'chr_position': [69331418, 69331418, 69331419], 'effect_allele': ['A', 'C', 'T'], 'other_allele': ['C', 'C', 'T'], 'effect_weight': ['1', '1.5', '2'], 'effect_type': ['additive', 'additive', 'additive'], 'accession': ['pgs1', 'pgs2', 'pgs3'],'ID': ['11:69331418:A:C', '11:69331418:A:C', '11:69331419:T:T'],'matched_effect_allele': ['A', 'C', 'T']}
 21  21       >>> plinked = plinkify(pl.DataFrame(data).lazy())
 22     -    >>> dfs = sorted((x.collect() for x in plinked[EffectType.ADDITIVE]), key= lambda x: len(x))
     22 +    >>> dfs = [x.collect() for x in plinked[EffectType.ADDITIVE]]
 23  23       >>> len(dfs)
 24  24       2
 25  25
 26  26       >>> dfs[0].select(["row_nr", "accession", "ID", "matched_effect_allele"])
 27     -    shape: (1, 4)
     27 +    shape: (2, 4)
 28  28       ┌────────┬───────────┬─────────────────┬───────────────────────┐
 29  29       │ row_nr ┆ accession ┆ ID              ┆ matched_effect_allele │
 30  30       │ ---    ┆ ---       ┆ ---             ┆ ---                   │
 31  31       │ i64    ┆ str       ┆ str             ┆ str                   │
 32  32       ╞════════╪═══════════╪═════════════════╪═══════════════════════╡
 33     -    │ 1      ┆ pgs2      ┆ 11:69331418:A:C ┆ C                     │
     33 +    │ 1      ┆ pgs1      ┆ 11:69331418:A:C ┆ A                     │
     34 +    │ 1      ┆ pgs3      ┆ 11:69331419:T:T ┆ T                     │
 34  35       └────────┴───────────┴─────────────────┴───────────────────────┘
 35  36
 36  37       >>> dfs[1].select(["row_nr", "accession", "ID", "matched_effect_allele"])
 37     -    shape: (2, 4)
     38 +    shape: (1, 4)
 38  39       ┌────────┬───────────┬─────────────────┬───────────────────────┐
 39  40       │ row_nr ┆ accession ┆ ID              ┆ matched_effect_allele │
 40  41       │ ---    ┆ ---       ┆ ---             ┆ ---                   │
 41  42       │ i64    ┆ str       ┆ str             ┆ str                   │
 42  43       ╞════════╪═══════════╪═════════════════╪═══════════════════════╡
 43     -    │ 1      ┆ pgs1      ┆ 11:69331418:A:C ┆ A                     │
 44     -    │ 1      ┆ pgs3      ┆ 11:69331419:T:T ┆ T                     │
     44 +    │ 1      ┆ pgs2      ┆ 11:69331418:A:C ┆ C                     │
 45  45       └────────┴───────────┴─────────────────┴───────────────────────┘
 46  46
 47  47       When merging a lot of scoring files, sometimes a variant might be duplicated
@@ -201,4 +201,6 @@ def _deduplicate_variants(
201 201       )
202 202
203 203       logger.info("Split dataframe lengths are consistent with input after deduplicating")
204     -    return ldf_lst
    204 +    return sorted(
    205 +        ldf_lst, key=lambda x: x.select(pl.len()).collect().item(), reverse=True
    206 +    )

0 commit comments

Comments
 (0)