From 4d8d647041fcbd0d93862ded307c5cd86e8cffa3 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Wed, 31 Jul 2024 14:25:37 +0100
Subject: [PATCH 01/15] update codecov version

---
 .github/workflows/pytest.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml
index 9a21fbc..1dee333 100644
--- a/.github/workflows/pytest.yaml
+++ b/.github/workflows/pytest.yaml
@@ -52,7 +52,7 @@ jobs:
         working-directory: ${{ inputs.package-directory }}
 
       - name: Upload coverage reports to Codecov
-        uses: codecov/codecov-action@v4.0.1
+        uses: codecov/codecov-action@v4
        with:
           token: ${{ secrets.CODECOV_TOKEN }}
           slug: PGScatalog/pygscatalog

From 7e279dc66c398b0156db4d79ec8ea2e24723e9b6 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Wed, 31 Jul 2024 15:04:36 +0100
Subject: [PATCH 02/15] add tests for non-default CLI options

---
 .../src/pgscatalog/match/lib/_match/label.py | 13 ++++++++++---
 pgscatalog.match/tests/test_match_cli.py     | 19 +++++++++++++++++--
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/pgscatalog.match/src/pgscatalog/match/lib/_match/label.py b/pgscatalog.match/src/pgscatalog/match/lib/_match/label.py
index e7ac80c..2b048c7 100644
--- a/pgscatalog.match/src/pgscatalog/match/lib/_match/label.py
+++ b/pgscatalog.match/src/pgscatalog/match/lib/_match/label.py
@@ -221,6 +221,13 @@ def _label_duplicate_id(df: pl.LazyFrame, keep_first_match: bool) -> pl.LazyFram
 
 
 def _label_biallelic_ambiguous(df: pl.LazyFrame, remove_ambiguous) -> pl.LazyFrame:
+    """
+    Identify ambiguous variants (A/T & C/G SNPs)
+
+    Often scores and genotypes will be on different builds, and including these SNPs
+    would mean tallying incorrect dosages if a strand-flip has happened across builds.
+    Even on the same build you can get improperly strand-oriented data.
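+
+    For example, an A/T SNP is its own reverse complement (A/T on the forward
+    strand reads T/A on the reverse strand), so a strand flip can never be
+    detected from the alleles alone; the same applies to C/G SNPs.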
+    """
     logger.debug("Labelling ambiguous variants")
     ambig = (
         df.with_columns(
@@ -259,7 +266,7 @@ def _label_biallelic_ambiguous(df: pl.LazyFrame, remove_ambiguous) -> pl.LazyFra
         )
     else:
         return (
-            ambig.with_column(pl.lit(False).alias("exclude_ambiguous"))
+            ambig.with_columns(pl.lit(False).alias("exclude_ambiguous"))
             .with_columns(max=pl.max_horizontal("exclude", "exclude_ambiguous"))
             .drop(["exclude", "exclude_ambiguous"])
             .rename({"max": "exclude"})
@@ -301,7 +308,7 @@ def _label_flips(df: pl.LazyFrame, skip_flip: bool) -> pl.LazyFrame:
         )
     else:
         logger.debug("Not excluding flipped matches")
-        return df.with_columns(match_IDs=pl.lit("NA"))
+        return df
 
 
 def _label_filter(df: pl.LazyFrame, filter_IDs: pathlib.Path) -> pl.LazyFrame:
@@ -323,4 +330,4 @@ def _label_filter(df: pl.LazyFrame, filter_IDs: pathlib.Path) -> pl.LazyFrame:
         )
     else:
         logger.info("--filter_IDs not set, skipping filtering")
-        return df
+        return df.with_columns(match_IDs=pl.lit("NA"))

diff --git a/pgscatalog.match/tests/test_match_cli.py b/pgscatalog.match/tests/test_match_cli.py
index e057810..457d661 100644
--- a/pgscatalog.match/tests/test_match_cli.py
+++ b/pgscatalog.match/tests/test_match_cli.py
@@ -66,8 +66,19 @@ def test_multiallelic(tmp_path_factory, multiallelic_variants, good_scorefile):
     assert (outdir / "test_ALL_additive_0.scorefile.gz").exists()
 
 
-def test_match(tmp_path_factory, good_scorefile, good_variants):
-    """Test matching runs without errors with good data"""
+@pytest.mark.parametrize(
+    "ambiguous,multiallelic,skipflip",
+    [
+        ("--keep_ambiguous", "--keep_multiallelic", "--ignore_strand_flips"),
+        (None, None, None),
+    ],
+)
+def test_match(
+    tmp_path_factory, good_scorefile, good_variants, ambiguous, multiallelic, skipflip
+):
+    """Test matching runs without errors with good data
+
+    Parameterised to run twice: once with default CLI match arguments and once with the optional matching arguments enabled."""
 
     outdir = tmp_path_factory.mktemp("outdir")
 
     args = [
         (
             "pgscatalog-match",
             "-d",
             "test",
             "-s",
             str(good_scorefile),
             "-t",
             str(good_variants),
             "--outdir",
             str(outdir),
             "--min_overlap",
             "0.75",
+            ambiguous,
+            multiallelic,
+            skipflip,
         )
     ]
     flargs = list(itertools.chain(*args))
+    flargs = [x for x in flargs if x]  # drop parameterised None
 
     with patch("sys.argv", flargs):
         run_match()

From 1eef0af9d047f3c1aeab0ca9e12b9ba1980bce9a Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Wed, 31 Jul 2024 15:40:25 +0100
Subject: [PATCH 03/15] don't use codecov token in reusable workflow

---
 .github/workflows/pytest.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml
index 1dee333..34dd6f0 100644
--- a/.github/workflows/pytest.yaml
+++ b/.github/workflows/pytest.yaml
@@ -54,6 +54,7 @@ jobs:
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v4
         with:
-          token: ${{ secrets.CODECOV_TOKEN }}
+          use_oidc: true
+          fail_ci_if_error: true
           slug: PGScatalog/pygscatalog
           flags: ${{ inputs.package-directory }}

From 1c086553dd2ca4b454dddc011671262d6af4cd3f Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Wed, 31 Jul 2024 15:42:19 +0100
Subject: [PATCH 04/15] add oidc

---
 .github/workflows/pytest.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml
index 34dd6f0..2c904a1 100644
--- a/.github/workflows/pytest.yaml
+++ b/.github/workflows/pytest.yaml
@@ -29,6 +29,10 @@ jobs:
   pytest:
     runs-on: ubuntu-latest
 
+    permissions:
+      id-token: write
+      contents: read
+
     steps:
       - uses: actions/checkout@v4

From 07927ab4032add5ba251d8db8985b9e18ff144f7 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Wed, 31 Jul 2024 15:45:05 +0100
Subject: [PATCH 05/15] add --keep_first_match test

---
 pgscatalog.match/tests/test_match_cli.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/pgscatalog.match/tests/test_match_cli.py b/pgscatalog.match/tests/test_match_cli.py
index 457d661..791738f 100644
--- a/pgscatalog.match/tests/test_match_cli.py
+++ b/pgscatalog.match/tests/test_match_cli.py
@@ -67,14 +67,25 @@
 @pytest.mark.parametrize(
-    "ambiguous,multiallelic,skipflip",
+    "ambiguous,multiallelic,skipflip,keep_first_match",
     [
-        ("--keep_ambiguous", "--keep_multiallelic", "--ignore_strand_flips"),
-        (None, None, None),
+        (
+            "--keep_ambiguous",
+            "--keep_multiallelic",
+            "--ignore_strand_flips",
+            "--keep_first_match",
+        ),
+        (None, None, None, None),
     ],
 )
 def test_match(
-    tmp_path_factory, good_scorefile, good_variants, ambiguous, multiallelic, skipflip
+    tmp_path_factory,
+    good_scorefile,
+    good_variants,
+    ambiguous,
+    multiallelic,
+    skipflip,
+    keep_first_match,
 ):

From a7513220de1c8ef213efef32add10864e23da398 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Wed, 31 Jul 2024 16:16:52 +0100
Subject: [PATCH 06/15] test --filter_IDs

---
 pgscatalog.match/tests/test_match_cli.py | 58 ++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/pgscatalog.match/tests/test_match_cli.py b/pgscatalog.match/tests/test_match_cli.py
index 791738f..ffb9f31 100644
--- a/pgscatalog.match/tests/test_match_cli.py
+++ b/pgscatalog.match/tests/test_match_cli.py
@@ -1,6 +1,8 @@
 import csv
+import gzip
 import itertools
 import os
+import pathlib
 from unittest.mock import patch
 
 import pytest
@@ -28,6 +30,62 @@ def multiallelic_variants(request):
     return request.path.parent / "data" / "multiallelic.pvar"
 
 
+@pytest.fixture()
+def filter_ids(request, tmp_path, good_scorefile):
+    with open(good_scorefile) as f:
+        scorefile = list(csv.DictReader(f, delimiter="\t"))
+    # build variant IDs from the scoring file
+    ids = [
+        ":".join(
+            [x["chr_name"], x["chr_position"], x["effect_allele"], x["other_allele"]]
+        )
+        for x in scorefile
+    ]
+    # grab half of them, use these to filter matches
+    filter_list = ids[: int(len(ids) / 2)]
+    outf = tmp_path / "filter_ids.txt"
+    with open(outf, mode="wt") as f:
+        [f.write(x + "\n") for x in filter_list]
+
+    return pathlib.Path(outf)
+
+
+def test_filter_match(tmp_path_factory, good_variants, good_scorefile, filter_ids):
+    outdir = tmp_path_factory.mktemp("outdir")
+
+    args = [
+        (
+            "pgscatalog-match",
+            "-d",
+            "test",
+            "-s",
+            str(good_scorefile),
+            "-t",
+            str(good_variants),
+            "--outdir",
+            str(outdir),
+            "--min_overlap",
+            "0.75",
+            "--filter_IDs",
+            str(filter_ids),
+        )
+    ]
+    flargs = list(itertools.chain(*args))
+
+    with patch("sys.argv", flargs):
+        run_match()
+
+    with gzip.open(
+        outdir / "test_ALL_additive_0.scorefile.gz", mode="rt"
+    ) as out_f, open(good_scorefile) as f1, open(filter_ids) as f2:
+        n_out_score = len(list(csv.DictReader(out_f, delimiter="\t")))
+        n_scorefile = sum(1 for _ in f1)
+        n_filt = sum(1 for _ in f2)
+
+    # matched output from scorefile has been filtered
+    assert n_out_score < n_filt < n_scorefile
+
+
 def test_multiallelic(tmp_path_factory, multiallelic_variants, good_scorefile):
     outdir = tmp_path_factory.mktemp("outdir")

From 92c660650bb10f69325615caccfbf0e016df422a Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Thu, 1 Aug 2024 09:47:55 +0100
Subject: [PATCH 07/15] cover batching in pgscatalog-intersect test

---
 .../src/pgscatalog/match/cli/intersect_cli.py | 12 ++++++++++--
 pgscatalog.match/tests/test_intersect_cli.py  |  3 +++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/pgscatalog.match/src/pgscatalog/match/cli/intersect_cli.py b/pgscatalog.match/src/pgscatalog/match/cli/intersect_cli.py
index feeb5b9..56bcbc5 100644
--- a/pgscatalog.match/src/pgscatalog/match/cli/intersect_cli.py
+++ b/pgscatalog.match/src/pgscatalog/match/cli/intersect_cli.py
@@ -49,7 +49,7 @@ def run_intersect():
                     ([key, v["ID"], v["REF"]], [IS_INDEL, STRANDAMB, IS_MA_REF])
                 )
 
-            if count_var_r % 500000 == 0:
+            if count_var_r % args.batch_size == 0:
                 heapq.heapify(ref_heap)
                 tmppath = tempfile.NamedTemporaryFile(dir=tmpdir, delete=False)
                 with open(tmppath.name, "wt") as outf:
@@ -134,7 +134,7 @@ def run_intersect():
                     )
                 )
 
-            if count_var_t % 500000 == 0:
+            if count_var_t % args.batch_size == 0:
                 heapq.heapify(target_heap)
                 tmppath = tempfile.NamedTemporaryFile(dir=tmpdir, delete=False)
                 with open(tmppath.name, "wt") as outf:
@@ -397,6 +397,14 @@ def parse_args(args=None):
     parser.add_argument(
         "--outdir", dest="outdir", required=True, help=" Output directory"
     )
+    parser.add_argument(
+        "--batch_size",
+        dest="batch_size",
+        type=int,
+        default=500000,
+        required=False,
+        help=" Number of variants processed per batch",
+    )
     return parser.parse_args(args)

diff --git a/pgscatalog.match/tests/test_intersect_cli.py b/pgscatalog.match/tests/test_intersect_cli.py
index 9ff54fc..d03a4f8 100644
--- a/pgscatalog.match/tests/test_intersect_cli.py
+++ b/pgscatalog.match/tests/test_intersect_cli.py
@@ -30,6 +30,7 @@ def target(request):
 def test_intersect_cli(tmp_path_factory, ref, target, afreq, vmiss):
     outdir = tmp_path_factory.mktemp("outdir")
 
+    # using a low batch size to test batching behaviour works as expected
     args = [
         (
             "pgscatalog-intersect",
             "-r",
             str(ref),
             "-t",
             str(target),
             "--outdir",
             str(outdir),
+            "--batch_size",
+            "100",
         )
     ]
     flargs = list(itertools.chain(*args))

From 391818029a8961a915349c01a10b527de677dda1 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Thu, 1 Aug 2024 11:11:02 +0100
Subject: [PATCH 08/15] add pypi badge to README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index a6b2ba9..5bdf6c1 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ that has been converted to namespace packages for modularity and re-use.
 ## User applications
 
 [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/pgscatalog-utils/README.html)
+[![install with pypi](https://img.shields.io/pypi/v/pgscatalog-utils)](https://pypi.org/project/pgscatalog-utils/)
 
 These CLI applications are used internally by the [PGS Catalog Calculator (`pgsc_calc`)](https://github.com/PGScatalog/pgsc_calc) workflow for calculating PGS and performing common adjustments for genetic ancestry.
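A note on the batching introduced in PATCH 07: intersect_cli.py accumulates
variants in memory and, every batch_size records, heapifies the batch and
spills it to a temporary file, so arbitrarily large inputs can be processed in
bounded memory before the batches are combined downstream. A minimal sketch of
that external-sort pattern using only the standard library (the helper names
here are illustrative, not the real intersect_cli.py API):

    import heapq
    import tempfile

    def spill_batches(lines, batch_size, tmpdir):
        """Sort each full batch of lines and spill it to a temporary file."""
        paths, batch = [], []
        for i, line in enumerate(lines, start=1):
            batch.append(line)
            if i % batch_size == 0:  # batch is full: write it out, start fresh
                paths.append(write_sorted(batch, tmpdir))
                batch = []
        if batch:  # don't forget the final partial batch
            paths.append(write_sorted(batch, tmpdir))
        return paths

    def write_sorted(batch, tmpdir):
        with tempfile.NamedTemporaryFile(dir=tmpdir, delete=False, mode="wt") as outf:
            outf.writelines(line + "\n" for line in sorted(batch))
            return outf.name

    def merge_sorted_files(paths):
        """Lazily k-way merge pre-sorted batch files into one sorted stream."""
        handles = [open(p) for p in paths]
        try:
            yield from (line.rstrip("\n") for line in heapq.merge(*handles))
        finally:
            for handle in handles:
                handle.close()

Lowering --batch_size to 100 in the test forces several spill cycles over a
small fixture, which is presumably the point: the 500000 default would never
trigger the batching code path on test-sized data.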
From cec3d47665989199285a7e4c846b34c1438f305c Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Thu, 1 Aug 2024 11:20:46 +0100
Subject: [PATCH 09/15] update codecov config

---
 codecov.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/codecov.yml b/codecov.yml
index 085f461..90b115a 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -3,10 +3,9 @@ flag_management:
     carryforward: true
     statuses:
       - type: project
-        target: auto
-        threshold: 1%
+        target: 80%
       - type: patch
-        target: 90%
+        target: 80%
   individual_flags: # exceptions to the default rules above, stated flag by flag
     - name: pgscatalog.core #fill in your own flag name
       paths:

From ee0044e407d138fc0d5ae3341cc34cb00ede1675 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Thu, 1 Aug 2024 11:35:23 +0100
Subject: [PATCH 10/15] let's try again

---
 codecov.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/codecov.yml b/codecov.yml
index 90b115a..f489462 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -1,3 +1,10 @@
+coverage:
+  status:
+    project:
+      default:
+        target: 80%
+        threshold: 5
+
 flag_management:
   default_rules: # the rules that will be followed for any flag added, generally
     carryforward: true

From b6520dd19b34580a91443dd7f732bf0fe09dc072 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Thu, 1 Aug 2024 11:42:51 +0100
Subject: [PATCH 11/15] Revert "let's try again"

This reverts commit ee0044e407d138fc0d5ae3341cc34cb00ede1675.
---
 codecov.yml | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/codecov.yml b/codecov.yml
index f489462..90b115a 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -1,10 +1,3 @@
-coverage:
-  status:
-    project:
-      default:
-        target: 80%
-        threshold: 5
-
 flag_management:
   default_rules: # the rules that will be followed for any flag added, generally
     carryforward: true

From 8fb9c7f2c11a46bfcc63ec0e9341b84940ba05d7 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Thu, 1 Aug 2024 11:43:28 +0100
Subject: [PATCH 12/15] bump patch version

---
 pgscatalog.match/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgscatalog.match/pyproject.toml b/pgscatalog.match/pyproject.toml
index 32adfa4..c8f512b 100644
--- a/pgscatalog.match/pyproject.toml
+++ b/pgscatalog.match/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pgscatalog.match"
-version = "0.2.3"
+version = "0.2.4"
 description = "Tools for matching variants in PGS scoring files and target variant information files"
 authors = ["Benjamin Wingfield ", "Samuel Lambert ", "Laurent Gil "]
 readme = "README.md"

From 454834eb3489d424322c68aaccf2c73454e2c938 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Thu, 1 Aug 2024 11:46:56 +0100
Subject: [PATCH 13/15] bump checkout version

---
 .github/workflows/ruff.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index 87a0780..7421606 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -11,5 +11,5 @@ jobs:
   ruff:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
-      - uses: chartboost/ruff-action@v1
\ No newline at end of file
+      - uses: actions/checkout@v4
+      - uses: chartboost/ruff-action@v1

From 5a5efeb0f85d9573dac7f5d5835b861bcc329ec1 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Fri, 2 Aug 2024 12:09:00 +0100
Subject: [PATCH 14/15] Fix variant deduplication (#38)

* bump checkout version

* fix deduplicating

* add deduplication end to end test
---
 .../src/pgscatalog/match/lib/_match/plink.py  | 105 ++++++++++++------
 .../tests/data/duplicated_scorefile.txt       |   5 +
 pgscatalog.match/tests/test_match_cli.py      |  50 +++++++++
 3 files changed, 125 insertions(+), 35 deletions(-)
 create mode 100644 pgscatalog.match/tests/data/duplicated_scorefile.txt

diff --git a/pgscatalog.match/src/pgscatalog/match/lib/_match/plink.py b/pgscatalog.match/src/pgscatalog/match/lib/_match/plink.py
index 5e05705..fd371f6 100644
--- a/pgscatalog.match/src/pgscatalog/match/lib/_match/plink.py
+++ b/pgscatalog.match/src/pgscatalog/match/lib/_match/plink.py
@@ -14,8 +14,43 @@ def plinkify(df):
 
     Variants with the same ID and different effect alleles must be split
     into different files
 
     Input df must contain match candidates filtered to best matched variant
+
+    Some fake match data where effect_allele is the same but other_allele is different:
+
+    >>> data = {'row_nr': [1, 2, 3], 'chr_name': ['11', '11', '11'], 'chr_position': [69331418, 69331418, 69331419], 'effect_allele': ['A', 'C', 'T'], 'other_allele': ['C', 'C', 'T'], 'effect_weight': ['1', '1.5', '2'], 'effect_type': ['additive', 'additive', 'additive'], 'accession': ['pgs1', 'pgs2', 'pgs3'], 'ID': ['11:69331418:A:C', '11:69331418:A:C', '11:69331419:T:T'], 'matched_effect_allele': ['A', 'C', 'T']}
+    >>> plinked = plinkify(pl.DataFrame(data).lazy())
+    >>> dfs = sorted((x.collect() for x in plinked[EffectType.ADDITIVE]), key=lambda x: len(x))
+    >>> assert len(dfs) == 2
+    >>> dfs[0].select(["row_nr", "accession", "ID", "matched_effect_allele"])
+    shape: (1, 4)
+    ┌────────┬───────────┬─────────────────┬───────────────────────┐
+    │ row_nr ┆ accession ┆ ID              ┆ matched_effect_allele │
+    │ ---    ┆ ---       ┆ ---             ┆ ---                   │
+    │ i64    ┆ str       ┆ str             ┆ str                   │
+    ╞════════╪═══════════╪═════════════════╪═══════════════════════╡
+    │ 2      ┆ pgs2      ┆ 11:69331418:A:C ┆ C                     │
+    └────────┴───────────┴─────────────────┴───────────────────────┘
+
+    >>> dfs[1].select(["row_nr", "accession", "ID", "matched_effect_allele"])
+    shape: (2, 4)
+    ┌────────┬───────────┬─────────────────┬───────────────────────┐
+    │ row_nr ┆ accession ┆ ID              ┆ matched_effect_allele │
+    │ ---    ┆ ---       ┆ ---             ┆ ---                   │
+    │ i64    ┆ str       ┆ str             ┆ str                   │
+    ╞════════╪═══════════╪═════════════════╪═══════════════════════╡
+    │ 1      ┆ pgs1      ┆ 11:69331418:A:C ┆ A                     │
+    │ 3      ┆ pgs3      ┆ 11:69331419:T:T ┆ T                     │
+    └────────┴───────────┴─────────────────┴───────────────────────┘
+
+    When merging a lot of scoring files, sometimes a variant might be duplicated.
+    This can happen when the matched effect allele differs at the same position, e.g.:
+        - chr1: chr2:20003:A:C A 0.3 NA
+        - chr1: chr2:20003:A:C C NA 0.7
+    where the last two columns represent different scores. plink demands
+    unique identifiers, so we need to split, score, and sum later.
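+
+    In the doctest above pgs2 shares an ID with pgs1 but has a different
+    matched effect allele, so it is written to a separate scoring file while
+    pgs1 and pgs3 can safely stay together.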
""" min_cols = [ + "row_nr", "accession", "effect_type", "chr_name", @@ -36,7 +71,7 @@ def plinkify(df): deduped = {} for effect_type, effect_frame in effect_frames: deduped.setdefault(effect_type, []) - deduped[effect_type].append(effect_frame) + deduped[effect_type] = _deduplicate_variants(effect_frame) return deduped @@ -94,7 +129,7 @@ def _split_effect_type( return additive, dominant, recessive -def _deduplicate_variants(df: pl.LazyFrame) -> list[pl.LazyFrame | None]: +def _deduplicate_variants(df: pl.LazyFrame) -> list[pl.LazyFrame]: """Find variant matches that have duplicate identifiers When merging a lot of scoring files, sometimes a variant might be duplicated this can happen when the matched effect allele differs at the same position, e.g.: @@ -108,44 +143,44 @@ def _deduplicate_variants(df: pl.LazyFrame) -> list[pl.LazyFrame | None]: Returns: A list of dataframes, with unique ID - matched effect allele combinations """ - if df.select("ID").head().collect().is_empty(): + if df.limit(1).collect().is_empty(): logger.info("Empty input: skipping deduplication") - return [None] - else: - logger.debug("Deduplicating variants") - - # 1. unique ID - EA is important because normal duplicates are already - # handled by pivoting, and it's pointless to split them unnecessarily - # 2. use cumcount to number duplicate IDs - # 3. join cumcount data on original DF, use this data for splitting - # note: effect_allele should be equivalent to matched_effect_allele - ea_count: pl.LazyFrame = ( - df.select(["ID", "matched_effect_allele"]) - .unique() - .with_columns( + return [df] + logger.debug("Deduplicating variants (splitting across files)") + ea_count: pl.LazyFrame = df.unique( + ["ID", "matched_effect_allele"], maintain_order=True + ).with_columns([pl.col(["ID"]).cum_count().over(["ID"]).alias("cum_count")]) + + # now get groups for each different cumulative count (1 instance, 2, ..., n) + groups = ea_count.collect().group_by(["cum_count"], maintain_order=True) + group_dfs = (x.lazy() for _, x in groups) + ldf_lst = [] + + for x in group_dfs: + tempdf = df.join( + x, on=["row_nr", "ID", "matched_effect_allele"], how="inner" + ).select( [ - pl.col("ID").cumcount().over(["ID"]).alias("cumcount"), - pl.col("ID").count().over(["ID"]).alias("count"), + "row_nr", + "accession", + "effect_type", + "chr_name", + "ID", + "matched_effect_allele", + "effect_weight", ] ) - ) - - dup_label: pl.LazyFrame = df.join( - ea_count, on=["ID", "matched_effect_allele"], how="left" - ) + ldf_lst.append(tempdf) - # now split the matched variants, and make sure we don't lose any - # cumcount = ngroup-1, so add 1 - n_splits: int = ea_count.select("cumcount").max().collect().item(0, 0) + 1 - df_lst: list = [] + # double check to help me sleep at night + original_length = df.select(pl.len()).collect().item(0, 0) + new_length = sum(x.select(pl.len()).collect().item(0, 0) for x in ldf_lst) - for i in range(0, n_splits): - x: pl.LazyFrame = dup_label.filter(pl.col("cumcount") == i).drop( - ["cumcount", "count"] + if original_length != new_length: + raise ValueError( + f"Some variants went missing during deduplication ({original_length} != {new_length}) " + "This should never happen" ) - if (df := x.collect()).is_empty(): - continue - else: - df_lst.append(df.lazy()) - return df_lst + logger.info("Dataframe lengths are OK after deduplicating :)") + return ldf_lst diff --git a/pgscatalog.match/tests/data/duplicated_scorefile.txt b/pgscatalog.match/tests/data/duplicated_scorefile.txt new file mode 100644 index 0000000..563a965 
--- /dev/null
+++ b/pgscatalog.match/tests/data/duplicated_scorefile.txt
@@ -0,0 +1,5 @@
+chr_name	chr_position	effect_allele	other_allele	effect_weight	effect_type	is_duplicated	accession	row_nr
+11	69331418	C	T	0.210422987	additive	FALSE	test	0
+11	69331418	T	C	0.210422987	additive	FALSE	test2	1
+11	69379161	A	C	0.018723614	additive	FALSE	test2	2
+5	1279790	T	C	-0.005213567	additive	FALSE	test3	4

diff --git a/pgscatalog.match/tests/test_match_cli.py b/pgscatalog.match/tests/test_match_cli.py
index ffb9f31..178b11f 100644
--- a/pgscatalog.match/tests/test_match_cli.py
+++ b/pgscatalog.match/tests/test_match_cli.py
@@ -15,6 +15,11 @@ def good_scorefile(request):
     return request.path.parent / "data" / "good_match_scorefile.txt"
 
 
+@pytest.fixture(scope="module")
+def duplicated_scorefile(request):
+    return request.path.parent / "data" / "duplicated_scorefile.txt"
+
+
 @pytest.fixture(scope="module")
 def bad_scorefile(request):
     return request.path.parent / "data" / "bad_match_scorefile.txt.gz"
@@ -86,6 +91,51 @@ def test_filter_match(tmp_path_factory, good_variants, good_scorefile, filter_id
     assert n_out_score < n_filt < n_scorefile
 
 
+def test_duplicated(tmp_path_factory, good_variants, duplicated_scorefile):
+    """A scoring file where the same variant ID has different effect alleles from different scores.
+    Instead of pivoting wider, these variants must be split across different files
+    (plink2 doesn't like duplicated variants).
+    """
+    outdir = tmp_path_factory.mktemp("outdir")
+
+    args = [
+        (
+            "pgscatalog-match",
+            "-d",
+            "test",
+            "-s",
+            str(duplicated_scorefile),
+            "-t",
+            str(good_variants),
+            "--outdir",
+            str(outdir),
+            "--min_overlap",
+            "0.75",
+        )
+    ]
+    flargs = list(itertools.chain(*args))
+
+    with patch("sys.argv", flargs):
+        run_match()
+
+    # scoring files seem to be split properly
+    assert (outdir / "test_ALL_additive_0.scorefile.gz").exists()
+    assert (outdir / "test_ALL_additive_1.scorefile.gz").exists()
+
+    # test the split happened
+    with gzip.open(
+        outdir / "test_ALL_additive_0.scorefile.gz", mode="rt"
+    ) as f1, gzip.open(outdir / "test_ALL_additive_1.scorefile.gz", mode="rt") as f2:
+        f1variants = list(csv.DictReader(f1, delimiter="\t"))
+        f2variants = list(csv.DictReader(f2, delimiter="\t"))
+
+    # variants were split correctly across the files
+    assert len(f1variants) == 3 and len(f2variants) == 1
+    # test2 and test3 PGS have been pivoted
+    assert all("test2" in x and "test3" in x for x in f1variants)
+    assert all("test" in x for x in f2variants)
+
+
 def test_multiallelic(tmp_path_factory, multiallelic_variants, good_scorefile):
     outdir = tmp_path_factory.mktemp("outdir")

From 2aed633184f7ee611cf3d9631fa6d4fda2eae765 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Fri, 2 Aug 2024 12:18:15 +0100
Subject: [PATCH 15/15] bump minor version

---
 pgscatalog.match/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgscatalog.match/pyproject.toml b/pgscatalog.match/pyproject.toml
index c8f512b..54bee6b 100644
--- a/pgscatalog.match/pyproject.toml
+++ b/pgscatalog.match/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pgscatalog.match"
-version = "0.2.4"
+version = "0.3.0"
 description = "Tools for matching variants in PGS scoring files and target variant information files"
 authors = ["Benjamin Wingfield ", "Samuel Lambert ", "Laurent Gil "]
 readme = "README.md"
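A note on the PATCH 14 fix: the cum_count trick numbers each repeat of a
variant ID, and grouping on that number yields frames in which every ID is
unique. A pure-Python sketch of the same splitting idea (illustrative only,
not the package API), with values loosely following duplicated_scorefile.txt:

    from collections import defaultdict

    def split_unique_ids(rows):
        """Split (ID, effect_allele, weight) rows into groups with unique IDs."""
        groups = defaultdict(list)
        seen = defaultdict(int)  # how many times each ID has appeared so far
        for row in rows:
            variant_id = row[0]
            # the nth occurrence of an ID goes to the nth group (like cum_count)
            groups[seen[variant_id]].append(row)
            seen[variant_id] += 1
        # sanity check mirroring the patch: no variants lost in the split
        assert sum(len(g) for g in groups.values()) == len(rows)
        return [groups[i] for i in sorted(groups)]

    rows = [
        ("11:69331418:C:T", "C", 0.210422987),   # test
        ("11:69331418:C:T", "T", 0.210422987),   # test2: same ID, different allele
        ("11:69379161:A:C", "A", 0.018723614),   # test2
        ("5:1279790:T:C", "T", -0.005213567),    # test3
    ]
    first, second = split_unique_ids(rows)
    assert len(first) == 3 and len(second) == 1  # matches test_duplicated

As in _deduplicate_variants, the group lengths must sum to the input length,
so no variant can silently go missing during the split.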