
Commit 9b31676: add calcapp
1 parent: 27d52da

32 files changed: +1001 −52 lines

.github/workflows/calcapp-pytest.yml (+13)

on:
  push:
    paths:
      - 'pgscatalog.calcapp/**.py'
  pull_request:
    paths:
      - 'pgscatalog.calcapp/**.py'

jobs:
  calcapp-pytest:
    uses: ./.github/workflows/pytest.yaml
    with:
      package-directory: "pgscatalog.calcapp"

pgscatalog.calcapp/README.md

Whitespace-only changes.

pgscatalog.calcapp/poetry.lock (+720)

Generated file; contents not rendered.

pgscatalog.calcapp/poetry.toml (+3)

[virtualenvs]
create = true
in-project = true

pgscatalog.calcapp/pyproject.toml (+23)

[tool.poetry]
name = "pgscatalog-calcapp"
version = "0.1.0"
description = ""
authors = ["Benjamin Wingfield <bwingfield@ebi.ac.uk>"]
readme = "README.md"
packages = [
    { include = "pgscatalog", from = "src" },
]

[tool.poetry.dependencies]
python = "^3.11"
"pgscatalog.calclib" = { path = "../pgscatalog.calclib", develop = true }

[tool.poetry.group.dev.dependencies]
pytest = "^8.0.0"

[tool.poetry.scripts]
pgscatalog-aggregate = 'pgscatalog.calcapp.aggregate_cli:run_aggregate'

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
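For reference, the [tool.poetry.scripts] entry wires a pgscatalog-aggregate command to the run_aggregate function defined below. The console script generated at install time behaves roughly like this wrapper (a sketch, not the actual generated shim):

import sys

from pgscatalog.calcapp.aggregate_cli import run_aggregate

if __name__ == "__main__":
    # console scripts call the target function and exit with its return value
    sys.exit(run_aggregate())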
pgscatalog.calcapp/src/pgscatalog/calcapp/__init__.py (+3)

from .aggregate_cli import run_aggregate

__all__ = ["run_aggregate"]
pgscatalog.calcapp/src/pgscatalog/calcapp/aggregate_cli.py (+75)

import argparse
import pathlib
import textwrap
import operator
import functools

from pgscatalog.calclib.polygenicscore import PolygenicScore


def run_aggregate():
    args = _parse_args()
    score_paths = [pathlib.Path(x) for x in args.scores]
    pgs = [PolygenicScore(path=x) for x in score_paths]
    # call PolygenicScore.__add__ repeatedly to sum the scores
    aggregated = functools.reduce(operator.add, pgs)
    aggregated.write(outdir=args.outdir, split=args.split)


def _description_text() -> str:
    return textwrap.dedent(
        """
        Aggregate plink .sscore files into a combined TSV table.

        This aggregation sums scores that were calculated from plink
        .scorefiles. Scorefiles may be split to calculate scores over different
        chromosomes or effect types. The PGS Catalog calculator automatically splits
        scorefiles where appropriate, and uses this script to combine them.

        Input .sscore files can optionally be compressed with zstd or gzip.

        The aggregated output scores are compressed with gzip.
        """
    )


def _parse_args(args=None):
    parser = argparse.ArgumentParser(
        description=_description_text(),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "-s",
        "--scores",
        dest="scores",
        required=True,
        nargs="+",
        help="<Required> List of scorefile paths. Use a wildcard (*) to select multiple files.",
    )
    parser.add_argument(
        "-o",
        "--outdir",
        dest="outdir",
        required=True,
        default="scores/",
        help="<Required> Output directory to store aggregated files",
    )
    parser.add_argument(
        "--split",
        dest="split",
        required=True,
        action=argparse.BooleanOptionalAction,
        help="<Required> Make one aggregated file per sampleset",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        dest="verbose",
        action="store_true",
        help="<Optional> Extra logging information",
    )
    return parser.parse_args(args)


if __name__ == "__main__":
    run_aggregate()
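As an aside, functools.reduce(operator.add, pgs) folds the score list left to right, so the aggregation above is equivalent to ((pgs[0] + pgs[1]) + pgs[2]) + ... with one PolygenicScore as the result. A minimal sketch of the same pattern using toy pandas dataframes (hypothetical data, standing in for split per-chromosome score tables):

import functools
import operator

import pandas as pd

# two toy score tables sharing an index, standing in for split scorefiles
chr21 = pd.DataFrame({"PGS_SUM": [0.1, 0.2]}, index=["HG00096", "HG00097"])
chr22 = pd.DataFrame({"PGS_SUM": [0.3, 0.4]}, index=["HG00096", "HG00097"])

# reduce folds the list with +: equivalent to chr21 + chr22 here;
# pandas aligns on the index and sums element-wise
total = functools.reduce(operator.add, [chr21, chr22])
print(total["PGS_SUM"].tolist())  # HG00096 -> 0.4, HG00097 -> 0.6 (approximately)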
pgscatalog.calcapp tests (+67)

import itertools
from unittest.mock import patch

from pgscatalog.calcapp.aggregate_cli import run_aggregate

import pytest
import pandas as pd


@pytest.fixture(scope="module")
def scorefiles(request):
    return [str(x) for x in list(request.path.parent.glob("testdata/*.zst"))]


def test_split_aggregate(tmp_path_factory, scorefiles):
    """Test aggregating HGDP PGS001229"""
    outdir = tmp_path_factory.mktemp("outdir")

    args = [
        ("pgscatalog-aggregate", "-s", *scorefiles, "--outdir", str(outdir), "--split")
    ]
    flargs = list(itertools.chain(*args))

    with patch("sys.argv", flargs):
        run_aggregate()

    outf = list(outdir.glob("*.txt.gz"))
    assert [x.name for x in outf] == ["hgdp_pgs.txt.gz"]
    outdf = pd.read_csv(outf[0], sep="\t")
    assert list(outdf.columns) == [
        "sampleset",
        "IID",
        "DENOM",
        "PGS001229_hmPOS_GRCh38_SUM",
    ]
    assert outdf.shape == (929, 4)


def test_nosplit_aggregate(tmp_path_factory, scorefiles):
    """Test aggregating HGDP PGS001229 without splitting per sampleset"""
    outdir = tmp_path_factory.mktemp("outdir")

    args = [
        (
            "pgscatalog-aggregate",
            "-s",
            *scorefiles,
            "--outdir",
            str(outdir),
            "--no-split",
        )
    ]
    flargs = list(itertools.chain(*args))

    with patch("sys.argv", flargs):
        run_aggregate()

    outf = list(outdir.glob("*.txt.gz"))
    assert [x.name for x in outf] == ["aggregated_scores.txt.gz"]
    outdf = pd.read_csv(outf[0], sep="\t")
    assert list(outdf.columns) == [
        "sampleset",
        "IID",
        "DENOM",
        "PGS001229_hmPOS_GRCh38_SUM",
    ]
    assert outdf.shape == (929, 4)
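The two tests can pass either --split or --no-split because the flag is declared with argparse.BooleanOptionalAction, which auto-generates the negated form. A minimal standalone sketch of that behaviour (toy parser, not the package's CLI):

import argparse

parser = argparse.ArgumentParser()
# BooleanOptionalAction (Python 3.9+) also registers a --no-split variant
parser.add_argument("--split", required=True, action=argparse.BooleanOptionalAction)

print(parser.parse_args(["--split"]).split)     # True
print(parser.parse_args(["--no-split"]).split)  # False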
23 binary files added (testdata/*.zst score fixtures; not shown).

pgscatalog.calclib/src/pgscatalog/calclib/polygenicscore.py (+97, −52)

@@ -1,31 +1,36 @@
 import pathlib
 
 import pandas as pd
-import reprlib
 
 
 class PolygenicScore:
     """Represents the output of plink2 --score written to a file
 
     >>> from ._config import Config
     >>> score1 = Config.ROOT_DIR / "tests" / "cineca_22_additive_0.sscore.zst"
-    >>> pgs1 = PolygenicScore(sampleset="test", path=score1)  # doctest: +ELLIPSIS
+    >>> pgs1 = PolygenicScore(path=score1)  # doctest: +ELLIPSIS
     >>> pgs1
-    PolygenicScore(sampleset='test', path=PosixPath('.../cineca_22_additive_0.sscore.zst'), df=None)
-    >>> pgs2 = PolygenicScore(sampleset="test", path=score1)
-    >>> pgs1.read().to_dict()  # doctest: +ELLIPSIS
-    {'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 0.54502, ('test', 'HG00097'): 0.674401, ('test', 'HG00099'): 0.63727, ('test', 'HG00100'): 0.863944, ...}}
+    PolygenicScore(sampleset='cineca', path=PosixPath('.../cineca_22_additive_0.sscore.zst'))
+    >>> pgs2 = PolygenicScore(path=score1)
 
     It's often helpful to combine PGS that were split per chromosome or by effect type:
 
     >>> aggregated_score = pgs1 + pgs2
     >>> aggregated_score  # doctest: +ELLIPSIS
-    PolygenicScore(sampleset='test', path=None, df={'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 1.09004, ('test', 'HG00097'): 1.348802, ('test', 'HG00099'): 1.27454, ('test', 'HG00100'): 1.727888, ...}})
+    PolygenicScore(sampleset='cineca', path=None)
+
+    The backing dataframe is loaded lazily in chunks:
+
+    >>> for chunk in aggregated_score:
+    ...     chunk.to_dict()
+    ...     break
+    {'DENOM': {('cineca', 'HG00096'): 3128, ...}, 'PGS001229_22_SUM': {('cineca', 'HG00096'): 1.09004, ...}}
+
 
     Once a score has been fully aggregated it can be helpful to recalculate an average:
 
     >>> aggregated_score.average().to_dict()  # doctest: +ELLIPSIS
-    {'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 1.09004, ...}, 'PGS001229_22_AVG': {('test', 'HG00096'): 0.000348...
+    {'DENOM': ...}, 'PGS001229_22_SUM': {('cineca', 'HG00096'): 1.09004, ...}, 'PGS001229_22_AVG': {('cineca', 'HG00096'): 0.000348...
 
     Scores can be written to a TSV file:
 
@@ -40,10 +45,10 @@ class PolygenicScore:
     >>> splitoutd = tempfile.mkdtemp()
     >>> aggregated_score.write(splitoutd, split=True)
     >>> sorted(os.listdir(splitoutd), key = lambda x: x.split("_")[0])
-    ['test_pgs.txt.gz']
+    ['cineca_pgs.txt.gz']
     """
 
-    def __init__(self, *, sampleset, path=None, df=None):
+    def __init__(self, *, path=None, df=None, sampleset=None):
         match (path, df):
             case (None, None):
                 raise ValueError("init with path or df")
@@ -52,62 +57,102 @@ def __init__(self, *, sampleset, path=None, df=None):
             case _:
                 pass
 
-        self.path = path
-        self.df = df
-        self.sampleset = sampleset
+        try:
+            self.path = pathlib.Path(path)
+        except TypeError:
+            self.path = None
+
+        if sampleset is None:
+            self.sampleset = self.path.stem.split("_")[0]
+        else:
+            self.sampleset = sampleset
+
+        self._chunksize = 50000
+
+        if df is not None:
+            # _bigdf is an in-memory pandas dataframe
+            self._bigdf = df
+        else:
+            self._bigdf = None
+        self._df = None
 
     def __repr__(self):
-        if self.df is not None:
-            df = reprlib.repr(self.df.to_dict())
+        return f"{type(self).__name__}(sampleset={repr(self.sampleset)}, path={repr(self.path)})"
+
+    def __iter__(self):
+        yield from self.df
+
+    def __add__(self, other):
+        if isinstance(other, PolygenicScore):
+            dfs = []
+            for df1, df2 in zip(self, other, strict=True):
+                sumdf = df1.add(df2, fill_value=0)
+                dfs.append(sumdf)
+            return PolygenicScore(sampleset=self.sampleset, df=pd.concat(dfs, axis=0))
         else:
-            df = reprlib.repr(None)
+            return NotImplemented
 
-        return f"{type(self).__name__}(sampleset={repr(self.sampleset)}, path={repr(self.path)}, df={df})"
+    @property
+    def df(self):
+        if self.path is not None:
+            self._df = self.lazy_read()
+        elif self._bigdf is not None:
+            # wrap the in-memory dataframe in a single-item generator
+            self._df = (x for x in [self._bigdf])
+        return self._df
 
-    def read(self):
-        """Read a PGS file as a pandas dataframe"""
-        if self.df is None:
-            df = (
-                pd.read_table(self.path)
-                .assign(sampleset=self.sampleset)
-                .set_index(["sampleset", "#IID"])
-            )
+    def lazy_read(self):
+        """Lazily read a PGS in chunks"""
+        if self.path is None:
+            raise ValueError("Missing path")
+
+        for chunk in pd.read_csv(self.path, chunksize=self._chunksize, sep="\t"):
+            df = chunk.assign(sampleset=self.sampleset).set_index(["sampleset", "#IID"])
 
             df.index.names = ["sampleset", "IID"]
             df = df[_select_agg_cols(df.columns)]
-            self.df = df
-        return self.df
+            yield df
 
-    def average(self):
-        avgs = self.df.loc[:, self.df.columns.str.endswith("_SUM")].divide(
-            self.df["DENOM"], axis=0
+    def read(self):
+        """Eagerly load a PGS into a pandas dataframe"""
+        if self.path is None:
+            raise ValueError("Missing path")
+
+        df = (
+            pd.read_csv(self.path, sep="\t")
+            .assign(sampleset=self.sampleset)
+            .set_index(["sampleset", "#IID"])
         )
-        avgs.columns = avgs.columns.str.replace("_SUM", "_AVG")
-        self.df = pd.concat([self.df, avgs], axis=1)
-        return self.df
+
+        df.index.names = ["sampleset", "IID"]
+        df = df[_select_agg_cols(df.columns)]
+        return df
 
     def write(self, outdir, split=False):
-        """Write a PGS to a compressed TSV"""
+        """Write PGS to a compressed TSV"""
         outdir = pathlib.Path(outdir)
-        if split:
-            for sampleset, group in self.df.groupby("sampleset"):
-                fout = outdir / f"{sampleset}_pgs.txt.gz"
-                group.to_csv(fout, sep="\t", compression="gzip")
-        else:
-            fout = outdir / "aggregated_scores.txt.gz"
-            self.df.to_csv(fout, sep="\t", compression="gzip")
+        for chunk in self:
+            if split:
+                for sampleset, group in chunk.groupby("sampleset"):
+                    fout = outdir / f"{sampleset}_pgs.txt.gz"
+                    group.to_csv(fout, sep="\t", compression="gzip", mode="a")
+            else:
+                fout = outdir / "aggregated_scores.txt.gz"
+                chunk.to_csv(fout, sep="\t", compression="gzip", mode="a")
 
-    def __add__(self, other):
-        if isinstance(other, PolygenicScore):
-            if self.sampleset != other.sampleset:
-                raise ValueError("Can't add PolygenicScore with different samplesets")
-
-            df1 = self.read()
-            df2 = other.read()
-            sumdf = df1.add(df2, fill_value=0)
-            return PolygenicScore(sampleset=self.sampleset, df=sumdf)
-        else:
-            return NotImplemented
+    def average(self):
+        """Recalculate average.
+
+        This is an eager operation, and immediately returns a dataframe
+        """
+        chunk_list = []
+        for chunk in self:
+            avgs = chunk.loc[:, chunk.columns.str.endswith("_SUM")].divide(
+                chunk["DENOM"], axis=0
+            )
+            avgs.columns = avgs.columns.str.replace("_SUM", "_AVG")
+            chunk_list.append(pd.concat([chunk, avgs], axis=1))
        return pd.concat(chunk_list, axis=0)
 
 
 def _select_agg_cols(cols):
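For context, the new lazy_read generator streams each .sscore table through pandas' chunksize interface, so only one chunk is held in memory at a time. A minimal standalone sketch of the same pattern outside the class (hypothetical path and score column; pandas infers zstd or gzip compression from the file extension):

import pandas as pd

def lazy_read(path, sampleset, chunksize=50000):
    """Yield indexed chunks of a tab-separated plink2 .sscore table."""
    for chunk in pd.read_csv(path, chunksize=chunksize, sep="\t"):
        # label each chunk with its sampleset, matching the class above
        df = chunk.assign(sampleset=sampleset).set_index(["sampleset", "#IID"])
        df.index.names = ["sampleset", "IID"]
        yield df

# hypothetical usage: a constant-memory column sum across all chunks
# total = sum(
#     chunk["PGS001229_22_SUM"].sum()
#     for chunk in lazy_read("cineca_22.sscore.zst", "cineca")
# )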
