1
+ import logging
1
2
import pathlib
3
+ from collections import namedtuple
2
4
3
5
import pandas as pd
4
6
7
+ from .ancestry .tools import choose_pval_threshold , compare_ancestry
8
+ from .principalcomponents import PopulationType
9
+ from .ancestry import read
10
+
11
+
12
+ logger = logging .getLogger (__name__ )
13
+
14
+
15
class AggregatedPGS:
    """A PGS that's been aggregated and melted, and may contain multiple samplesets

    >>> from ._config import Config
    >>> score_path = Config.ROOT_DIR / "tests" / "aggregated_scores.txt.gz"
    >>> AggregatedPGS(path=score_path, target_name="hgdp")
    AggregatedPGS(path=PosixPath('.../pgscatalog.calclib/tests/aggregated_scores.txt.gz'))

    """

    def __init__(self, *, target_name, df=None, path=None):
        """Set up an aggregated PGS from an in-memory dataframe or a file path.

        :param target_name: sampleset label of the target cohort in the score index
        :param df: optional already-loaded aggregated score dataframe
        :param path: optional path to an aggregated score file (loaded lazily)
        :raises TypeError: if neither df nor path is provided
        """
        if df is None and path is None:
            raise TypeError("df or path is required")

        self._path = path
        # bug fix: a caller-supplied df was previously discarded (self._df was
        # always initialised to None, forcing a read from self._path on access)
        self._df = df
        self._target_name = target_name

    @property
    def path(self):
        # May be None when the object was constructed from a dataframe
        return self._path

    @property
    def target_name(self):
        # Sampleset label used to select target rows from the score index
        return self._target_name

    @property
    def df(self):
        # Lazy load: only read from disk on first access when no df was given
        if self._df is None:
            self._df = read.read_pgs(self._path)
        return self._df

    def __repr__(self):
        return f"{type(self).__name__}(path={repr(self.path)})"

    def _check_overlap(self, ref_pc, target_pc):
        """Before adjusting, there should be perfect target sample overlap

        Every sample with PCA data (reference and target) must also have PGS
        data, otherwise the ancestry adjustment would silently drop samples.

        :raises ValueError: if PCA samples are missing from the PGS data
        """
        # index level 1 of the PC dataframes holds sample IDs (IID)
        pca_ref_samples = set(ref_pc.df.index.get_level_values(1))
        pca_target_samples = set(target_pc.df.index.get_level_values(1))
        score_ref_samples = set(self.df.loc["reference"].index)
        score_target_samples = set(self.df.loc[self.target_name].index)

        if not pca_ref_samples.issubset(score_ref_samples):
            logger.critical(
                "Error: PGS data missing for reference samples with PCA data"
            )
            raise ValueError("PGS data missing for reference samples with PCA data")

        if not pca_target_samples.issubset(score_target_samples):
            logger.critical("Error: PGS data missing for target samples with PCA data.")
            raise ValueError("PGS data missing for target samples with PCA data")

    def adjust(self, *, ref_pc, target_pc, **kwargs):
        """Adjust aggregated scores for population structure using PCA data.

        :param ref_pc: PrincipalComponents with pop_type PopulationType.REFERENCE
        :param target_pc: PrincipalComponents with pop_type PopulationType.TARGET
        :param kwargs: optional ancestry settings (method_compare, pThreshold)
        :raises ValueError: on bad pop_type or missing sample overlap

        >>> from ._config import Config
        >>> ref_pc = PrincipalComponents(pcs_path=[Config.ROOT_DIR / "tests" / "ref.pcs"], dataset="reference", psam_path=Config.ROOT_DIR / "tests" / "ref.psam", pop_type=PopulationType.REFERENCE)
        >>> target_pcs = PrincipalComponents(pcs_path=Config.ROOT_DIR / "tests" / "target.pcs", dataset="target", pop_type=PopulationType.TARGET)
        >>> score_path = Config.ROOT_DIR / "tests" / "aggregated_scores.txt.gz"
        >>> AggregatedPGS(path=score_path, target_name="hgdp").adjust(ref_pc=ref_pc, target_pc=target_pcs)
        """
        if ref_pc.pop_type != PopulationType.REFERENCE:
            raise ValueError("ref_pc argument has bad pop_type")

        if target_pc.pop_type != PopulationType.TARGET:
            raise ValueError("target_pc argument has bad pop_type")

        self._check_overlap(ref_pc=ref_pc, target_pc=target_pc)

        # join pgs + pca data
        target_df = target_pc.df.join(self.df.loc[self.target_name], on="IID")
        reference_df = ref_pc.df.join(self.df.loc["reference"], on="IID")

        # set up arguments in the namedtuple shape the ancestry tools expect
        ancestry_args = namedtuple("ancestry_args", ["method_compare", "pThreshold"])
        args = ancestry_args(
            kwargs.get("method_compare", "RandomForest"), kwargs.get("pThreshold", None)
        )
        assignment_threshold_p = choose_pval_threshold(args)

        # TODO: bork — compare results are computed but not yet returned
        ancestry_ref, ancestry_target, compare_info = compare_ancestry(
            ref_df=reference_df,
            ref_pop_col=ref_pc.poplabel,
            ref_train_col="Unrelated",
            target_df=target_df,
            n_pcs=ref_pc.npcs_popcomp,
            method=args.method_compare,
            p_threshold=assignment_threshold_p,
        )
106
+
5
107
6
108
class PolygenicScore :
7
109
"""Represents the output of plink2 --score written to a file
8
110
9
111
>>> from ._config import Config
112
+ >>> import reprlib
10
113
>>> score1 = Config.ROOT_DIR / "tests" / "cineca_22_additive_0.sscore.zst"
11
114
>>> pgs1 = PolygenicScore(sampleset="test", path=score1) # doctest: +ELLIPSIS
12
115
>>> pgs1
13
116
PolygenicScore(sampleset='test', path=PosixPath('.../cineca_22_additive_0.sscore.zst'))
14
117
>>> pgs2 = PolygenicScore(sampleset="test", path=score1)
15
- >>> pgs1.read().to_dict() # doctest: +ELLIPSIS
16
- {'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 0.54502, ('test', 'HG00097'): 0.674401, ('test', 'HG00099'): 0.63727, ('test', 'HG00100'): 0.863944, ...}}
118
+ >>> reprlib.repr( pgs1.read().to_dict()) # doctest: +ELLIPSIS
119
+ " {'DENOM': {('test', 'HG00096'): 1564, ('test', 'HG00097'): 1564, ('test', 'HG00099'): 1564, ('test', 'HG00100'): 1564, ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 0.54502, ('test', 'HG00097'): 0.674401, ('test', 'HG00099'): 0.63727, ('test', 'HG00100'): 0.863944, ...}}"
17
120
18
121
It's often helpful to combine PGS that were split per chromosome or by effect type:
19
122
@@ -23,8 +126,8 @@ class PolygenicScore:
23
126
24
127
Once a score has been fully aggregated it can be helpful to recalculate an average:
25
128
26
- >>> aggregated_score.average().to_dict() # doctest: +ELLIPSIS
27
- {'DENOM': ...}, 'PGS001229_22_SUM ': {('test', 'HG00096'): 1.09004, ...}, 'PGS001229_22_AVG': { ('test', 'HG00096 '): 0.000348 ...
129
+ >>> reprlib.repr( aggregated_score.average().to_dict() ) # doctest: +ELLIPSIS
130
+ " {'DENOM': {('test', 'HG00096'): 3128, ('test', 'HG00097'): 3128, ('test', 'HG00099'): 3128, ('test', 'HG00100'): 3128, ...}, 'PGS001229_22_AVG ': {('test', 'HG00096'): 0.0003484782608695652, ('test', 'HG00097'): 0.00043120268542199493, ('test', 'HG00099'): 0.0004074616368286445, ('test', 'HG00100 '): 0.0005523938618925831, ...}}"
28
131
29
132
Scores can be written to a TSV file:
30
133
@@ -100,7 +203,9 @@ def lazy_read(self):
100
203
if self .path is None :
101
204
raise ValueError ("Missing path" )
102
205
103
- for chunk in pd .read_csv (self .path , chunksize = self ._chunksize , sep = "\t " ):
206
+ for chunk in pd .read_csv (
207
+ self .path , chunksize = self ._chunksize , sep = "\t " , converters = {"IID" : str }
208
+ ):
104
209
df = chunk .assign (sampleset = self .sampleset ).set_index (["sampleset" , "#IID" ])
105
210
106
211
df .index .names = ["sampleset" , "IID" ]
@@ -113,7 +218,7 @@ def read(self):
113
218
raise ValueError ("Missing path" )
114
219
115
220
df = (
116
- pd .read_csv (self .path , sep = "\t " )
221
+ pd .read_csv (self .path , sep = "\t " , converters = { "IID" : str } )
117
222
.assign (sampleset = self .sampleset )
118
223
.set_index (["sampleset" , "#IID" ])
119
224
)
@@ -144,13 +249,19 @@ def average(self):
144
249
"""
145
250
chunk_list = []
146
251
for chunk in self :
147
- avgs = chunk .loc [:, chunk . columns . str . endswith ( "_SUM" )]. divide (
148
- chunk [ " DENOM" ] , axis = 0
149
- )
252
+ avgs = chunk .filter ( regex = "SUM$" )
253
+ avgs = avgs . divide ( chunk . DENOM , axis = 0 )
254
+ avgs . insert ( 0 , "DENOM" , chunk . DENOM )
150
255
avgs .columns = avgs .columns .str .replace ("_SUM" , "_AVG" )
151
- chunk_list .append (pd . concat ([ chunk , avgs ], axis = 1 ) )
256
+ chunk_list .append (avgs )
152
257
return pd .concat (chunk_list , axis = 0 )
153
258
259
+ def melt (self ):
260
+ """Melt dataframe from wide format to long format"""
261
+ sum_df = _melt (pd .concat (self .df , axis = 0 ), value_name = "SUM" )
262
+ avg_df = _melt (self .average (), value_name = "AVG" )
263
+ return pd .concat ([sum_df , avg_df .AVG ], axis = 1 )
264
+
154
265
155
266
def _select_agg_cols (cols ):
156
267
"""Select aggregatable columns"""
@@ -163,7 +274,6 @@ def _select_agg_cols(cols):
163
274
164
275
165
276
def _melt (df , value_name ):
166
- """Melt the score dataframe from wide format to long format"""
167
277
df = df .melt (
168
278
id_vars = ["DENOM" ],
169
279
value_name = value_name ,
0 commit comments