check log counts

nebfield · nebfield · commit d700be2ed186 · 2024-02-07T13:25:20.000Z
diff --git a/pgscatalog.matchlib/src/pgscatalog/matchlib/_match/label.py b/pgscatalog.matchlib/src/pgscatalog/matchlib/_match/label.py
@@ -302,7 +302,7 @@ def _label_flips(df: pl.LazyFrame, skip_flip: bool) -> pl.LazyFrame:
         )
     else:
         logger.debug("Not excluding flipped matches")
-        return df
+        return df.with_columns(match_IDs=pl.lit("NA"))
 
 
 def _label_filter(df: pl.LazyFrame, filter_IDs: list) -> pl.LazyFrame:
diff --git a/pgscatalog.matchlib/src/pgscatalog/matchlib/_match/log.py b/pgscatalog.matchlib/src/pgscatalog/matchlib/_match/log.py
@@ -21,53 +21,53 @@ def make_summary_log(
     filter_summary: pl.LazyFrame,
     dataset: str,
 ) -> pl.LazyFrame:
-    """Make an aggregated table"""
+    """Make an aggregated table that contains the best match candidate for each row in the original scoring file"""
     logger.debug("Aggregating best matches into a summary table")
+    cols = [
+        "dataset",
+        "accession",
+        "ambiguous",
+        "is_multiallelic",
+        "match_flipped",
+        "duplicate_best_match",
+        "duplicate_ID",
+        "match_IDs",
+        "match_status",
+    ]
+
     best_matches: pl.LazyFrame = match_candidates.filter(pl.col("best_match"))
+
     return (
         scorefile.join(best_matches, on=["row_nr", "accession"], how="outer")
-        .select(pl.exclude("^.*_right$"))
         .with_columns(
-            [
-                pl.col("match_status").fill_null(value="unmatched"),
-                pl.lit(dataset).alias("dataset"),
-            ]
+            pl.col("match_status").fill_null(value="unmatched"), dataset=pl.lit(dataset)
         )  # fill in unmatched variants
-        .groupby(
-            [
-                "dataset",
-                "accession",
-                "ambiguous",
-                "is_multiallelic",
-                "match_flipped",
-                "duplicate_best_match",
-                "duplicate_ID",
-                "match_IDs",
-                "match_status",
-            ]
-        )
-        .agg(pl.all().len())
+        .group_by(cols)
+        .count()
         .join(filter_summary, how="left", on="accession")
         .pipe(_prettify_summary)
     )
 
 
-def check_log_count(scorefile: pl.LazyFrame, summary_log: pl.LazyFrame) -> None:
+def check_log_count(scorefile: pl.LazyFrame, summary_log: pl.LazyFrame) -> bool:
     """Check aggregated counts vs original from scoring file"""
-    summary_count: pl.DataFrame = (
-        summary_log.groupby(pl.col("accession")).agg(pl.sum("count"))
-    ).collect()
+    summary_count: pl.LazyFrame = summary_log.group_by("accession").agg(pl.sum("count"))
+
     log_count: pl.DataFrame = (
-        scorefile.groupby("accession")
-        .agg(pl.all().len())
-        .collect()
+        scorefile.group_by("accession")
+        .count()
         .join(summary_count, on="accession")
+        .collect()
     )
 
-    assert (
-        log_count.get_column("count") == log_count.get_column("count_right")
-    ).all(), "Log doesn't match input scoring file"
-    logger.debug("Log matches input scoring file")
+    # joined columns: accession, count (scoring file), count_right (summary log)
+    for accession, score_count, logged_count in log_count.iter_rows():
+        if score_count != logged_count:
+            logger.critical("Variant log doesn't match input scoring file counts")
+            counts = f"log_count={logged_count!r} doesn't match {score_count=}"
+            raise ValueError(f"{accession} match failure {counts}")
+
+    return True
 
 
 def _prettify_summary(df: pl.LazyFrame) -> pl.LazyFrame:
@@ -118,10 +118,8 @@ def _prettify_log(df: pl.LazyFrame) -> pl.LazyFrame:
         "match_status",
         "dataset",
     ]
-    pretty_df = (
-        df.select(keep_cols)
-        .select(pl.exclude("^.*_right"))
-        .sort(["accession", "row_nr", "chr_name", "chr_position", "match_status"])
+    pretty_df = df.select(keep_cols).sort(
+        ["accession", "row_nr", "chr_name", "chr_position", "match_status"]
     )
     return pretty_df
 
diff --git a/pgscatalog.matchlib/src/pgscatalog/matchlib/matchresult.py b/pgscatalog.matchlib/src/pgscatalog/matchlib/matchresult.py
@@ -8,6 +8,7 @@
 from ._plinkframe import PlinkFrames
 from ._match.label import label_matches
 from ._match.filter import filter_scores
+from ._match.log import make_logs, check_log_count, make_summary_log
 
 logger = logging.getLogger(__name__)
 
@@ -136,6 +137,22 @@ class MatchResults(collections.abc.Sequence):
     >>> sum("additive" in f for f in scorefiles)
     19
     >>> assert len(scorefiles) == 21
+
+    An important part of matching variants is reporting a log:
+    >>> with scorefile as score_df:
+    ...     MatchResults(x).full_variant_log(score_df).fetch(2)
+    shape: (3, 23)
+    ┌────────┬───────────┬──────────┬────────────┬───┬────────────┬───────────┬────────────┬───────────┐
+    │ row_nr ┆ accession ┆ chr_name ┆ chr_positi ┆ … ┆ duplicate_ ┆ match_IDs ┆ match_stat ┆ dataset   │
+    │ ---    ┆ ---       ┆ ---      ┆ on         ┆   ┆ ID         ┆ ---       ┆ us         ┆ ---       │
+    │ u64    ┆ cat       ┆ cat      ┆ ---        ┆   ┆ ---        ┆ null      ┆ ---        ┆ cat       │
+    │        ┆           ┆          ┆ u64        ┆   ┆ bool       ┆           ┆ cat        ┆           │
+    ╞════════╪═══════════╪══════════╪════════════╪═══╪════════════╪═══════════╪════════════╪═══════════╡
+    │ null   ┆ null      ┆ null     ┆ null       ┆ … ┆ false      ┆ null      ┆ matched    ┆ goodmatch │
+    │ 0      ┆ PGS000002 ┆ 11       ┆ 69331418   ┆ … ┆ false      ┆ null      ┆ matched    ┆ goodmatch │
+    │ 77     ┆ PGS000002 ┆ 11       ┆ 69331418   ┆ … ┆ null       ┆ null      ┆ unmatched  ┆ goodmatch │
+    └────────┴───────────┴──────────┴────────────┴───┴────────────┴───────────┴────────────┴───────────┘
+
     """
 
     def __init__(self, *elements):
@@ -149,7 +166,14 @@ def __init__(self, *elements):
         self.dataset = self._elements[0].dataset
         # a df composed of all match result elements
         self.df = pl.scan_ipc(x.ipc_path for x in self._elements)
+        # a summary table containing match rates
+        self.filter_summary = None
+        # have match candidates in df been labelled?
         self._labelled = False
+        # have match candidates in df been filtered?
+        self._filtered = False
+        # does the input scoring file count match the variant log count?
+        self._log_OK = None
 
     def __len__(self):
         return len(self._elements)
@@ -189,24 +213,58 @@ def write_scorefiles(self, directory, score_df, split=False, **kwargs):
         if not self._labelled:
             self.df = self.label(**kwargs)
 
-        self.df, self.score_summary = filter_scores(
+        self.df, self.filter_summary = filter_scores(
             scorefile=score_df,
             matches=self.df,
             min_overlap=kwargs.get("min_overlap", 0.75),
             dataset=self.dataset,
         )
-        self.filtered = True
+        self._filtered = True
 
-        if self.score_summary.is_empty():
+        # a filter summary contains match rates for each accession, and a column
+        # indicating if the score passes the minimum matching threshold
+        if self.filter_summary.is_empty():
             # can happen if min_overlap = 0
             raise ZeroMatchesError(
                 "Error: no target variants match any variants in scoring files"
             )
 
+        # a summary log contains up to one variant (the best match) for each variant
+        # in the scoring file
+        self.summary_log = make_summary_log(
+            match_candidates=self.df,
+            dataset=self.dataset,
+            filter_summary=self.filter_summary.lazy(),
+            scorefile=score_df,
+        )
+
+        # double check log count vs scoring file variant count
+        self._log_OK = check_log_count(scorefile=score_df, summary_log=self.summary_log)
+
         plink = PlinkFrames.from_matchresult(self.df)
 
         for frame in plink:
             frame.write(directory=directory, split=split, dataset=self.dataset)
 
-    def full_variant_log(self):
-        raise NotImplementedError
+    def full_variant_log(self, score_df, **kwargs):
+        """Generate a log for each variant in a scoring file
+
+        Multiple match candidates may exist for each variant in the original file.
+        Describe each variant (one variant per row) with match metadata. Labels and
+        filters self.df first when needed (min_overlap kwarg defaults to 0.75)."""
+        if not self._labelled:
+            self.df = self.label(**kwargs)
+            self._labelled = True
+
+        if not self._filtered:
+            self.df, self.filter_summary = filter_scores(
+                scorefile=score_df,
+                matches=self.df,
+                min_overlap=kwargs.get("min_overlap", 0.75),
+                dataset=self.dataset,
+            )
+            self._filtered = True
+
+        return make_logs(
+            scorefile=score_df, dataset=self.dataset, match_candidates=self.df
+        )

Original file line number	Diff line number	Diff line change
`@@ -302,7 +302,7 @@ def _label_flips(df: pl.LazyFrame, skip_flip: bool) -> pl.LazyFrame:`
`302`	`302`	`)`
`303`	`303`	`else:`
`304`	`304`	`logger.debug("Not excluding flipped matches")`
`305`		`- return df`
	`305`	`+ return df.with_columns(match_IDs=pl.lit("NA"))`
`306`	`306`
`307`	`307`
`308`	`308`	`def _label_filter(df: pl.LazyFrame, filter_IDs: list) -> pl.LazyFrame:`