PGScatalog · nebfield · Sep 24, 2024 · Sep 9, 2024 · Sep 13, 2024 · Sep 13, 2024
diff --git a/pgscatalog.core/src/pgscatalog/core/__init__.py b/pgscatalog.core/src/pgscatalog/core/__init__.py
@@ -10,7 +10,6 @@
     ScoringFiles,
     ScoringFile,
     NormalisedScoringFile,
-    ScoreVariant,
     GenomeBuild,
     TargetVariants,
     TargetVariant,
@@ -31,7 +30,6 @@
 __all__ = [
     "ScoringFiles",
     "ScoringFile",
-    "ScoreVariant",
     "Config",
     "GenomeBuild",
     "CatalogQuery",

diff --git a/pgscatalog.core/src/pgscatalog/core/cli/_combine.py b/pgscatalog.core/src/pgscatalog/core/cli/_combine.py
@@ -2,7 +2,6 @@
 standardised format, combining them, and calculating some statistics. Only really
 useful for the CLI, not good for importing elsewhere."""
 
-import collections
 import csv
 import functools
 import gzip
@@ -13,13 +12,6 @@
 logger = logging.getLogger(__name__)
 
 
-def get_variant_log(batch):
-    # these statistics can only be generated while iterating through variants
-    n_variants = collections.Counter("n_variants" for item in batch)
-    hm_source = collections.Counter(getattr(item, "hm_source") for item in batch)
-    return n_variants + hm_source
-
-
 class DataWriter:
     def __init__(self, filename):
         self.filename = filename

diff --git a/pgscatalog.core/src/pgscatalog/core/cli/combine_cli.py b/pgscatalog.core/src/pgscatalog/core/cli/combine_cli.py
@@ -1,14 +1,16 @@
 import argparse
-import json
 import logging
 import pathlib
 import sys
 import textwrap
+from typing import Optional
 
 from tqdm import tqdm
-from ..lib import GenomeBuild, ScoringFile, ScoreVariant, EffectTypeError
 
-from ._combine import get_variant_log, TextFileWriter
+from ..lib.models import ScoreLog, ScoreLogs, ScoreVariant
+from ..lib import GenomeBuild, ScoringFile, EffectTypeError
+
+from ._combine import TextFileWriter
 
 logger = logging.getLogger(__name__)
 
@@ -40,6 +42,12 @@ def run():
     target_build = GenomeBuild.from_string(args.target_build)
 
     for x in scoring_files:
+        if x.genome_build is None and target_build is not None:
+            raise ValueError(
+                f"Can't combine files with missing build in "
+                f"header when requesting {target_build=}"
+            )
+
         if x.genome_build != target_build and not args.liftover:
             raise ValueError(
                 f"Can't combine scoring file with genome build {x.genome_build!r} when {target_build=} without --liftover"
@@ -61,10 +69,10 @@ def run():
         liftover_kwargs = {"liftover": False}
 
     n_finished = 0
-    good_scores = []
-
     for scorefile in tqdm(scoring_files, total=len(scoring_files)):
         logger.info(f"Processing {scorefile.pgs_id}")
+        normalised_score: Optional[list[ScoreVariant]] = None
+        is_compatible = True
         try:
             normalised_score = list(
                 scorefile.normalise(
@@ -77,20 +85,27 @@ def run():
             logger.warning(
                 f"Unsupported non-additive effect types in {scorefile=}, skipping"
             )
+            is_compatible = False
             continue
         else:
             # TODO: go back to parallel execution + write to multiple files
             writer = TextFileWriter(compress=compress_output, filename=out_path)
 
             # model_dump returns a dict with a subset of keys
             dumped_variants = (
-                x.model_dump(include=ScoreVariant.output_fields)
+                x.model_dump(include=set(ScoreVariant.output_fields))
                 for x in normalised_score
             )
             writer.write(dumped_variants)
-            variant_log.append(get_variant_log(normalised_score))
             n_finished += 1
-            good_scores.append(scorefile)
+        finally:
+            variant_log.append(
+                ScoreLog(
+                    header=scorefile.header,
+                    variants=normalised_score,
+                    compatible_effect_type=is_compatible,
+                )
+            )
 
     if n_finished == 0:
         raise ValueError(
@@ -100,14 +115,10 @@ def run():
     if n_finished != len(scoring_files):
         logger.warning(f"{len(scoring_files) - n_finished} scoring files were skipped")
 
-    score_log = []
-    for sf, log in zip(good_scores, variant_log, strict=True):
-        score_log.append(sf.get_log(variant_log=log))
-
     log_out_path = pathlib.Path(args.outfile).parent / args.logfile
     with open(log_out_path, "w") as f:
         logger.info(f"Writing log to {f.name}")
-        json.dump(score_log, f, indent=4)
+        f.write(ScoreLogs(logs=variant_log).model_dump_json())
 
     logger.info("Combining complete")
 

diff --git a/pgscatalog.core/src/pgscatalog/core/lib/__init__.py b/pgscatalog.core/src/pgscatalog/core/lib/__init__.py
@@ -3,7 +3,6 @@
 from .genomebuild import GenomeBuild
 from .catalogapi import ScoreQueryResult, CatalogQuery, CatalogCategory
 from .scorefiles import ScoringFiles, ScoringFile, NormalisedScoringFile
-from .scorevariant import ScoreVariant
 from .targetvariants import TargetVariants, TargetVariant, TargetType
 from ._relabel import RelabelArgs, relabel, relabel_write
 from ._sortpaths import effect_type_keyfunc, chrom_keyfunc
@@ -51,7 +50,6 @@
     "SamplesheetFormatError",
     "ScoringFiles",
     "ScoringFile",
-    "ScoreVariant",
     "Config",
     "GenomeBuild",
     "CatalogQuery",

diff --git a/pgscatalog.core/src/pgscatalog/core/lib/_normalise.py b/pgscatalog.core/src/pgscatalog/core/lib/_normalise.py
@@ -32,15 +32,15 @@ def normalise(
     if liftover:
         variants = lift(
             scoring_file=scoring_file,
-            harmonised=scoring_file.harmonised,
+            harmonised=scoring_file.is_harmonised,
             current_build=scoring_file.genome_build,
             target_build=target_build,
             chain_dir=chain_dir,
         )
     else:
         variants = scoring_file.variants
 
-    variants = remap_harmonised(variants, scoring_file.harmonised, target_build)
+    variants = remap_harmonised(variants, scoring_file.is_harmonised, target_build)
 
     if drop_missing:
         variants = drop_hla(variants)
@@ -100,7 +100,7 @@ def check_duplicates(variants):
 def drop_hla(variants):
     """Drop HLA alleles from a list of ScoreVariants
 
-    >>> from .scorevariant import ScoreVariant
+    >>> from .models import ScoreVariant
     >>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "chr_name": "1", "chr_position": 1})
     >>> list(drop_hla([variant])) # doctest: +ELLIPSIS
     [ScoreVariant(..., effect_allele=Allele(allele='A', is_snp=True), ...
@@ -127,7 +127,7 @@ def drop_hla(variants):
 def assign_other_allele(variants):
     """Check if there's more than one possible other allele, remove if true
 
-    >>> from .scorevariant import ScoreVariant
+    >>> from .models import ScoreVariant
     >>> variant = ScoreVariant(**{"chr_position": 1, "rsID": None, "chr_name": "1", "effect_allele": "A", "effect_weight": 5, "other_allele": "A", "row_nr": 0, "accession": "test"})
     >>> list(assign_other_allele([variant]))[0] # doctest: +ELLIPSIS
     ScoreVariant(..., effect_allele=Allele(allele='A', is_snp=True), other_allele=Allele(allele='A', is_snp=True), ...)
@@ -154,7 +154,7 @@ def remap_harmonised(variants, harmonised, target_build):
     In this case chr_name, chr_position, and other allele are missing.
     Perhaps authors submitted rsID and effect allele originally:
 
-    >>> from .scorevariant import ScoreVariant
+    >>> from .models import ScoreVariant
     >>> variant = ScoreVariant(**{"chr_position": 1, "rsID": None, "chr_name": "2", "effect_allele": "A", "effect_weight": 5, "accession": "test", "hm_chr": "1", "hm_pos": 100, "hm_rsID": "testrsid", "hm_inferOtherAllele": "A", "row_nr": 0})
     >>> variant
     ScoreVariant(..., effect_allele=Allele(allele='A', is_snp=True), other_allele=None, ...
@@ -184,7 +184,7 @@ def check_effect_allele(variants, drop_missing=False):
     """
     Odd effect allele:
 
-    >>> from .scorevariant import ScoreVariant
+    >>> from .models import ScoreVariant
     >>> variant = ScoreVariant(**{"effect_allele": "Z", "effect_weight": 5, "accession": "test", "row_nr": 0, "chr_name": "1", "chr_position": 1})
     >>> list(check_effect_allele([variant], drop_missing=True)) # doctest: +ELLIPSIS
     []

diff --git a/pgscatalog.core/src/pgscatalog/core/lib/_read.py b/pgscatalog.core/src/pgscatalog/core/lib/_read.py
@@ -2,11 +2,10 @@
 These functions aren't really meant to be imported outside corelib"""
 
 import logging
-import pathlib
 
 from xopen import xopen
 
-from .scorevariant import ScoreVariant
+from .models import ScoreVariant
 
 logger = logging.getLogger(__name__)
 
@@ -37,23 +36,6 @@ def read_rows_lazy(
         row_nr += 1
 
 
-def generate_header_lines(f):
-    """Header lines in a PGS Catalog scoring file are structured like:
-
-    #pgs_id=PGS000348
-    #pgs_name=PRS_PrCa
-
-    Files can be big, so we want to only read header lines and stop immediately
-    """
-    for line in f:
-        if line.startswith("#"):
-            if "=" in line:
-                yield line.strip()
-        else:
-            # stop reading lines
-            break
-
-
 def get_columns(path):
     """Grab column labels from a PGS Catalog scoring file. line_no is useful to skip the header"""
     with xopen(path, mode="rt") as f:
@@ -79,17 +61,3 @@ def detect_wide(cols: list[str]) -> bool:
         return True
     else:
         return False
-
-
-def read_header(path: pathlib.Path):
-    """Parses the header of a PGS Catalog format scoring file into a dictionary"""
-    header = {}
-
-    with xopen(path, "rt") as f:
-        header_text = generate_header_lines(f)
-
-        for item in header_text:
-            key, value = item.split("=")
-            header[key[1:]] = value  # drop # character from key
-
-    return header