Skip to content

Commit 3647dc1

Browse files
authored
Merge branch 'dev' into ancestry
2 parents f356578 + 40b7efe commit 3647dc1

File tree

15 files changed

+187
-242
lines changed

15 files changed

+187
-242
lines changed

pgscatalog.calclib/poetry.lock

-193
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
import logging
2+
13
from .polygenicscore import PolygenicScore
24

5+
log_fmt = "%(name)s: %(asctime)s %(levelname)-8s %(message)s"
6+
logging.basicConfig(format=log_fmt, datefmt="%Y-%m-%d %H:%M:%S")
7+
logger = logging.getLogger(__name__)
8+
39
__all__ = ["PolygenicScore"]

pgscatalog.calclib/src/pgscatalog/calclib/polygenicscore.py

+13-16
Original file line numberDiff line numberDiff line change
@@ -2,35 +2,32 @@
22

33
import pandas as pd
44

5+
import reprlib
6+
7+
58

69
class PolygenicScore:
710
"""Represents the output of plink2 --score written to a file
811
912
>>> from ._config import Config
1013
>>> score1 = Config.ROOT_DIR / "tests" / "cineca_22_additive_0.sscore.zst"
11-
>>> pgs1 = PolygenicScore(path=score1) # doctest: +ELLIPSIS
14+
>>> pgs1 = PolygenicScore(sampleset="test", path=score1) # doctest: +ELLIPSIS
1215
>>> pgs1
13-
PolygenicScore(sampleset='cineca', path=PosixPath('.../cineca_22_additive_0.sscore.zst'))
14-
>>> pgs2 = PolygenicScore(path=score1)
16+
PolygenicScore(sampleset='test', path=PosixPath('.../cineca_22_additive_0.sscore.zst'), df=None)
17+
>>> pgs2 = PolygenicScore(sampleset="test", path=score1)
18+
>>> pgs1.read().to_dict() # doctest: +ELLIPSIS
19+
{'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 0.54502, ('test', 'HG00097'): 0.674401, ('test', 'HG00099'): 0.63727, ('test', 'HG00100'): 0.863944, ...}}
1520
1621
It's often helpful to combine PGS that were split per chromosome or by effect type:
1722
1823
>>> aggregated_score = pgs1 + pgs2
1924
>>> aggregated_score # doctest: +ELLIPSIS
20-
PolygenicScore(sampleset='cineca', path=None)
21-
22-
The backing dataframe is loaded lazily in chunks:
23-
24-
>>> for chunk in aggregated_score:
25-
... chunk.to_dict()
26-
... break
27-
{'DENOM': {('cineca', 'HG00096'): 3128, ...}, 'PGS001229_22_SUM': {('cineca', 'HG00096'): 1.09004, ...}}
28-
25+
PolygenicScore(sampleset='test', path=None, df={'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 1.09004, ('test', 'HG00097'): 1.348802, ('test', 'HG00099'): 1.27454, ('test', 'HG00100'): 1.727888, ...}})
2926
3027
Once a score has been fully aggregated it can be helpful to recalculate an average:
3128
3229
>>> aggregated_score.average().to_dict() # doctest: +ELLIPSIS
33-
{'DENOM': ...}, 'PGS001229_22_SUM': {('cineca', 'HG00096'): 1.09004, ...}, 'PGS001229_22_AVG': {('cineca', 'HG00096'): 0.000348...
30+
{'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 1.09004, ...}, 'PGS001229_22_AVG': {('test', 'HG00096'): 0.000348...
3431
3532
Scores can be written to a TSV file:
3633
@@ -45,10 +42,10 @@ class PolygenicScore:
4542
>>> splitoutd = tempfile.mkdtemp()
4643
>>> aggregated_score.write(splitoutd, split=True)
4744
>>> sorted(os.listdir(splitoutd), key = lambda x: x.split("_")[0])
48-
['cineca_pgs.txt.gz']
45+
['test_pgs.txt.gz']
4946
"""
5047

51-
def __init__(self, *, path=None, df=None, sampleset=None):
48+
def __init__(self, *, sampleset, path=None, df=None):
5249
match (path, df):
5350
case (None, None):
5451
raise ValueError("init with path or df")
@@ -167,7 +164,6 @@ def _select_agg_cols(cols):
167164
if (x.endswith("_SUM") and (x != "NAMED_ALLELE_DOSAGE_SUM")) or (x in keep_cols)
168165
]
169166

170-
171167
def _melt(df, value_name):
172168
"""Melt the score dataframe from wide format to long format"""
173169
df = df.melt(
@@ -179,3 +175,4 @@ def _melt(df, value_name):
179175
# e.g. PGS000822_SUM -> PGS000822
180176
df["PGS"] = df["PGS"].str.replace(f"_{value_name}", "")
181177
return df
178+

pgscatalog.combineapp/src/pgscatalog/combineapp/cli.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -15,26 +15,25 @@
1515

1616

1717
def run():
18-
logging.basicConfig(
19-
format="%(asctime)s %(name)s %(levelname)-8s %(message)s",
20-
level=logging.WARNING,
21-
datefmt="%Y-%m-%d %H:%M:%S",
22-
)
23-
2418
args = parse_args()
2519

2620
if args.verbose:
27-
logger.setLevel(logging.INFO)
21+
logging.getLogger("pgscatalog.corelib").setLevel(logging.DEBUG)
22+
logger.setLevel(logging.DEBUG)
23+
logger.debug("Verbose logging enabled")
2824

2925
out_path = pathlib.Path(args.outfile)
3026

3127
if out_path.exists():
32-
raise FileExistsError(f"{args.outfile}")
28+
logger.critical(f"Output file already exists: {args.outfile}")
29+
raise FileExistsError
3330

3431
match x := out_path.name:
3532
case _ if x.endswith("gz"):
33+
logger.debug("Compressing output with gzip")
3634
compress_output = True
3735
case _:
36+
logger.debug("Not compressing output")
3837
compress_output = False
3938

4039
paths = list(set(args.scorefiles)) # unique paths only
@@ -52,6 +51,7 @@ def run():
5251
if args.liftover:
5352
chain_dir = pathlib.Path(args.chain_dir)
5453
if not chain_dir.exists():
54+
logger.critical(f"Chain directory is missing: {chain_dir}")
5555
raise FileNotFoundError
5656

5757
liftover_kwargs = {
@@ -65,7 +65,7 @@ def run():
6565
with concurrent.futures.ThreadPoolExecutor() as executor:
6666
futures = []
6767
for scorefile in scoring_files:
68-
logger.info(f"Submitting {scorefile!r}")
68+
logger.info(f"Submitting {scorefile!r} for execution")
6969
futures.append(
7070
executor.submit(
7171
normalise,

pgscatalog.corelib/src/pgscatalog/corelib/__init__.py

+5
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import logging
2+
13
from ._config import Config
24
from .catalogapi import ScoreQueryResult, CatalogQuery, CatalogCategory
35
from .scorefiles import ScoringFiles, ScoringFile, NormalisedScoringFile
@@ -24,6 +26,9 @@
2426
SamplesheetFormatError,
2527
)
2628

29+
log_fmt = "%(name)s: %(asctime)s %(levelname)-8s %(message)s"
30+
logging.basicConfig(format=log_fmt, datefmt="%Y-%m-%d %H:%M:%S")
31+
logger = logging.getLogger(__name__)
2732

2833
__all__ = [
2934
"BasePGSException",

pgscatalog.corelib/src/pgscatalog/corelib/_download.py

+4
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ def score_download_failed(retry_state):
2727
raise ScoreChecksumError("All checksum retries failed") from e
2828
except Exception as download_exc:
2929
raise ScoreDownloadError("All download retries failed") from download_exc
30+
finally:
31+
logger.critical(f"Score download failed after all retries: {retry_state!r}")
3032

3133

3234
@tenacity.retry(
@@ -81,6 +83,7 @@ def ftp_fallback(retry_state):
8183
else:
8284
# no exceptions thrown, move the temporary file to the final output path
8385
os.rename(score_f.name, out_path)
86+
logger.info(f"FTP download OK, {out_path} checksum validation passed")
8487

8588

8689
@tenacity.retry(
@@ -124,3 +127,4 @@ def https_download(*, url, out_path, directory, overwrite):
124127
else:
125128
# no exceptions thrown, move the temporary file to the final output path
126129
os.rename(f.name, out_path)
130+
logger.info(f"HTTPS download OK, {out_path} checksum validation passed")

pgscatalog.corelib/src/pgscatalog/corelib/_normalise.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ def normalise(
2424
2. remap harmonised data, failed harmonisations get None'd
2525
3. log and optionally drop bad variants
2626
"""
27+
logger.info(
28+
f"Normalise parameters: {drop_missing=}, {liftover=}, {chain_dir=}, {target_build=}"
29+
)
30+
2731
if liftover:
2832
variants = lift(
2933
scoring_file=scoring_file,
@@ -169,23 +173,25 @@ def assign_effect_type(variants):
169173
170174
The most common type of effect type is additive:
171175
172-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "FALSE", "is_dominant": "FALSE"})
176+
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "False", "is_dominant": "False"})
173177
>>> list(assign_effect_type([variant])) # doctest: +ELLIPSIS
174178
[ScoreVariant(...,effect_type=EffectType.ADDITIVE,...)]
175-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "TRUE", "is_dominant": "FALSE"})
179+
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "True", "is_dominant": "False"})
176180
>>> list(assign_effect_type([variant])) # doctest: +ELLIPSIS
177181
[ScoreVariant(...,effect_type=EffectType.RECESSIVE,...)]
178-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "FALSE", "is_dominant": "TRUE"})
182+
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "False", "is_dominant": "True"})
179183
>>> list(assign_effect_type([variant])) # doctest: +ELLIPSIS
180184
[ScoreVariant(...,effect_type=EffectType.DOMINANT,...)]
185+
186+
is_recessive and is_dominant fields are parsed from strings to bools during __init__.
181187
"""
182188
for variant in variants:
183189
match (variant.is_recessive, variant.is_dominant):
184-
case (None, None) | ("FALSE", "FALSE"):
190+
case (None, None) | (False, False):
185191
pass # default value is additive, pass to break match and yield
186-
case ("FALSE", "TRUE"):
192+
case (False, True):
187193
variant.effect_type = EffectType.DOMINANT
188-
case ("TRUE", "FALSE"):
194+
case (True, False):
189195
variant.effect_type = EffectType.RECESSIVE
190196
case _:
191197
logger.critical(f"Bad effect type setting: {variant}")

pgscatalog.corelib/src/pgscatalog/corelib/catalogapi.py

+20-7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
""" Classes and functions related to the PGS Catalog API """
22
import enum
3+
import logging
34

45
import httpx
56
import tenacity
@@ -9,6 +10,9 @@
910
from ._config import Config
1011

1112

13+
logger = logging.getLogger(__name__)
14+
15+
1216
class CatalogCategory(enum.Enum):
1317
"""The three main categories in the PGS Catalog
1418
@@ -121,6 +125,7 @@ def infer_category(self):
121125
case _:
122126
raise InvalidAccessionError(f"Invalid accession: {accession!r}")
123127

128+
logger.debug(f"{accession=} {category=}")
124129
return category
125130

126131
def get_query_url(self):
@@ -153,14 +158,14 @@ def get_query_url(self):
153158
154159
Child traits terms aren't included by default. Only traits can have children.
155160
"""
161+
urls: list[str] | str = []
156162
match (self.category, self.accession):
157163
case CatalogCategory.TRAIT, str():
158164
child_flag: int = int(self.include_children)
159-
return f"{self._rest_url_root}/trait/{self.accession}?include_children={child_flag}"
165+
urls = f"{self._rest_url_root}/trait/{self.accession}?include_children={child_flag}"
160166
case CatalogCategory.SCORE, str():
161-
return [f"{self._rest_url_root}/score/search?pgs_ids={self.accession}"]
167+
urls = [f"{self._rest_url_root}/score/search?pgs_ids={self.accession}"]
162168
case CatalogCategory.SCORE, list():
163-
urls: list[str] = []
164169
for chunk in self._chunk_accessions():
165170
chunked_accession = ",".join(chunk)
166171
urls.append(
@@ -169,12 +174,14 @@ def get_query_url(self):
169174
)
170175
return urls
171176
case CatalogCategory.PUBLICATION, str():
172-
return f"{self._rest_url_root}/publication/{self.accession}"
177+
urls = f"{self._rest_url_root}/publication/{self.accession}"
173178
case _:
174179
raise ValueError(
175180
f"Invalid CatalogCategory and accession type: {self.category!r}, "
176181
f"type({self.accession!r})"
177182
)
183+
logger.debug(f"Resolved API query URL: {urls}")
184+
return urls
178185

179186
def _chunk_accessions(self):
180187
size = 50 # /rest/score/{pgs_id} limit when searching multiple IDs
@@ -323,10 +330,16 @@ def get_download_url(self, genome_build=None):
323330
"""
324331
match build := genome_build:
325332
case GenomeBuild() if build == GenomeBuild.GRCh37:
326-
return self.ftp_grch37_url
333+
url = self.ftp_grch37_url
327334
case GenomeBuild() if build == GenomeBuild.GRCh38:
328-
return self.ftp_grch38_url
335+
url = self.ftp_grch38_url
329336
case None:
330-
return self.ftp_url
337+
url = self.ftp_url
331338
case _:
332339
raise ValueError(f"Invalid genome build {build!r}")
340+
341+
logger.debug(
342+
f"Scoring file download URL for {self.pgs_id} with {build=}: {url}"
343+
)
344+
return url
345+

pgscatalog.corelib/src/pgscatalog/corelib/scorefiles.py

+33
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,8 @@ def __init__(self, identifier, target_build=None, query_result=None, **kwargs):
259259
self._directory = self.local_path.parent
260260

261261
def _init_from_accession(self, accession, target_build):
262+
logger.debug("Instantiating ScoringFile from accession")
263+
262264
match self._identifier:
263265
case ScoreQueryResult():
264266
# skip hitting the API unnecessarily
@@ -288,6 +290,8 @@ def _init_from_accession(self, accession, target_build):
288290
self.local_path = None
289291

290292
def _init_from_path(self, target_build=None):
293+
logger.debug(f"Instantiating ScoringFile from {self.local_path=}")
294+
291295
if target_build is not None:
292296
raise ValueError(
293297
"target_build must be None for local files. "
@@ -395,6 +399,9 @@ def download(self, directory, overwrite=False):
395399
self._directory = pathlib.Path(directory)
396400
fn = pathlib.Path(self.path).name
397401
out_path = self._directory / fn
402+
403+
logger.debug(f"Downloading {self.path} to {out_path}")
404+
398405
https_download(
399406
url=self.path,
400407
out_path=out_path,
@@ -467,6 +474,19 @@ def normalise(
467474
('rs78540526', '11', 69701882)
468475
469476
A :class:`LiftoverError` is only raised when many converted coordinates are missing.
477+
478+
Normalising converts the is_dominant and is_recessive optional fields in
479+
scoring files into an EffectType:
480+
481+
>>> testpath = Config.ROOT_DIR / "tests" / "PGS000802_hmPOS_GRCh37.txt"
482+
>>> variants = ScoringFile(testpath).normalise()
483+
>>> for i, x in enumerate(variants): # doctest: +ELLIPSIS
484+
... (x.is_dominant, x.is_recessive, x.effect_type)
485+
... if i == 2:
486+
... break
487+
(True, False, EffectType.DOMINANT)
488+
(False, True, EffectType.RECESSIVE)
489+
(True, False, EffectType.DOMINANT)
470490
"""
471491
yield from normalise(
472492
scoring_file=self,
@@ -478,6 +498,9 @@ def normalise(
478498

479499
def get_log(self, drop_missing=False, variant_log=None):
480500
"""Create a JSON log from a ScoringFile's header and variant rows."""
501+
502+
logger.debug(f"Creating JSON log for {self!r}")
503+
481504
log = {}
482505

483506
for attr in self._header.fields:
@@ -630,6 +653,7 @@ def __init__(self, *args, target_build=None, **kwargs):
630653
for arg in flargs:
631654
match arg:
632655
case ScoringFile() if arg.target_build == target_build:
656+
logger.info("ScoringFile build matches target build")
633657
scorefiles.append(arg)
634658
case ScoringFile() if arg.target_build != target_build:
635659
raise ValueError(
@@ -638,11 +662,18 @@ def __init__(self, *args, target_build=None, **kwargs):
638662
case _ if pathlib.Path(arg).is_file() and target_build is None:
639663
scorefiles.append(ScoringFile(arg))
640664
case _ if pathlib.Path(arg).is_file() and target_build is not None:
665+
logger.info(f"Local path: {arg}, no target build is OK")
666+
scorefiles.append(ScoringFile(arg))
667+
case _ if pathlib.Path(arg).is_file() and target_build is not None:
668+
logger.critical(f"{arg} is a local file and {target_build=}")
641669
raise ValueError(
642670
"Can't load local scoring file when target_build is set"
643671
"Try .normalise() method to do liftover, or load harmonised scoring files from PGS Catalog"
644672
)
645673
case str() if arg.startswith("PGP") or "_" in arg:
674+
logger.info(
675+
"Term associated with multiple scores detected (PGP or trait)"
676+
)
646677
self.include_children = kwargs.get("include_children", None)
647678
traitpub_query = CatalogQuery(
648679
accession=arg, include_children=self.include_children
@@ -656,6 +687,7 @@ def __init__(self, *args, target_build=None, **kwargs):
656687
]
657688
)
658689
case str() if arg.startswith("PGS"):
690+
logger.info("PGS ID detected")
659691
pgs_batch.append(arg)
660692
case str():
661693
raise ValueError(f"{arg!r} is not a valid path or an accession")
@@ -664,6 +696,7 @@ def __init__(self, *args, target_build=None, **kwargs):
664696

665697
# batch PGS IDs to avoid overloading the API
666698
batched_queries = CatalogQuery(accession=pgs_batch).score_query()
699+
logger.debug(f"Batching queries to PGS Catalog API: {pgs_batch}")
667700
batched_scores = [
668701
ScoringFile(x, target_build=target_build) for x in batched_queries
669702
]

0 commit comments

Comments (0)