Skip to content

Commit 3647dc1

Browse files
authored
Merge branch 'dev' into ancestry
2 parents f356578 + 40b7efe commit 3647dc1

File tree

15 files changed

+187
-242
lines changed

15 files changed

+187
-242
lines changed

pgscatalog.calclib/poetry.lock

-193
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
import logging
2+
13
from .polygenicscore import PolygenicScore
24

5+
log_fmt = "%(name)s: %(asctime)s %(levelname)-8s %(message)s"
6+
logging.basicConfig(format=log_fmt, datefmt="%Y-%m-%d %H:%M:%S")
7+
logger = logging.getLogger(__name__)
8+
39
__all__ = ["PolygenicScore"]

pgscatalog.calclib/src/pgscatalog/calclib/polygenicscore.py

+13-16
Original file line numberDiff line numberDiff line change
@@ -2,35 +2,32 @@
22

33
import pandas as pd
44

5+
import reprlib
6+
7+
58

69
class PolygenicScore:
710
"""Represents the output of plink2 --score written to a file
811
912
>>> from ._config import Config
1013
>>> score1 = Config.ROOT_DIR / "tests" / "cineca_22_additive_0.sscore.zst"
11-
>>> pgs1 = PolygenicScore(path=score1) # doctest: +ELLIPSIS
14+
>>> pgs1 = PolygenicScore(sampleset="test", path=score1) # doctest: +ELLIPSIS
1215
>>> pgs1
13-
PolygenicScore(sampleset='cineca', path=PosixPath('.../cineca_22_additive_0.sscore.zst'))
14-
>>> pgs2 = PolygenicScore(path=score1)
16+
PolygenicScore(sampleset='test', path=PosixPath('.../cineca_22_additive_0.sscore.zst'), df=None)
17+
>>> pgs2 = PolygenicScore(sampleset="test", path=score1)
18+
>>> pgs1.read().to_dict() # doctest: +ELLIPSIS
19+
{'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 0.54502, ('test', 'HG00097'): 0.674401, ('test', 'HG00099'): 0.63727, ('test', 'HG00100'): 0.863944, ...}}
1520
1621
It's often helpful to combine PGS that were split per chromosome or by effect type:
1722
1823
>>> aggregated_score = pgs1 + pgs2
1924
>>> aggregated_score # doctest: +ELLIPSIS
20-
PolygenicScore(sampleset='cineca', path=None)
21-
22-
The backing dataframe is loaded lazily in chunks:
23-
24-
>>> for chunk in aggregated_score:
25-
... chunk.to_dict()
26-
... break
27-
{'DENOM': {('cineca', 'HG00096'): 3128, ...}, 'PGS001229_22_SUM': {('cineca', 'HG00096'): 1.09004, ...}}
28-
25+
PolygenicScore(sampleset='test', path=None, df={'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 1.09004, ('test', 'HG00097'): 1.348802, ('test', 'HG00099'): 1.27454, ('test', 'HG00100'): 1.727888, ...}})
2926
3027
Once a score has been fully aggregated it can be helpful to recalculate an average:
3128
3229
>>> aggregated_score.average().to_dict() # doctest: +ELLIPSIS
33-
{'DENOM': ...}, 'PGS001229_22_SUM': {('cineca', 'HG00096'): 1.09004, ...}, 'PGS001229_22_AVG': {('cineca', 'HG00096'): 0.000348...
30+
{'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 1.09004, ...}, 'PGS001229_22_AVG': {('test', 'HG00096'): 0.000348...
3431
3532
Scores can be written to a TSV file:
3633
@@ -45,10 +42,10 @@ class PolygenicScore:
4542
>>> splitoutd = tempfile.mkdtemp()
4643
>>> aggregated_score.write(splitoutd, split=True)
4744
>>> sorted(os.listdir(splitoutd), key = lambda x: x.split("_")[0])
48-
['cineca_pgs.txt.gz']
45+
['test_pgs.txt.gz']
4946
"""
5047

51-
def __init__(self, *, path=None, df=None, sampleset=None):
48+
def __init__(self, *, sampleset, path=None, df=None):
5249
match (path, df):
5350
case (None, None):
5451
raise ValueError("init with path or df")
@@ -167,7 +164,6 @@ def _select_agg_cols(cols):
167164
if (x.endswith("_SUM") and (x != "NAMED_ALLELE_DOSAGE_SUM")) or (x in keep_cols)
168165
]
169166

170-
171167
def _melt(df, value_name):
172168
"""Melt the score dataframe from wide format to long format"""
173169
df = df.melt(
@@ -179,3 +175,4 @@ def _melt(df, value_name):
179175
# e.g. PGS000822_SUM -> PGS000822
180176
df["PGS"] = df["PGS"].str.replace(f"_{value_name}", "")
181177
return df
178+

pgscatalog.combineapp/src/pgscatalog/combineapp/cli.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -15,26 +15,25 @@
1515

1616

1717
def run():
18-
logging.basicConfig(
19-
format="%(asctime)s %(name)s %(levelname)-8s %(message)s",
20-
level=logging.WARNING,
21-
datefmt="%Y-%m-%d %H:%M:%S",
22-
)
23-
2418
args = parse_args()
2519

2620
if args.verbose:
27-
logger.setLevel(logging.INFO)
21+
logging.getLogger("pgscatalog.corelib").setLevel(logging.DEBUG)
22+
logger.setLevel(logging.DEBUG)
23+
logger.debug("Verbose logging enabled")
2824

2925
out_path = pathlib.Path(args.outfile)
3026

3127
if out_path.exists():
32-
raise FileExistsError(f"{args.outfile}")
28+
logger.critical(f"Output file already exists: {args.outfile}")
29+
raise FileExistsError
3330

3431
match x := out_path.name:
3532
case _ if x.endswith("gz"):
33+
logger.debug("Compressing output with gzip")
3634
compress_output = True
3735
case _:
36+
logger.debug("Not compressing output")
3837
compress_output = False
3938

4039
paths = list(set(args.scorefiles)) # unique paths only
@@ -52,6 +51,7 @@ def run():
5251
if args.liftover:
5352
chain_dir = pathlib.Path(args.chain_dir)
5453
if not chain_dir.exists():
54+
logger.critical(f"Chain directory is missing: {chain_dir}")
5555
raise FileNotFoundError
5656

5757
liftover_kwargs = {
@@ -65,7 +65,7 @@ def run():
6565
with concurrent.futures.ThreadPoolExecutor() as executor:
6666
futures = []
6767
for scorefile in scoring_files:
68-
logger.info(f"Submitting {scorefile!r}")
68+
logger.info(f"Submitting {scorefile!r} for execution")
6969
futures.append(
7070
executor.submit(
7171
normalise,

pgscatalog.corelib/src/pgscatalog/corelib/__init__.py

+5
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import logging
2+
13
from ._config import Config
24
from .catalogapi import ScoreQueryResult, CatalogQuery, CatalogCategory
35
from .scorefiles import ScoringFiles, ScoringFile, NormalisedScoringFile
@@ -24,6 +26,9 @@
2426
SamplesheetFormatError,
2527
)
2628

29+
log_fmt = "%(name)s: %(asctime)s %(levelname)-8s %(message)s"
30+
logging.basicConfig(format=log_fmt, datefmt="%Y-%m-%d %H:%M:%S")
31+
logger = logging.getLogger(__name__)
2732

2833
__all__ = [
2934
"BasePGSException",

pgscatalog.corelib/src/pgscatalog/corelib/_download.py

+4
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ def score_download_failed(retry_state):
2727
raise ScoreChecksumError("All checksum retries failed") from e
2828
except Exception as download_exc:
2929
raise ScoreDownloadError("All download retries failed") from download_exc
30+
finally:
31+
logger.critical(f"Score download failed after all retries: {retry_state!r}")
3032

3133

3234
@tenacity.retry(
@@ -81,6 +83,7 @@ def ftp_fallback(retry_state):
8183
else:
8284
# no exceptions thrown, move the temporary file to the final output path
8385
os.rename(score_f.name, out_path)
86+
logger.info(f"FTP download OK, {out_path} checksum validation passed")
8487

8588

8689
@tenacity.retry(
@@ -124,3 +127,4 @@ def https_download(*, url, out_path, directory, overwrite):
124127
else:
125128
# no exceptions thrown, move the temporary file to the final output path
126129
os.rename(f.name, out_path)
130+
logger.info(f"HTTPS download OK, {out_path} checksum validation passed")

pgscatalog.corelib/src/pgscatalog/corelib/_normalise.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ def normalise(
2424
2. remap harmonised data, failed harmonisations get None'd
2525
3. log and optionally drop bad variants
2626
"""
27+
logger.info(
28+
f"Normalise parameters: {drop_missing=}, {liftover=}, {chain_dir=}, {target_build=}"
29+
)
30+
2731
if liftover:
2832
variants = lift(
2933
scoring_file=scoring_file,
@@ -169,23 +173,25 @@ def assign_effect_type(variants):
169173
170174
The most common type of effect type is additive:
171175
172-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "FALSE", "is_dominant": "FALSE"})
176+
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "False", "is_dominant": "False"})
173177
>>> list(assign_effect_type([variant])) # doctest: +ELLIPSIS
174178
[ScoreVariant(...,effect_type=EffectType.ADDITIVE,...)]
175-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "TRUE", "is_dominant": "FALSE"})
179+
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "True", "is_dominant": "False"})
176180
>>> list(assign_effect_type([variant])) # doctest: +ELLIPSIS
177181
[ScoreVariant(...,effect_type=EffectType.RECESSIVE,...)]
178-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "FALSE", "is_dominant": "TRUE"})
182+
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "False", "is_dominant": "True"})
179183
>>> list(assign_effect_type([variant])) # doctest: +ELLIPSIS
180184
[ScoreVariant(...,effect_type=EffectType.DOMINANT,...)]
185+
186+
is_recessive and is_dominant fields are parsed from strings to bools during __init__.
181187
"""
182188
for variant in variants:
183189
match (variant.is_recessive, variant.is_dominant):
184-
case (None, None) | ("FALSE", "FALSE"):
190+
case (None, None) | (False, False):
185191
pass # default value is additive, pass to break match and yield
186-
case ("FALSE", "TRUE"):
192+
case (False, True):
187193
variant.effect_type = EffectType.DOMINANT
188-
case ("TRUE", "FALSE"):
194+
case (True, False):
189195
variant.effect_type = EffectType.RECESSIVE
190196
case _:
191197
logger.critical(f"Bad effect type setting: {variant}")

pgscatalog.corelib/src/pgscatalog/corelib/catalogapi.py

+20-7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
""" Classes and functions related to the PGS Catalog API """
22
import enum
3+
import logging
34

45
import httpx
56
import tenacity
@@ -9,6 +10,9 @@
910
from ._config import Config
1011

1112

13+
logger = logging.getLogger(__name__)
14+
15+
1216
class CatalogCategory(enum.Enum):
1317
"""The three main categories in the PGS Catalog
1418
@@ -121,6 +125,7 @@ def infer_category(self):
121125
case _:
122126
raise InvalidAccessionError(f"Invalid accession: {accession!r}")
123127

128+
logger.debug(f"{accession=} {category=}")
124129
return category
125130

126131
def get_query_url(self):
@@ -153,14 +158,14 @@ def get_query_url(self):
153158
154159
Child traits terms aren't included by default. Only traits can have children.
155160
"""
161+
urls: list[str] | str = []
156162
match (self.category, self.accession):
157163
case CatalogCategory.TRAIT, str():
158164
child_flag: int = int(self.include_children)
159-
return f"{self._rest_url_root}/trait/{self.accession}?include_children={child_flag}"
165+
urls = f"{self._rest_url_root}/trait/{self.accession}?include_children={child_flag}"
160166
case CatalogCategory.SCORE, str():
161-
return [f"{self._rest_url_root}/score/search?pgs_ids={self.accession}"]
167+
urls = [f"{self._rest_url_root}/score/search?pgs_ids={self.accession}"]
162168
case CatalogCategory.SCORE, list():
163-
urls: list[str] = []
164169
for chunk in self._chunk_accessions():
165170
chunked_accession = ",".join(chunk)
166171
urls.append(
@@ -169,12 +174,14 @@ def get_query_url(self):
169174
)
170175
return urls
171176
case CatalogCategory.PUBLICATION, str():
172-
return f"{self._rest_url_root}/publication/{self.accession}"
177+
urls = f"{self._rest_url_root}/publication/{self.accession}"
173178
case _:
174179
raise ValueError(
175180
f"Invalid CatalogCategory and accession type: {self.category!r}, "
176181
f"type({self.accession!r})"
177182
)
183+
logger.debug(f"Resolved API query URL: {urls}")
184+
return urls
178185

179186
def _chunk_accessions(self):
180187
size = 50 # /rest/score/{pgs_id} limit when searching multiple IDs
@@ -323,10 +330,16 @@ def get_download_url(self, genome_build=None):
323330
"""
324331
match build := genome_build:
325332
case GenomeBuild() if build == GenomeBuild.GRCh37:
326-
return self.ftp_grch37_url
333+
url = self.ftp_grch37_url
327334
case GenomeBuild() if build == GenomeBuild.GRCh38:
328-
return self.ftp_grch38_url
335+
url = self.ftp_grch38_url
329336
case None:
330-
return self.ftp_url
337+
url = self.ftp_url
331338
case _:
332339
raise ValueError(f"Invalid genome build {build!r}")
340+
341+
logger.debug(
342+
f"Scoring file download URL for {self.pgs_id} with {build=}: {url}"
343+
)
344+
return url
345+

pgscatalog.corelib/src/pgscatalog/corelib/scorefiles.py

+33
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,8 @@ def __init__(self, identifier, target_build=None, query_result=None, **kwargs):
259259
self._directory = self.local_path.parent
260260

261261
def _init_from_accession(self, accession, target_build):
262+
logger.debug("Instantiating ScoringFile from accession")
263+
262264
match self._identifier:
263265
case ScoreQueryResult():
264266
# skip hitting the API unnecessarily
@@ -288,6 +290,8 @@ def _init_from_accession(self, accession, target_build):
288290
self.local_path = None
289291

290292
def _init_from_path(self, target_build=None):
293+
logger.debug(f"Instantiating ScoringFile from {self.local_path=}")
294+
291295
if target_build is not None:
292296
raise ValueError(
293297
"target_build must be None for local files. "
@@ -395,6 +399,9 @@ def download(self, directory, overwrite=False):
395399
self._directory = pathlib.Path(directory)
396400
fn = pathlib.Path(self.path).name
397401
out_path = self._directory / fn
402+
403+
logger.debug(f"Downloading {self.path} to {out_path}")
404+
398405
https_download(
399406
url=self.path,
400407
out_path=out_path,
@@ -467,6 +474,19 @@ def normalise(
467474
('rs78540526', '11', 69701882)
468475
469476
A :class:`LiftoverError` is only raised when many converted coordinates are missing.
477+
478+
Normalising converts the is_dominant and is_recessive optional fields in
479+
scoring files into an EffectType:
480+
481+
>>> testpath = Config.ROOT_DIR / "tests" / "PGS000802_hmPOS_GRCh37.txt"
482+
>>> variants = ScoringFile(testpath).normalise()
483+
>>> for i, x in enumerate(variants): # doctest: +ELLIPSIS
484+
... (x.is_dominant, x.is_recessive, x.effect_type)
485+
... if i == 2:
486+
... break
487+
(True, False, EffectType.DOMINANT)
488+
(False, True, EffectType.RECESSIVE)
489+
(True, False, EffectType.DOMINANT)
470490
"""
471491
yield from normalise(
472492
scoring_file=self,
@@ -478,6 +498,9 @@ def normalise(
478498

479499
def get_log(self, drop_missing=False, variant_log=None):
480500
"""Create a JSON log from a ScoringFile's header and variant rows."""
501+
502+
logger.debug(f"Creating JSON log for {self!r}")
503+
481504
log = {}
482505

483506
for attr in self._header.fields:
@@ -630,6 +653,7 @@ def __init__(self, *args, target_build=None, **kwargs):
630653
for arg in flargs:
631654
match arg:
632655
case ScoringFile() if arg.target_build == target_build:
656+
logger.info("ScoringFile build matches target build")
633657
scorefiles.append(arg)
634658
case ScoringFile() if arg.target_build != target_build:
635659
raise ValueError(
@@ -638,11 +662,18 @@ def __init__(self, *args, target_build=None, **kwargs):
638662
case _ if pathlib.Path(arg).is_file() and target_build is None:
639663
scorefiles.append(ScoringFile(arg))
640664
case _ if pathlib.Path(arg).is_file() and target_build is not None:
665+
logger.info(f"Local path: {arg}, no target build is OK")
666+
scorefiles.append(ScoringFile(arg))
667+
case _ if pathlib.Path(arg).is_file() and target_build is not None:
668+
logger.critical(f"{arg} is a local file and {target_build=}")
641669
raise ValueError(
642670
"Can't load local scoring file when target_build is set"
643671
"Try .normalise() method to do liftover, or load harmonised scoring files from PGS Catalog"
644672
)
645673
case str() if arg.startswith("PGP") or "_" in arg:
674+
logger.info(
675+
"Term associated with multiple scores detected (PGP or trait)"
676+
)
646677
self.include_children = kwargs.get("include_children", None)
647678
traitpub_query = CatalogQuery(
648679
accession=arg, include_children=self.include_children
@@ -656,6 +687,7 @@ def __init__(self, *args, target_build=None, **kwargs):
656687
]
657688
)
658689
case str() if arg.startswith("PGS"):
690+
logger.info("PGS ID detected")
659691
pgs_batch.append(arg)
660692
case str():
661693
raise ValueError(f"{arg!r} is not a valid path or an accession")
@@ -664,6 +696,7 @@ def __init__(self, *args, target_build=None, **kwargs):
664696

665697
# batch PGS IDs to avoid overloading the API
666698
batched_queries = CatalogQuery(accession=pgs_batch).score_query()
699+
logger.debug(f"Batching queries to PGS Catalog API: {pgs_batch}")
667700
batched_scores = [
668701
ScoringFile(x, target_build=target_build) for x in batched_queries
669702
]

0 commit comments

Comments (0)