Skip to content

Commit a13f3a1

Browse files
committed
set up logging
1 parent 563a442 commit a13f3a1

File tree

11 files changed

+154
-33
lines changed

11 files changed

+154
-33
lines changed

pgscatalog.combineapp/src/pgscatalog/combineapp/cli.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -15,26 +15,25 @@
1515

1616

1717
def run():
18-
logging.basicConfig(
19-
format="%(asctime)s %(name)s %(levelname)-8s %(message)s",
20-
level=logging.WARNING,
21-
datefmt="%Y-%m-%d %H:%M:%S",
22-
)
23-
2418
args = parse_args()
2519

2620
if args.verbose:
27-
logger.setLevel(logging.INFO)
21+
logging.getLogger("pgscatalog.corelib").setLevel(logging.DEBUG)
22+
logger.setLevel(logging.DEBUG)
23+
logger.debug("Verbose logging enabled")
2824

2925
out_path = pathlib.Path(args.outfile)
3026

3127
if out_path.exists():
32-
raise FileExistsError(f"{args.outfile}")
28+
logger.critical(f"Output file already exists: {args.outfile}")
29+
raise FileExistsError
3330

3431
match x := out_path.name:
3532
case _ if x.endswith("gz"):
33+
logger.debug("Compressing output with gzip")
3634
compress_output = True
3735
case _:
36+
logger.debug("Not compressing output")
3837
compress_output = False
3938

4039
paths = list(set(args.scorefiles)) # unique paths only
@@ -52,6 +51,7 @@ def run():
5251
if args.liftover:
5352
chain_dir = pathlib.Path(args.chain_dir)
5453
if not chain_dir.exists():
54+
logger.critical(f"Chain directory is missing: {chain_dir}")
5555
raise FileNotFoundError
5656

5757
liftover_kwargs = {
@@ -65,7 +65,7 @@ def run():
6565
with concurrent.futures.ThreadPoolExecutor() as executor:
6666
futures = []
6767
for scorefile in scoring_files:
68-
logger.info(f"Submitting {scorefile!r}")
68+
logger.info(f"Submitting {scorefile!r} for execution")
6969
futures.append(
7070
executor.submit(
7171
normalise,

pgscatalog.corelib/src/pgscatalog/corelib/__init__.py

+5
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import logging
2+
13
from ._config import Config
24
from .catalogapi import ScoreQueryResult, CatalogQuery, CatalogCategory
35
from .scorefiles import ScoringFiles, ScoringFile, NormalisedScoringFile
@@ -24,6 +26,9 @@
2426
SamplesheetFormatError,
2527
)
2628

29+
log_fmt = "%(name)s: %(asctime)s %(levelname)-8s %(message)s"
30+
logging.basicConfig(format=log_fmt, datefmt="%Y-%m-%d %H:%M:%S")
31+
logger = logging.getLogger(__name__)
2732

2833
__all__ = [
2934
"BasePGSException",

pgscatalog.corelib/src/pgscatalog/corelib/_download.py

+4
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ def score_download_failed(retry_state):
2727
raise ScoreChecksumError("All checksum retries failed") from e
2828
except Exception as download_exc:
2929
raise ScoreDownloadError("All download retries failed") from download_exc
30+
finally:
31+
logger.critical(f"Score download failed after all retries: {retry_state!r}")
3032

3133

3234
@tenacity.retry(
@@ -81,6 +83,7 @@ def ftp_fallback(retry_state):
8183
else:
8284
# no exceptions thrown, move the temporary file to the final output path
8385
os.rename(score_f.name, out_path)
86+
logger.info(f"FTP download OK, {out_path} checksum validation passed")
8487

8588

8689
@tenacity.retry(
@@ -124,3 +127,4 @@ def https_download(*, url, out_path, directory, overwrite):
124127
else:
125128
# no exceptions thrown, move the temporary file to the final output path
126129
os.rename(f.name, out_path)
130+
logger.info(f"HTTPS download OK, {out_path} checksum validation passed")

pgscatalog.corelib/src/pgscatalog/corelib/_normalise.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ def normalise(
2424
2. remap harmonised data, failed harmonisations get None'd
2525
3. log and optionally drop bad variants
2626
"""
27+
logger.info(
28+
f"Normalise parameters: {drop_missing=}, {liftover=}, {chain_dir=}, {target_build=}"
29+
)
30+
2731
if liftover:
2832
variants = lift(
2933
scoring_file=scoring_file,
@@ -169,23 +173,25 @@ def assign_effect_type(variants):
169173
170174
The most common effect type is additive:
171175
172-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "FALSE", "is_dominant": "FALSE"})
176+
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "False", "is_dominant": "False"})
173177
>>> list(assign_effect_type([variant])) # doctest: +ELLIPSIS
174178
[ScoreVariant(...,effect_type=EffectType.ADDITIVE,...)]
175-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "TRUE", "is_dominant": "FALSE"})
179+
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "True", "is_dominant": "False"})
176180
>>> list(assign_effect_type([variant])) # doctest: +ELLIPSIS
177181
[ScoreVariant(...,effect_type=EffectType.RECESSIVE,...)]
178-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "FALSE", "is_dominant": "TRUE"})
182+
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "False", "is_dominant": "True"})
179183
>>> list(assign_effect_type([variant])) # doctest: +ELLIPSIS
180184
[ScoreVariant(...,effect_type=EffectType.DOMINANT,...)]
185+
186+
is_recessive and is_dominant fields are parsed from strings to bools during __init__.
181187
"""
182188
for variant in variants:
183189
match (variant.is_recessive, variant.is_dominant):
184-
case (None, None) | ("FALSE", "FALSE"):
190+
case (None, None) | (False, False):
185191
pass # default value is additive, pass to break match and yield
186-
case ("FALSE", "TRUE"):
192+
case (False, True):
187193
variant.effect_type = EffectType.DOMINANT
188-
case ("TRUE", "FALSE"):
194+
case (True, False):
189195
variant.effect_type = EffectType.RECESSIVE
190196
case _:
191197
logger.critical(f"Bad effect type setting: {variant}")

pgscatalog.corelib/src/pgscatalog/corelib/catalogapi.py

+19-7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
""" Classes and functions related to the PGS Catalog API """
22
import enum
3+
import logging
34

45
import httpx
56
import tenacity
@@ -9,6 +10,9 @@
910
from ._config import Config
1011

1112

13+
logger = logging.getLogger(__name__)
14+
15+
1216
class CatalogCategory(enum.Enum):
1317
"""The three main categories in the PGS Catalog
1418
@@ -121,6 +125,7 @@ def infer_category(self):
121125
case _:
122126
raise InvalidAccessionError(f"Invalid accession: {accession!r}")
123127

128+
logger.debug(f"{accession=} {category=}")
124129
return category
125130

126131
def get_query_url(self):
@@ -153,14 +158,14 @@ def get_query_url(self):
153158
154159
Child trait terms aren't included by default. Only traits can have children.
155160
"""
161+
urls: list[str] | str = []
156162
match (self.category, self.accession):
157163
case CatalogCategory.TRAIT, str():
158164
child_flag: int = int(self.include_children)
159-
return f"{self._rest_url_root}/trait/{self.accession}?include_children={child_flag}"
165+
urls = f"{self._rest_url_root}/trait/{self.accession}?include_children={child_flag}"
160166
case CatalogCategory.SCORE, str():
161-
return [f"{self._rest_url_root}/score/search?pgs_ids={self.accession}"]
167+
urls = [f"{self._rest_url_root}/score/search?pgs_ids={self.accession}"]
162168
case CatalogCategory.SCORE, list():
163-
urls: list[str] = []
164169
for chunk in self._chunk_accessions():
165170
chunked_accession = ",".join(chunk)
166171
urls.append(
@@ -169,12 +174,14 @@ def get_query_url(self):
169174
)
170175
return urls
171176
case CatalogCategory.PUBLICATION, str():
172-
return f"{self._rest_url_root}/publication/{self.accession}"
177+
urls = f"{self._rest_url_root}/publication/{self.accession}"
173178
case _:
174179
raise ValueError(
175180
f"Invalid CatalogCategory and accession type: {self.category!r}, "
176181
f"type({self.accession!r})"
177182
)
183+
logger.debug(f"Resolved API query URL: {urls}")
184+
return urls
178185

179186
def _chunk_accessions(self):
180187
size = 50 # /rest/score/{pgs_id} limit when searching multiple IDs
@@ -323,10 +330,15 @@ def get_download_url(self, genome_build=None):
323330
"""
324331
match build := genome_build:
325332
case GenomeBuild() if build == GenomeBuild.GRCh37:
326-
return self.ftp_grch37_url
333+
url = self.ftp_grch37_url
327334
case GenomeBuild() if build == GenomeBuild.GRCh38:
328-
return self.ftp_grch38_url
335+
url = self.ftp_grch38_url
329336
case None:
330-
return self.ftp_url
337+
url = self.ftp_url
331338
case _:
332339
raise ValueError(f"Invalid genome build {build!r}")
340+
341+
logger.debug(
342+
f"Scoring file download URL for {self.pgs_id} with {build=}: {url}"
343+
)
344+
return url

pgscatalog.corelib/src/pgscatalog/corelib/scorefiles.py

+27
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,8 @@ def __init__(self, identifier, target_build=None, query_result=None, **kwargs):
259259
self._directory = self.local_path.parent
260260

261261
def _init_from_accession(self, accession, target_build):
262+
logger.debug(f"Instantiating ScoringFile from accession {accession.pgs_id}")
263+
262264
match self._identifier:
263265
case ScoreQueryResult():
264266
# skip hitting the API unnecessarily
@@ -288,6 +290,8 @@ def _init_from_accession(self, accession, target_build):
288290
self.local_path = None
289291

290292
def _init_from_path(self, target_build=None):
293+
logger.debug(f"Instantiating ScoringFile from {self.local_path=}")
294+
291295
if target_build is not None:
292296
raise ValueError(
293297
"target_build must be None for local files. "
@@ -395,6 +399,7 @@ def download(self, directory, overwrite=False):
395399
self._directory = pathlib.Path(directory)
396400
fn = pathlib.Path(self.path).name
397401
out_path = self._directory / fn
402+
logger.debug(f"Downloading {self.path} to {out_path}")
398403
https_download(
399404
url=self.path,
400405
out_path=out_path,
@@ -467,6 +472,19 @@ def normalise(
467472
('rs78540526', '11', 69701882)
468473
469474
A :class:`LiftoverError` is only raised when many converted coordinates are missing.
475+
476+
Normalising converts the is_dominant and is_recessive optional fields in
477+
scoring files into an EffectType:
478+
479+
>>> testpath = Config.ROOT_DIR / "tests" / "PGS000802_hmPOS_GRCh37.txt"
480+
>>> variants = ScoringFile(testpath).normalise()
481+
>>> for i, x in enumerate(variants): # doctest: +ELLIPSIS
482+
... (x.is_dominant, x.is_recessive, x.effect_type)
483+
... if i == 2:
484+
... break
485+
(True, False, EffectType.DOMINANT)
486+
(False, True, EffectType.RECESSIVE)
487+
(True, False, EffectType.DOMINANT)
470488
"""
471489
yield from normalise(
472490
scoring_file=self,
@@ -478,6 +496,7 @@ def normalise(
478496

479497
def get_log(self, drop_missing=False, variant_log=None):
480498
"""Create a JSON log from a ScoringFile's header and variant rows."""
499+
logger.debug(f"Creating JSON log for {self!r}")
481500
log = {}
482501

483502
for attr in self._header.fields:
@@ -630,19 +649,25 @@ def __init__(self, *args, target_build=None, **kwargs):
630649
for arg in flargs:
631650
match arg:
632651
case ScoringFile() if arg.target_build == target_build:
652+
logger.info("ScoringFile build matches target build")
633653
scorefiles.append(arg)
634654
case ScoringFile() if arg.target_build != target_build:
635655
raise ValueError(
636656
f"{arg.target_build=} doesn't match {target_build=}"
637657
)
638658
case _ if pathlib.Path(arg).is_file() and target_build is None:
659+
logger.info(f"Local path: {arg}, no target build is OK")
639660
scorefiles.append(ScoringFile(arg))
640661
case _ if pathlib.Path(arg).is_file() and target_build is not None:
662+
logger.critical(f"{arg} is a local file and {target_build=}")
641663
raise ValueError(
642664
"Can't load local scoring file when target_build is set"
643665
"Try .normalise() method to do liftover, or load harmonised scoring files from PGS Catalog"
644666
)
645667
case str() if arg.startswith("PGP") or "_" in arg:
668+
logger.info(
669+
"Term associated with multiple scores detected (PGP or trait)"
670+
)
646671
self.include_children = kwargs.get("include_children", None)
647672
traitpub_query = CatalogQuery(
648673
accession=arg, include_children=self.include_children
@@ -656,6 +681,7 @@ def __init__(self, *args, target_build=None, **kwargs):
656681
]
657682
)
658683
case str() if arg.startswith("PGS"):
684+
logger.info("PGS ID detected")
659685
pgs_batch.append(arg)
660686
case str():
661687
raise ValueError(f"{arg!r} is not a valid path or an accession")
@@ -664,6 +690,7 @@ def __init__(self, *args, target_build=None, **kwargs):
664690

665691
# batch PGS IDs to avoid overloading the API
666692
batched_queries = CatalogQuery(accession=pgs_batch).score_query()
693+
logger.debug(f"Batching queries to PGS Catalog API: {pgs_batch}")
667694
batched_scores = [
668695
ScoringFile(x, target_build=target_build) for x in batched_queries
669696
]

pgscatalog.corelib/src/pgscatalog/corelib/scorevariant.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -204,8 +204,23 @@ def __init__(
204204

205205
self.hm_inferOtherAllele: Optional[str] = hm_inferOtherAllele
206206
self.hm_source: Optional[str] = hm_source
207-
self.is_dominant: Optional[bool] = is_dominant
208-
self.is_recessive: Optional[bool] = is_recessive
207+
208+
match is_dominant:
209+
case True | "True":
210+
self.is_dominant = True
211+
case None:
212+
self.is_dominant = None
213+
case _:
214+
self.is_dominant = False
215+
216+
match is_recessive:
217+
case True | "True":
218+
self.is_recessive = True
219+
case None:
220+
self.is_recessive = None
221+
case _:
222+
self.is_recessive = False
223+
209224
self.hm_rsID: Optional[str] = hm_rsID
210225
self.hm_match_chr: Optional[str] = hm_match_chr
211226
self.hm_match_pos: Optional[str] = hm_match_pos
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
###PGS CATALOG SCORING FILE - see https://www.pgscatalog.org/downloads/#dl_ftp_scoring for additional information
2+
#format_version=2.0
3+
##POLYGENIC SCORE (PGS) INFORMATION
4+
#pgs_id=PGS000802
5+
#pgs_name=CRC_19
6+
#trait_reported=Colorectal cancer
7+
#trait_mapped=colorectal cancer
8+
#trait_efo=EFO_0005842
9+
#genome_build=NR
10+
#variants_number=19
11+
#weight_type=NR
12+
##SOURCE INFORMATION
13+
#pgp_id=PGP000191
14+
#citation=He CY et al. Genomics (2021). doi:10.1016/j.ygeno.2021.01.025
15+
##HARMONIZATION DETAILS
16+
#HmPOS_build=GRCh37
17+
#HmPOS_date=2022-07-28
18+
#HmPOS_match_chr={"True": null, "False": null}
19+
#HmPOS_match_pos={"True": null, "False": null}
20+
rsID chr_name chr_position effect_allele other_allele effect_weight allelefrequency_effect is_dominant is_recessive locus_name hm_source hm_rsID hm_chr hm_pos hm_inferOtherAllele
21+
rs10936599 3 170974795 T C 0.123 0.377 True False MYNN ENSEMBL rs10936599 3 169492101
22+
rs6061231 20 60390312 A C 0.491 0.892 False True LOC100128184, RPS21 ENSEMBL rs6061231 20 60956917
23+
rs10774214 12 4238613 C T 0.122 0.637 True False LOC100129645, CCND2 ENSEMBL rs10774214 12 4368352
24+
rs10795668 10 8741225 A G 0.147 0.61 True False LOC338591, LOC389936 ENSEMBL rs10795668 10 8701219
25+
rs11903757 2 192295449 T C 0.185 0.925 False True LOC100132133, hCG_2045843 ENSEMBL rs11903757 2 192587204
26+
rs12603526 17 747343 T C 0.142 0.695 False True NXN ENSEMBL rs12603526 17 800593
27+
rs1321311 6 36730878 G T 0.135 0.876 True False SFRS3, LOC389386 ENSEMBL rs1321311 6 36622900
28+
rs2423279 20 7760350 T C 0.193 0.715 False True FUSIP1P2 ENSEMBL rs2423279 20 7812350
29+
rs3802842 11 110676919 A C 0.177 0.663 True False LOC120376 ENSEMBL rs3802842 11 111171709
30+
rs4813802 20 6647595 T G 0.189 0.816 False True LOC728383 ENSEMBL rs4813802 20 6699595
31+
rs6469656 8 117716969 G A 0.204 0.627 False True TRPS1, EIF3H ENSEMBL rs6469656 8 117647788
32+
rs647161 5 134526991 C A 0.277 0.663 False True LOC389328, H2AFY ENSEMBL rs647161 5 134499092
33+
rs704017 10 80489138 A G 0.123 0.644 True False LOC100132987, ZMIZ1 ENSEMBL rs704017 10 80819132
34+
rs7315438 12 114375786 C T 0.146 0.388 True False TBX3, LOC100129020 ENSEMBL rs7315438 12 115891403
35+
rs10411210 19 38224140 C T 0.232 0.836 False False RHPN2 ENSEMBL rs10411210 19 33532300
36+
rs12953717 18 44707927 T C 0.157 0.789 False False SMAD7 ENSEMBL rs12953717 18 46453929
37+
rs16969681 15 30780403 T C 0.175 0.583 False False C15orf45, GREM1 ENSEMBL rs16969681 15 32993111
38+
rs1801133 1 11778965 A G 0.16 0.592 False False MTHFR ENSEMBL rs1801133 1 11856378
39+
rs6983267 8 128482487 G T 0.217 0.376 False False FAM84B, POU5F1P1 ENSEMBL rs6983267 8 128413305

0 commit comments

Comments
 (0)