Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update score log structure #49

Merged
merged 12 commits into from
Sep 24, 2024
2 changes: 0 additions & 2 deletions pgscatalog.core/src/pgscatalog/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
ScoringFiles,
ScoringFile,
NormalisedScoringFile,
ScoreVariant,
GenomeBuild,
TargetVariants,
TargetVariant,
Expand All @@ -31,7 +30,6 @@
__all__ = [
"ScoringFiles",
"ScoringFile",
"ScoreVariant",
"Config",
"GenomeBuild",
"CatalogQuery",
Expand Down
8 changes: 0 additions & 8 deletions pgscatalog.core/src/pgscatalog/core/cli/_combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
standardised format, combining them, and calculating some statistics. Only really
useful for the CLI, not good for importing elsewhere."""

import collections
import csv
import functools
import gzip
Expand All @@ -13,13 +12,6 @@
logger = logging.getLogger(__name__)


def get_variant_log(batch):
# these statistics can only be generated while iterating through variants
n_variants = collections.Counter("n_variants" for item in batch)
hm_source = collections.Counter(getattr(item, "hm_source") for item in batch)
return n_variants + hm_source


class DataWriter:
def __init__(self, filename):
self.filename = filename
Expand Down
37 changes: 24 additions & 13 deletions pgscatalog.core/src/pgscatalog/core/cli/combine_cli.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import argparse
import json
import logging
import pathlib
import sys
import textwrap
from typing import Optional

from tqdm import tqdm
from ..lib import GenomeBuild, ScoringFile, ScoreVariant, EffectTypeError

from ._combine import get_variant_log, TextFileWriter
from ..lib.models import ScoreLog, ScoreLogs, ScoreVariant
from ..lib import GenomeBuild, ScoringFile, EffectTypeError

from ._combine import TextFileWriter

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -40,6 +42,12 @@ def run():
target_build = GenomeBuild.from_string(args.target_build)

for x in scoring_files:
if x.genome_build is None and target_build is not None:
raise ValueError(
f"Can't combine files with missing build in "
f"header when requesting {target_build=}"
)

if x.genome_build != target_build and not args.liftover:
raise ValueError(
f"Can't combine scoring file with genome build {x.genome_build!r} when {target_build=} without --liftover"
Expand All @@ -61,10 +69,10 @@ def run():
liftover_kwargs = {"liftover": False}

n_finished = 0
good_scores = []

for scorefile in tqdm(scoring_files, total=len(scoring_files)):
logger.info(f"Processing {scorefile.pgs_id}")
normalised_score: Optional[list[ScoreVariant]] = None
is_compatible = True
try:
normalised_score = list(
scorefile.normalise(
Expand All @@ -77,20 +85,27 @@ def run():
logger.warning(
f"Unsupported non-additive effect types in {scorefile=}, skipping"
)
is_compatible = False
continue
else:
# TODO: go back to parallel execution + write to multiple files
writer = TextFileWriter(compress=compress_output, filename=out_path)

# model_dump returns a dict with a subset of keys
dumped_variants = (
x.model_dump(include=ScoreVariant.output_fields)
x.model_dump(include=set(ScoreVariant.output_fields))
for x in normalised_score
)
writer.write(dumped_variants)
variant_log.append(get_variant_log(normalised_score))
n_finished += 1
good_scores.append(scorefile)
finally:
variant_log.append(
ScoreLog(
header=scorefile.header,
variants=normalised_score,
compatible_effect_type=is_compatible,
)
)

if n_finished == 0:
raise ValueError(
Expand All @@ -100,14 +115,10 @@ def run():
if n_finished != len(scoring_files):
logger.warning(f"{len(scoring_files) - n_finished} scoring files were skipped")

score_log = []
for sf, log in zip(good_scores, variant_log, strict=True):
score_log.append(sf.get_log(variant_log=log))

log_out_path = pathlib.Path(args.outfile).parent / args.logfile
with open(log_out_path, "w") as f:
logger.info(f"Writing log to {f.name}")
json.dump(score_log, f, indent=4)
f.write(ScoreLogs(logs=variant_log).model_dump_json())

logger.info("Combining complete")

Expand Down
2 changes: 0 additions & 2 deletions pgscatalog.core/src/pgscatalog/core/lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from .genomebuild import GenomeBuild
from .catalogapi import ScoreQueryResult, CatalogQuery, CatalogCategory
from .scorefiles import ScoringFiles, ScoringFile, NormalisedScoringFile
from .scorevariant import ScoreVariant
from .targetvariants import TargetVariants, TargetVariant, TargetType
from ._relabel import RelabelArgs, relabel, relabel_write
from ._sortpaths import effect_type_keyfunc, chrom_keyfunc
Expand Down Expand Up @@ -51,7 +50,6 @@
"SamplesheetFormatError",
"ScoringFiles",
"ScoringFile",
"ScoreVariant",
"Config",
"GenomeBuild",
"CatalogQuery",
Expand Down
12 changes: 6 additions & 6 deletions pgscatalog.core/src/pgscatalog/core/lib/_normalise.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,15 @@ def normalise(
if liftover:
variants = lift(
scoring_file=scoring_file,
harmonised=scoring_file.harmonised,
harmonised=scoring_file.is_harmonised,
current_build=scoring_file.genome_build,
target_build=target_build,
chain_dir=chain_dir,
)
else:
variants = scoring_file.variants

variants = remap_harmonised(variants, scoring_file.harmonised, target_build)
variants = remap_harmonised(variants, scoring_file.is_harmonised, target_build)

if drop_missing:
variants = drop_hla(variants)
Expand Down Expand Up @@ -100,7 +100,7 @@ def check_duplicates(variants):
def drop_hla(variants):
"""Drop HLA alleles from a list of ScoreVariants

>>> from .scorevariant import ScoreVariant
>>> from .models import ScoreVariant
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "chr_name": "1", "chr_position": 1})
>>> list(drop_hla([variant])) # doctest: +ELLIPSIS
[ScoreVariant(..., effect_allele=Allele(allele='A', is_snp=True), ...
Expand All @@ -127,7 +127,7 @@ def drop_hla(variants):
def assign_other_allele(variants):
"""Check if there's more than one possible other allele, remove if true

>>> from .scorevariant import ScoreVariant
>>> from .models import ScoreVariant
>>> variant = ScoreVariant(**{"chr_position": 1, "rsID": None, "chr_name": "1", "effect_allele": "A", "effect_weight": 5, "other_allele": "A", "row_nr": 0, "accession": "test"})
>>> list(assign_other_allele([variant]))[0] # doctest: +ELLIPSIS
ScoreVariant(..., effect_allele=Allele(allele='A', is_snp=True), other_allele=Allele(allele='A', is_snp=True), ...)
Expand All @@ -154,7 +154,7 @@ def remap_harmonised(variants, harmonised, target_build):
In this case chr_name, chr_position, and other allele are missing.
Perhaps authors submitted rsID and effect allele originally:

>>> from .scorevariant import ScoreVariant
>>> from .models import ScoreVariant
>>> variant = ScoreVariant(**{"chr_position": 1, "rsID": None, "chr_name": "2", "effect_allele": "A", "effect_weight": 5, "accession": "test", "hm_chr": "1", "hm_pos": 100, "hm_rsID": "testrsid", "hm_inferOtherAllele": "A", "row_nr": 0})
>>> variant
ScoreVariant(..., effect_allele=Allele(allele='A', is_snp=True), other_allele=None, ...
Expand Down Expand Up @@ -184,7 +184,7 @@ def check_effect_allele(variants, drop_missing=False):
"""
Odd effect allele:

>>> from .scorevariant import ScoreVariant
>>> from .models import ScoreVariant
>>> variant = ScoreVariant(**{"effect_allele": "Z", "effect_weight": 5, "accession": "test", "row_nr": 0, "chr_name": "1", "chr_position": 1})
>>> list(check_effect_allele([variant], drop_missing=True)) # doctest: +ELLIPSIS
[]
Expand Down
34 changes: 1 addition & 33 deletions pgscatalog.core/src/pgscatalog/core/lib/_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@
These functions aren't really meant to be imported outside corelib"""

import logging
import pathlib

from xopen import xopen

from .scorevariant import ScoreVariant
from .models import ScoreVariant

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -37,23 +36,6 @@ def read_rows_lazy(
row_nr += 1


def generate_header_lines(f):
"""Header lines in a PGS Catalog scoring file are structured like:

#pgs_id=PGS000348
#pgs_name=PRS_PrCa

Files can be big, so we want to only read header lines and stop immediately
"""
for line in f:
if line.startswith("#"):
if "=" in line:
yield line.strip()
else:
# stop reading lines
break


def get_columns(path):
"""Grab column labels from a PGS Catalog scoring file. line_no is useful to skip the header"""
with xopen(path, mode="rt") as f:
Expand All @@ -79,17 +61,3 @@ def detect_wide(cols: list[str]) -> bool:
return True
else:
return False


def read_header(path: pathlib.Path):
"""Parses the header of a PGS Catalog format scoring file into a dictionary"""
header = {}

with xopen(path, "rt") as f:
header_text = generate_header_lines(f)

for item in header_text:
key, value = item.split("=")
header[key[1:]] = value # drop # character from key

return header
Loading