Skip to content

Commit 30a071f

Browse files
committed
split models into CatalogScoreVariant and ScoreVariant
1 parent 1f4d4a9 commit 30a071f

File tree

2 files changed

+142
-149
lines changed

2 files changed

+142
-149
lines changed

pgscatalog.core/src/pgscatalog/core/lib/_normalise.py

+13-79
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import pyliftover
1010

1111
from .genomebuild import GenomeBuild
12-
from .scorevariant import EffectType, ScoreVariant, EffectAllele
12+
from .scorevariant import ScoreVariant, Allele
1313
from .pgsexceptions import LiftoverError
1414

1515
logger = logging.getLogger(__name__)
@@ -45,8 +45,6 @@ def normalise(
4545
if drop_missing:
4646
variants = drop_hla(variants)
4747

48-
variants = assign_effect_type(variants)
49-
variants = check_effect_weight(variants)
5048
variants = assign_other_allele(variants)
5149
variants = check_effect_allele(variants, drop_missing)
5250
variants = detect_complex(variants)
@@ -75,19 +73,11 @@ def check_duplicates(variants):
7573
seen_ids = {}
7674
current_accession = accession
7775

78-
# None other allele -> empty string
79-
variant_id: str = ":".join(
80-
[
81-
str(getattr(variant, k) or "")
82-
for k in ["chr_name", "chr_position", "effect_allele", "other_allele"]
83-
]
84-
)
85-
86-
if variant_id in seen_ids:
76+
if variant.variant_id in seen_ids:
8777
variant.is_duplicated = True
8878
n_duplicates += 1
8979

90-
seen_ids[variant_id] = True
80+
seen_ids[variant.variant_id] = True
9181

9282
yield variant
9383
n_variants += 1
@@ -110,9 +100,12 @@ def drop_hla(variants):
110100
[]
111101
"""
112102
n_dropped = 0
103+
p = Allele(allele="P")
104+
n = Allele(allele="N")
105+
113106
for variant in variants:
114107
match variant:
115-
case _ if variant.effect_allele in (EffectAllele("P"), EffectAllele("N")):
108+
case _ if variant.effect_allele in (p, n):
116109
n_dropped += 1
117110
continue
118111
case _:
@@ -121,39 +114,15 @@ def drop_hla(variants):
121114
logger.warning(f"{n_dropped} HLA alleles detected and dropped")
122115

123116

124-
def check_effect_weight(variants):
125-
"""Check that effect weights are valid floats. Effect weights are intentionally
126-
left as strings during processing.
127-
128-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0})
129-
>>> list(check_effect_weight([variant])) # doctest: +ELLIPSIS
130-
[ScoreVariant(effect_allele='A',effect_weight=5,...)]
131-
132-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": "potato", "accession": "test", "row_nr": 0})
133-
>>> list(check_effect_weight([variant])) # doctest: +ELLIPSIS
134-
Traceback (most recent call last):
135-
...
136-
ValueError
137-
"""
138-
for variant in variants:
139-
try:
140-
float(variant.effect_weight)
141-
except ValueError as e:
142-
logger.critical(f"{variant} has bad effect weight")
143-
raise ValueError from e
144-
else:
145-
yield variant
146-
147-
148117
def assign_other_allele(variants):
149118
"""Check if there's more than one possible other allele, remove if true
150119
151-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "other_allele": "A"})
120+
>>> variant = ScoreVariant(**{"chr_position": 1, "rsID": None, "chr_name": "1", "effect_allele": "A", "effect_weight": 5, "other_allele": "A"})
152121
>>> list(assign_other_allele([variant])) # doctest: +ELLIPSIS
153-
[ScoreVariant(effect_allele='A',...,other_allele='A',...)]
154-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "other_allele": "A/C"})
122+
[ScoreVariant(..., effect_allele='A', other_allele='A', ...)]
123+
>>> variant = ScoreVariant(**{"chr_position": 1, "rsID": None, "chr_name": "1", "effect_allele": "A", "effect_weight": 5, "other_allele": "A/C"})
155124
>>> list(assign_other_allele([variant])) # doctest: +ELLIPSIS
156-
[ScoreVariant(effect_allele='A',...,other_allele=None,...)]
125+
[ScoreVariant(..., effect_allele='A', other_allele=None,...)]
157126
"""
158127
n_dropped = 0
159128
for variant in variants:
@@ -171,51 +140,16 @@ def assign_other_allele(variants):
171140
logger.warning("Other allele for these variants is set to missing")
172141

173142

174-
def assign_effect_type(variants):
175-
"""Convert PGS Catalog effect type columns to EffectType enums
176-
177-
The most common type of effect type is additive:
178-
179-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "False", "is_dominant": "False"})
180-
>>> list(assign_effect_type([variant])) # doctest: +ELLIPSIS
181-
[ScoreVariant(...,effect_type=EffectType.ADDITIVE,...)]
182-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "True", "is_dominant": "False"})
183-
>>> list(assign_effect_type([variant])) # doctest: +ELLIPSIS
184-
[ScoreVariant(...,effect_type=EffectType.RECESSIVE,...)]
185-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "is_recessive": "False", "is_dominant": "True"})
186-
>>> list(assign_effect_type([variant])) # doctest: +ELLIPSIS
187-
[ScoreVariant(...,effect_type=EffectType.DOMINANT,...)]
188-
189-
is_recessive and is_dominant fields are parsed from strings to bools during __init__.
190-
"""
191-
for variant in variants:
192-
match (variant.is_recessive, variant.is_dominant):
193-
case (None, None) | (False, False) | (None, False) | (False, None):
194-
# none is OK because is_recessive or is_dominant column may be missing
195-
# default value is already set to additive, so just yield the variant
196-
pass
197-
case (False, True) | (None, True):
198-
# none is OK because is_recessive column may be missing
199-
variant.effect_type = EffectType.DOMINANT
200-
case (True, False) | (True, None):
201-
# none is OK because is_dominant column may be missing
202-
variant.effect_type = EffectType.RECESSIVE
203-
case _:
204-
logger.critical(f"Bad effect type setting: {variant}")
205-
raise Exception
206-
yield variant
207-
208-
209143
def remap_harmonised(variants, harmonised, target_build):
210144
"""
211145
Overwrite key attributes with harmonised data, if available.
212146
213147
In this case chr_name, chr_position, and other allele are missing.
214148
Perhaps authors submitted rsID and effect allele originally:
215149
216-
>>> variant = ScoreVariant(**{"effect_allele": "A", "effect_weight": 5, "accession": "test", "row_nr": 0, "hm_chr": 1, "hm_pos": 100, "hm_inferOtherAllele": "A"})
150+
>>> variant = ScoreVariant(**{"chr_position": 1, "rsID": None, "chr_name": "2", "effect_allele": "A", "effect_weight": 5, "accession": "test", "hm_chr": 1, "hm_pos": 100, "hm_inferOtherAllele": "A"})
217151
>>> list(remap_harmonised([variant], harmonised=True, target_build=GenomeBuild.GRCh38)) # doctest: +ELLIPSIS
218-
[ScoreVariant(...,chr_name=1,chr_position=100,...other_allele='A'...)]
152+
[ScoreVariant(chr_name=1,chr_position=100,...other_allele='A'...)]
219153
"""
220154
if harmonised:
221155
for variant in variants:

0 commit comments

Comments
 (0)