Skip to content

Commit 52f8a2e

Browse files
committed
update header models
1 parent a08e196 commit 52f8a2e

File tree

1 file changed

+116
-50
lines changed
  • pgscatalog.core/src/pgscatalog/core/lib

1 file changed

+116
-50
lines changed

pgscatalog.core/src/pgscatalog/core/lib/models.py

+116-50
Original file line numberDiff line numberDiff line change
@@ -382,73 +382,49 @@ class ScoreFormatVersion(str, enum.Enum):
382382
v2 = "2.0"
383383

384384

385-
class CatalogScoreHeader(BaseModel):
386-
"""Headers store useful metadata about a scoring file.
385+
class WeightType(str, enum.Enum):
    """Variant weight types that may be declared in a scoring file header.

    Members are also ``str`` instances, so they compare equal to the raw
    header text (e.g. ``WeightType.ODDSRATIO == "OR"``).
    """

    # effect size reported as a beta coefficient
    BETA = "beta"
    # effect size reported as an odds ratio
    ODDSRATIO = "OR"
    # effect size reported as a hazard ratio
    HAZARDRATIO = "HR"
    # literal "NR" as written in the header; CatalogScoreHeader.parse_weight_type
    # turns this member into None after validation
    NR = "NR"
387390

388-
This class provides convenient functions for reading and extracting information
389-
from the header. The header must follow PGS Catalog standards. It's always best
390-
to build headers with ``from_path()``:
391391

392-
>>> from ._config import Config
393-
>>> testpath = Config.ROOT_DIR / "tests" / "data" / "PGS000001_hmPOS_GRCh38.txt.gz"
394-
>>> CatalogScoreHeader.from_path(testpath) # doctest: +ELLIPSIS
395-
CatalogScoreHeader(format_version=<ScoreFormatVersion.v2: '2.0'>, pgs_id='PGS000001', pgs_name='PRS77_BC', trait_reported='Breast cancer', trait_mapped='breast carcinoma', trait_efo='EFO_0000305', genome_build=None, variants_number=77, weight_type='NR', pgp_id='PGP000001', citation='Mavaddat N et al. J Natl Cancer Inst (2015). doi:10.1093/jnci/djv036', HmPOS_build=GenomeBuild.GRCh38, HmPOS_date=datetime.date(2022, 7, 29), HmPOS_match_pos='{"True": null, "False": null}', HmPOS_match_chr='{"True": null, "False": null}')
392+
class ScoreHeader(BaseModel):
393+
"""Headers store useful metadata about a scoring file.
396394
397-
Harmonisation fields are optional:
395+
Data validation is less strict than the CatalogScoreHeader, to make
396+
it easier for people to use custom scoring files with the PGS Catalog Calculator.
398397
399-
>>> CatalogScoreHeader(**{"format_version": "2.0", "pgs_id": "PGS123456", "pgs_name": "testpgs", "trait_reported": "testtrait", "trait_mapped": "testtrait", "trait_efo": "testtrait", "genome_build": "NR", "variants_number": 2, "weight_type": "NR", "pgp_id": "PGP123456", "citation": "yes please"})
400-
CatalogScoreHeader(format_version=<ScoreFormatVersion.v2: '2.0'>, pgs_id='PGS123456', pgs_name='testpgs', trait_reported='testtrait', trait_mapped='testtrait', trait_efo='testtrait', genome_build=None, variants_number=2, weight_type='NR', pgp_id='PGP123456', citation='yes please', HmPOS_build=None, HmPOS_date=None, HmPOS_match_pos=None, HmPOS_match_chr=None)
398+
>>> ScoreHeader(**{"pgs_id": "PGS123456", "trait_reported": "testtrait", "genome_build": "GRCh38"})
399+
ScoreHeader(pgs_id='PGS123456', pgs_name=None, trait_reported='testtrait', genome_build=GenomeBuild.GRCh38)
401400
"""
402401

403-
###PGS CATALOG SCORING FILE - see https://www.pgscatalog.org/downloads/#dl_ftp_scoring for additional information
404-
format_version: ScoreFormatVersion
405-
##POLYGENIC SCORE (PGS) INFORMATION
406-
pgs_id: str
407-
pgs_name: str
408-
trait_reported: str
409-
trait_mapped: str
410-
trait_efo: str
411-
genome_build: Optional[GenomeBuild]
412-
variants_number: int = Field(ge=0)
413-
weight_type: str
414-
##SOURCE INFORMATION
415-
pgp_id: str
416-
citation: str
417-
##HARMONIZATION DETAILS
418-
HmPOS_build: Optional[GenomeBuild] = None
419-
HmPOS_date: Optional[date] = None
420-
HmPOS_match_pos: Optional[str] = None
421-
HmPOS_match_chr: Optional[str] = None
422-
# note: only included when different from default
423-
license: Optional[str] = Field(
424-
"PGS obtained from the Catalog should be cited appropriately, and "
425-
"used in accordance with any licensing restrictions set by the authors. See "
426-
"EBI Terms of Use (https://www.ebi.ac.uk/about/terms-of-use/) for additional "
427-
"details.",
428-
repr=False,
429-
)
402+
pgs_id: str = Field(title="PGS identifier")
403+
pgs_name: Optional[str] = Field(description="PGS name", default=None)
404+
trait_reported: str = Field(description="Trait name")
405+
# genome build is Optional because "NR" is represented internally as None
406+
genome_build: Optional[GenomeBuild] = Field(description="Genome build")
430407

431-
@field_validator("pgs_id")
432408
@classmethod
433-
def check_pgs_id(cls, pgs_id: str) -> str:
434-
if not pgs_id.startswith("PGS"):
435-
raise ValueError(f"pgs_id doesn't start with PGS: {pgs_id}")
436-
if len(pgs_id) != 9:
437-
raise ValueError(f"Invalid PGS ID format: {pgs_id}")
438-
return pgs_id
409+
def _parse_genome_build(cls, value: str) -> Optional[GenomeBuild]:
    """Turn a raw header value into a genome build.

    The literal string "NR" is mapped to None; every other value is handed
    to ``GenomeBuild.from_string`` and its result is returned unchanged.
    """
    # "NR" has no GenomeBuild member: it is represented internally as None
    return None if value == "NR" else GenomeBuild.from_string(value)
439414

440415
@field_validator("genome_build", mode="before")
441416
@classmethod
442-
def parse_genome_build(cls, weight: str) -> GenomeBuild:
443-
return GenomeBuild.from_string(weight)
417+
def parse_genome_build(cls, value: str) -> Optional[GenomeBuild]:
    """Field validator hook: delegate the raw value to ``_parse_genome_build``."""
    parsed = cls._parse_genome_build(value)
    return parsed
444419

445420
@classmethod
446421
def from_path(cls, path):
447-
# TODO: I copied some library functions here for testing, clean em up or unify here
448422
header = {}
449423

450-
def generate_header(path):
424+
def generate_header(f):
451425
for line in f:
426+
if line.startswith("##"):
427+
continue
452428
if line.startswith("#"):
453429
if "=" in line:
454430
yield line.strip()
@@ -464,3 +440,93 @@ def generate_header(path):
464440
header[key[1:]] = value # drop # character from key
465441

466442
return cls(**header)
443+
444+
445+
class CatalogScoreHeader(ScoreHeader):
    """A ScoreHeader that validates the PGS Catalog Scoring File header standard

    https://www.pgscatalog.org/downloads/#dl_ftp_scoring

    In addition to the fields inherited from ``ScoreHeader`` this model
    requires ``format_version``, ``trait_mapped``, ``trait_efo``, ``pgp_id``
    and ``citation``, and checks that ``pgs_id`` and ``pgp_id`` are shaped
    like accessions (expected prefix, nine characters long).

    >>> from ._config import Config
    >>> testpath = Config.ROOT_DIR / "tests" / "data" / "PGS000001_hmPOS_GRCh38.txt.gz"
    >>> CatalogScoreHeader.from_path(testpath) # doctest: +ELLIPSIS
    CatalogScoreHeader(pgs_id='PGS000001', ...)
    """

    format_version: ScoreFormatVersion
    trait_mapped: list[str] = Field(
        description="Ontology trait name, e.g. 'breast carcinoma'"
    )
    trait_efo: list[str] = Field(
        description="Ontology trait identifier, e.g. 'EFO_0000305'"
    )
    # Optional so the None default agrees with the annotation; a value that is
    # supplied must still be a positive integer
    variants_number: Optional[int] = Field(
        gt=0, description="Number of variants listed in the PGS", default=None
    )
    # note: we'll make sure to serialise None values here and in genome_build as string "NR"
    weight_type: Optional[WeightType] = Field(
        description="Variant weight type", default=None
    )

    ##SOURCE INFORMATION
    pgp_id: str
    citation: str
    ##HARMONIZATION DETAILS
    HmPOS_build: Optional[GenomeBuild] = Field(default=None)
    HmPOS_date: Optional[date] = Field(default=None)
    HmPOS_match_pos: Optional[str] = Field(default=None)
    HmPOS_match_chr: Optional[str] = Field(default=None)

    # note: only included when different from default
    license: Optional[str] = Field(
        "PGS obtained from the Catalog should be cited appropriately, and "
        "used in accordance with any licensing restrictions set by the authors. See "
        "EBI Terms of Use (https://www.ebi.ac.uk/about/terms-of-use/) for additional "
        "details.",
        repr=False,
    )

    @field_validator("trait_mapped", "trait_efo", mode="before")
    @classmethod
    def split_traits(cls, trait: str) -> list[str]:
        """Split a pipe-delimited trait string into a list of traits.

        Raises:
            ValueError: if the input is not a string, or if it contains no
                non-blank trait once split on "|".
        """
        if not isinstance(trait, str):
            raise ValueError(f"Can't parse trait string: {trait}")

        # str.split always returns at least one item (possibly ""), so blank
        # entries have to be dropped before the emptiness check means anything
        traits = [item for item in trait.split("|") if item.strip()]
        if not traits:
            raise ValueError("No traits defined")
        return traits

    @classmethod
    def _check_accession(cls, value: str, prefix: str) -> str:
        """Return ``value`` if it starts with ``prefix`` and is 9 characters long.

        Raises:
            ValueError: if the prefix or the length doesn't match.
        """
        if not value.startswith(prefix):
            raise ValueError(f"{value} doesn't start with {prefix}")
        if len(value) != 9:
            raise ValueError(f"Invalid accession format: {value}")
        return value

    @field_validator("pgs_id")
    @classmethod
    def check_pgs_id(cls, pgs_id: str) -> str:
        """Check that the score accession looks like ``PGSxxxxxx``."""
        return cls._check_accession(pgs_id, "PGS")

    # must be bound to pgp_id: binding it to pgs_id would reject every valid
    # PGS accession and leave pgp_id unchecked
    @field_validator("pgp_id")
    @classmethod
    def check_pgp_id(cls, pgp_id: str) -> str:
        """Check that the publication accession looks like ``PGPxxxxxx``."""
        return cls._check_accession(pgp_id, "PGP")

    @field_validator("genome_build", "HmPOS_build", mode="before")
    @classmethod
    def parse_genome_build(cls, value: str) -> Optional[GenomeBuild]:
        """Parse both build fields from raw header text via ``_parse_genome_build``."""
        return cls._parse_genome_build(value)

    @field_validator("format_version")
    @classmethod
    def check_format_version(cls, version: ScoreFormatVersion) -> ScoreFormatVersion:
        """Accept only format version 2.0 headers.

        Raises:
            ValueError: for any other format version.
        """
        if version != ScoreFormatVersion.v2:
            raise ValueError(f"Invalid format_version: {version}")
        return version

    @field_validator("weight_type")
    @classmethod
    def parse_weight_type(cls, value: WeightType) -> Optional[WeightType]:
        """Replace the ``NR`` weight type with None; pass other values through."""
        if value == WeightType.NR:
            return None
        return value

0 commit comments

Comments
 (0)