@@ -382,73 +382,49 @@ class ScoreFormatVersion(str, enum.Enum):
382
382
v2 = "2.0"
383
383
384
384
385
- class CatalogScoreHeader (BaseModel ):
386
- """Headers store useful metadata about a scoring file.
385
class WeightType(str, enum.Enum):
    """Variant weight types accepted in a scoring file header.

    Members subclass ``str``, so each one compares equal to its raw header
    value (e.g. ``WeightType.ODDSRATIO == "OR"``) and can be looked up from
    that value with ``WeightType("OR")``.
    """

    BETA = "beta"
    ODDSRATIO = "OR"
    HAZARDRATIO = "HR"
    # CatalogScoreHeader.parse_weight_type converts this member to None
    NR = "NR"
387
390
388
- This class provides convenient functions for reading and extracting information
389
- from the header. The header must follow PGS Catalog standards. It's always best
390
- to build headers with ``from_path()``:
391
391
392
- >>> from ._config import Config
393
- >>> testpath = Config.ROOT_DIR / "tests" / "data" / "PGS000001_hmPOS_GRCh38.txt.gz"
394
- >>> CatalogScoreHeader.from_path(testpath) # doctest: +ELLIPSIS
395
- CatalogScoreHeader(format_version=<ScoreFormatVersion.v2: '2.0'>, pgs_id='PGS000001', pgs_name='PRS77_BC', trait_reported='Breast cancer', trait_mapped='breast carcinoma', trait_efo='EFO_0000305', genome_build=None, variants_number=77, weight_type='NR', pgp_id='PGP000001', citation='Mavaddat N et al. J Natl Cancer Inst (2015). doi:10.1093/jnci/djv036', HmPOS_build=GenomeBuild.GRCh38, HmPOS_date=datetime.date(2022, 7, 29), HmPOS_match_pos='{"True": null, "False": null}', HmPOS_match_chr='{"True": null, "False": null}')
392
+ class ScoreHeader (BaseModel ):
393
+ """Headers store useful metadata about a scoring file.
396
394
397
- Harmonisation fields are optional:
395
+ Data validation is less strict than the CatalogScoreHeader, to make
396
+ it easier for people to use custom scoring files with the PGS Catalog Calculator.
398
397
399
- >>> CatalogScoreHeader (**{"format_version": "2.0", " pgs_id": "PGS123456", "pgs_name": "testpgs", " trait_reported": "testtrait", "trait_mapped": "testtrait", "trait_efo": "testtrait", " genome_build": "NR", "variants_number": 2, "weight_type": "NR", "pgp_id": "PGP123456", "citation": "yes please "})
400
- CatalogScoreHeader(format_version=<ScoreFormatVersion.v2: '2.0'>, pgs_id='PGS123456', pgs_name='testpgs' , trait_reported='testtrait', trait_mapped='testtrait', trait_efo='testtrait', genome_build=None, variants_number=2, weight_type='NR', pgp_id='PGP123456', citation='yes please', HmPOS_build=None, HmPOS_date=None, HmPOS_match_pos=None, HmPOS_match_chr=None )
398
+ >>> ScoreHeader (**{"pgs_id": "PGS123456", "trait_reported": "testtrait", "genome_build": "GRCh38 "})
399
+ ScoreHeader( pgs_id='PGS123456', pgs_name=None , trait_reported='testtrait', genome_build=GenomeBuild.GRCh38 )
401
400
"""
402
401
403
- ###PGS CATALOG SCORING FILE - see https://www.pgscatalog.org/downloads/#dl_ftp_scoring for additional information
404
- format_version : ScoreFormatVersion
405
- ##POLYGENIC SCORE (PGS) INFORMATION
406
- pgs_id : str
407
- pgs_name : str
408
- trait_reported : str
409
- trait_mapped : str
410
- trait_efo : str
411
- genome_build : Optional [GenomeBuild ]
412
- variants_number : int = Field (ge = 0 )
413
- weight_type : str
414
- ##SOURCE INFORMATION
415
- pgp_id : str
416
- citation : str
417
- ##HARMONIZATION DETAILS
418
- HmPOS_build : Optional [GenomeBuild ] = None
419
- HmPOS_date : Optional [date ] = None
420
- HmPOS_match_pos : Optional [str ] = None
421
- HmPOS_match_chr : Optional [str ] = None
422
- # note: only included when different from default
423
- license : Optional [str ] = Field (
424
- "PGS obtained from the Catalog should be cited appropriately, and "
425
- "used in accordance with any licensing restrictions set by the authors. See "
426
- "EBI Terms of Use (https://www.ebi.ac.uk/about/terms-of-use/) for additional "
427
- "details." ,
428
- repr = False ,
429
- )
402
+ pgs_id : str = Field (title = "PGS identifier" )
403
+ pgs_name : Optional [str ] = Field (description = "PGS name" , default = None )
404
+ trait_reported : str = Field (description = "Trait name" )
405
+ # genome build is Optional because "NR" is represented internally as None
406
+ genome_build : Optional [GenomeBuild ] = Field (description = "Genome build" )
430
407
431
- @field_validator ("pgs_id" )
432
408
@classmethod
433
- def check_pgs_id (cls , pgs_id : str ) -> str :
434
- if not pgs_id .startswith ("PGS" ):
435
- raise ValueError (f"pgs_id doesn't start with PGS: { pgs_id } " )
436
- if len (pgs_id ) != 9 :
437
- raise ValueError (f"Invalid PGS ID format: { pgs_id } " )
438
- return pgs_id
409
def _parse_genome_build(cls, value: str) -> Optional[GenomeBuild]:
    """Convert a raw genome build header value into a ``GenomeBuild``.

    The literal string ``"NR"`` is represented internally as ``None``;
    every other value is handed to ``GenomeBuild.from_string``.
    """
    if value != "NR":
        return GenomeBuild.from_string(value)
    return None
439
414
440
415
@field_validator ("genome_build" , mode = "before" )
441
416
@classmethod
442
- def parse_genome_build (cls , weight : str ) -> GenomeBuild :
443
- return GenomeBuild . from_string ( weight )
417
def parse_genome_build(cls, value: str) -> Optional[GenomeBuild]:
    """Validate the raw ``genome_build`` value before field parsing.

    Delegates to ``_parse_genome_build``, which returns ``None`` for the
    string ``"NR"`` and otherwise builds a ``GenomeBuild`` from the string.
    """
    return cls._parse_genome_build(value)
444
419
445
420
@classmethod
446
421
def from_path (cls , path ):
447
- # TODO: I copied some library functions here for testing, clean em up or unify here
448
422
header = {}
449
423
450
- def generate_header (path ):
424
+ def generate_header (f ):
451
425
for line in f :
426
+ if line .startswith ("##" ):
427
+ continue
452
428
if line .startswith ("#" ):
453
429
if "=" in line :
454
430
yield line .strip ()
@@ -464,3 +440,93 @@ def generate_header(path):
464
440
header [key [1 :]] = value # drop # character from key
465
441
466
442
return cls (** header )
443
+
444
+
445
class CatalogScoreHeader(ScoreHeader):
    """A ScoreHeader that validates the PGS Catalog Scoring File header standard

    https://www.pgscatalog.org/downloads/#dl_ftp_scoring

    On top of the fields inherited from ``ScoreHeader`` this class requires the
    format version, mapped traits, publication identifiers and citation, and
    accepts optional harmonisation (``HmPOS_*``) details. Validators check that
    ``pgs_id`` and ``pgp_id`` are nine-character accessions with the right
    prefix, that the format version is 2.0, and convert "NR" values for the
    genome builds and the weight type to ``None``.

    >>> from ._config import Config
    >>> testpath = Config.ROOT_DIR / "tests" / "data" / "PGS000001_hmPOS_GRCh38.txt.gz"
    >>> CatalogScoreHeader.from_path(testpath) # doctest: +ELLIPSIS
    CatalogScoreHeader(pgs_id='PGS000001', ...)
    """

    format_version: ScoreFormatVersion
    # both trait fields arrive as "|"-delimited strings, see split_traits()
    trait_mapped: list[str] = Field(description="Trait name")
    trait_efo: list[str] = Field(
        description="Ontology trait name, e.g. 'breast carcinoma"
    )
    # Optional because the default is None; when present it must be positive
    variants_number: Optional[int] = Field(
        gt=0, description="Number of variants listed in the PGS", default=None
    )
    # note: we'll make sure to serialise None values here and in genome_build as string "NR"
    weight_type: Optional[WeightType] = Field(
        description="Variant weight type", default=None
    )

    ##SOURCE INFORMATION
    pgp_id: str
    citation: str
    ##HARMONIZATION DETAILS
    HmPOS_build: Optional[GenomeBuild] = Field(default=None)
    HmPOS_date: Optional[date] = Field(default=None)
    HmPOS_match_pos: Optional[str] = Field(default=None)
    HmPOS_match_chr: Optional[str] = Field(default=None)

    # note: only included when different from default
    license: Optional[str] = Field(
        "PGS obtained from the Catalog should be cited appropriately, and "
        "used in accordance with any licensing restrictions set by the authors. See "
        "EBI Terms of Use (https://www.ebi.ac.uk/about/terms-of-use/) for additional "
        "details.",
        repr=False,
    )

    @field_validator("trait_mapped", "trait_efo", mode="before")
    @classmethod
    def split_traits(cls, trait: str) -> list[str]:
        """Split a ``|``-delimited header value into a list of strings.

        Raises:
            ValueError: if the raw value is not a string.
        """
        if isinstance(trait, str):
            traits = trait.split("|")
            # NOTE(review): str.split() always returns at least one element, so
            # this guard cannot fire as written (an empty value yields [""]).
            # Decide whether empty trait values should be rejected here.
            if len(traits) == 0:
                raise ValueError("No traits defined")
            return traits
        raise ValueError(f"Can't parse trait string: {trait}")

    @classmethod
    def _check_accession(cls, value: str, prefix: str) -> str:
        """Return ``value`` unchanged if it starts with ``prefix`` and is 9 characters long.

        Raises:
            ValueError: if the prefix or the length doesn't match.
        """
        if not value.startswith(prefix):
            raise ValueError(f"{value} doesn't start with {prefix}")
        if len(value) != 9:
            raise ValueError(f"Invalid accession format: {value}")
        return value

    @field_validator("pgs_id")
    @classmethod
    def check_pgs_id(cls, pgs_id: str) -> str:
        """Check that the score accession looks like ``PGS`` plus six characters."""
        return cls._check_accession(pgs_id, "PGS")

    # Must be registered on "pgp_id": attaching this to "pgs_id" would apply the
    # PGP prefix check to the score accession and leave pgp_id unchecked.
    @field_validator("pgp_id")
    @classmethod
    def check_pgp_id(cls, pgp_id: str) -> str:
        """Check that the publication accession looks like ``PGP`` plus six characters."""
        return cls._check_accession(pgp_id, "PGP")

    @field_validator("genome_build", "HmPOS_build", mode="before")
    @classmethod
    def parse_genome_build(cls, value: str) -> Optional[GenomeBuild]:
        """Parse both genome build fields with the inherited ``_parse_genome_build``."""
        return cls._parse_genome_build(value)

    @field_validator("format_version")
    @classmethod
    def check_format_version(cls, version: ScoreFormatVersion) -> ScoreFormatVersion:
        """Accept only scoring file format version 2.0.

        Raises:
            ValueError: for any other format version.
        """
        if version != ScoreFormatVersion.v2:
            raise ValueError(f"Invalid format_version: {version}")
        return version

    @field_validator("weight_type")
    @classmethod
    def parse_weight_type(cls, value: WeightType) -> Optional[WeightType]:
        """Represent the ``NR`` weight type as ``None``; pass other values through."""
        if value == WeightType.NR:
            value = None
        return value
0 commit comments