Set up mypy type checks for pgscatalog.core

nebfield · nebfield · commit 5e3bdf93c895 · 2024-09-19T10:35:26.000+01:00
diff --git a/pgscatalog.core/poetry.lock b/pgscatalog.core/poetry.lock
diff --git a/pgscatalog.core/pyproject.toml b/pgscatalog.core/pyproject.toml
@@ -28,6 +28,7 @@ pydantic = "^2.9.0"
 pytest = "^7.4.4"
 sphinx-autoapi = "^3.0.0"
 pytest-cov = "^4.1.0"
+mypy = "^1.11.2"
 
 [build-system]
 requires = ["poetry-core"]
@@ -39,4 +40,4 @@ addopts = "-ra -q --doctest-modules"
 filterwarnings = ["error"]
 
 [tool.coverage.run]
-source = ['src/pgscatalog/core']
+source = ['src/pgscatalog/core']
diff --git a/pgscatalog.core/src/pgscatalog/core/__init__.py b/pgscatalog.core/src/pgscatalog/core/__init__.py
@@ -1,4 +1,5 @@
-""" Public interface to the Polygenic Score Catalog core package """
+"""Public interface to the Polygenic Score Catalog core package"""
+
 import logging
 import importlib.metadata
 
diff --git a/pgscatalog.core/src/pgscatalog/core/lib/_read.py b/pgscatalog.core/src/pgscatalog/core/lib/_read.py
@@ -3,6 +3,7 @@
 
 import logging
 import pathlib
+from typing import Generator, Iterator
 
 from xopen import xopen
 
@@ -12,8 +13,13 @@
 
 
 def read_rows_lazy(
-    *, csv_reader, fields: list[str], name: str, wide: bool, row_nr: int
-):
+    *,
+    csv_reader: Iterator[list[str]],
+    fields: list[str],
+    name: str,
+    wide: bool,
+    row_nr: int,
+) -> Generator[ScoreVariant, None, None]:
     """Read rows from an open scoring file and instantiate them as ScoreVariants"""
     for row in csv_reader:
         variant = dict(zip(fields, row))
@@ -81,7 +87,7 @@ def detect_wide(cols: list[str]) -> bool:
         return False
 
 
-def read_header(path: pathlib.Path):
+def read_header(path: pathlib.Path) -> dict:
     """Parses the header of a PGS Catalog format scoring file into a dictionary"""
     header = {}
 
diff --git a/pgscatalog.core/src/pgscatalog/core/lib/_sortpaths.py b/pgscatalog.core/src/pgscatalog/core/lib/_sortpaths.py
@@ -1,7 +1,8 @@
-""" This module assumes you're working with paths that follow the format:
+"""This module assumes you're working with paths that follow the format:
 
 {sampleset}_{chrom}_{effect_type}_{n}
 """
+
 from natsort import natsort_keygen, ns
 
 
diff --git a/pgscatalog.core/src/pgscatalog/core/lib/models.py b/pgscatalog.core/src/pgscatalog/core/lib/models.py
@@ -1,4 +1,4 @@
-""" PGS Catalog pydantic models for data validation
+"""PGS Catalog pydantic models for data validation
 
 Best way to reuse:
 
@@ -7,9 +7,10 @@
   * `import pgscatalog.core` and use fully qualified name: `pgscatalog.core.models.CatalogScoreVariant`)
 
 """
+
 from functools import cached_property
 from typing import ClassVar, Optional
-from typing_extensions import Self
+from typing_extensions import Self, Literal
 
 from pydantic import (
     BaseModel,
@@ -46,7 +47,7 @@ class Allele(BaseModel):
     allele: str
     _valid_snp_bases: ClassVar[frozenset[str]] = frozenset({"A", "C", "T", "G"})
 
-    @computed_field
+    @computed_field  # type: ignore
     @cached_property
     def is_snp(self) -> bool:
         """SNPs are the most common type of effect allele in PGS Catalog scoring
@@ -242,23 +243,35 @@ class CatalogScoreVariant(BaseModel):
     )
 
     # helpful class attributes (not used by pydantic to instantiate a class)
-    harmonised_columns: ClassVar[tuple[str]] = (
+    harmonised_columns: ClassVar[
+        tuple[Literal["hm_rsID"], Literal["hm_chr"], Literal["hm_pos"]]
+    ] = (
         "hm_rsID",
         "hm_chr",
         "hm_pos",
     )  # it's OK if (""hm_source", "hm_inferOtherAllele", "hm_match_chr", "hm_match_pos") are missing
-    complex_columns: ClassVar[tuple[str]] = (
+    complex_columns: ClassVar[
+        tuple[
+            Literal["is_haplotype"], Literal["is_diplotype"], Literal["is_interaction"]
+        ]
+    ] = (
         "is_haplotype",
         "is_diplotype",
         "is_interaction",
     )
-    non_additive_columns: ClassVar[tuple[str]] = (
+    non_additive_columns: ClassVar[
+        tuple[
+            Literal["dosage_0_weight"],
+            Literal["dosage_1_weight"],
+            Literal["dosage_2_weight"],
+        ]
+    ] = (
         "dosage_0_weight",
         "dosage_1_weight",
         "dosage_2_weight",
     )
 
-    @computed_field
+    @computed_field  # type: ignore
     @cached_property
     def variant_id(self) -> str:
         """ID = chr:pos:effect_allele:other_allele"""
@@ -269,7 +282,7 @@ def variant_id(self) -> str:
             ]
         )
 
-    @computed_field
+    @computed_field  # type: ignore
     @cached_property
     def is_harmonised(self) -> bool:
         # simple check: do any of the harmonised columns have data?
@@ -278,7 +291,7 @@ def is_harmonised(self) -> bool:
                 return True
         return False
 
-    @computed_field
+    @computed_field  # type: ignore
     @cached_property
     def is_complex(self) -> bool:
         # checking flag fields here, which are defaulted to False
@@ -287,7 +300,7 @@ def is_complex(self) -> bool:
                 return True
         return False
 
-    @computed_field
+    @computed_field  # type: ignore
     @cached_property
     def is_non_additive(self) -> bool:
         # simple check: do any of the weight dosage columns have data?
@@ -296,7 +309,7 @@ def is_non_additive(self) -> bool:
                 return True
         return False
 
-    @computed_field
+    @computed_field  # type: ignore
     @cached_property
     def effect_type(self) -> EffectType:
         match (self.is_recessive, self.is_dominant, self.is_non_additive):
diff --git a/pgscatalog.core/src/pgscatalog/core/lib/scorevariant.py b/pgscatalog.core/src/pgscatalog/core/lib/scorevariant.py
@@ -1,4 +1,4 @@
-from typing import Optional, ClassVar
+from typing import Optional, ClassVar, Literal
 from pydantic import (
     Field,
     field_serializer,
@@ -62,7 +62,19 @@ class ScoreVariant(CatalogScoreVariant):
     )
 
     # column names for output are used by __iter__ and when writing out
-    output_fields: ClassVar[tuple[str]] = (
+    output_fields: ClassVar[
+        tuple[
+            Literal["chr_name"],
+            Literal["chr_position"],
+            Literal["effect_allele"],
+            Literal["other_allele"],
+            Literal["effect_weight"],
+            Literal["effect_type"],
+            Literal["is_duplicated"],
+            Literal["accession"],
+            Literal["row_nr"],
+        ]
+    ] = (
         "chr_name",
         "chr_position",
         "effect_allele",