Commit 634b7ed

make pytest strict: fail on warnings

1 parent 3cb51f5

File tree

14 files changed: +63 -41 lines


.github/workflows/pytest.yaml (+1, -1)

@@ -34,5 +34,5 @@ jobs:
       - run: poetry install --with dev --all-extras
         working-directory: ${{ inputs.package-directory }}
 
-      - run: poetry run pytest --doctest-modules
+      - run: poetry run pytest
         working-directory: ${{ inputs.package-directory }}

(--doctest-modules drops out of the CI invocation; it moves into the addopts of each library's pyproject.toml below, so the apps run plain pytest.)

pgscatalog.calcapp/pyproject.toml (+8)

@@ -21,3 +21,11 @@ pgscatalog-aggregate = 'pgscatalog.downloadapp.cli:run'
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+minversion = "6.0"
+addopts = "-ra -q"
+testpaths = [
+    "tests"
+]
+filterwarnings = ["error"]

pgscatalog.calcapp/tests/test_aggregate.py (+4, -14)

@@ -27,13 +27,8 @@ def test_split_aggregate(tmp_path_factory, scorefiles):
     outf = list(outdir.glob("*.txt.gz"))
     assert [x.name for x in outf] == ["hgdp_pgs.txt.gz"]
     outdf = pd.read_csv(outf[0], sep="\t")
-    assert list(outdf.columns) == [
-        "sampleset",
-        "IID",
-        "DENOM",
-        "PGS001229_hmPOS_GRCh38_SUM",
-    ]
-    assert outdf.shape == (929, 4)
+    assert list(outdf.columns) == ["sampleset", "IID", "DENOM", "PGS", "SUM"]
+    assert outdf.shape == (929, 5)
 
 
 def test_nosplit_aggregate(tmp_path_factory, scorefiles):
@@ -58,10 +53,5 @@ def test_nosplit_aggregate(tmp_path_factory, scorefiles):
     outf = list(outdir.glob("*.txt.gz"))
     assert [x.name for x in outf] == ["aggregated_scores.txt.gz"]
     outdf = pd.read_csv(outf[0], sep="\t")
-    assert list(outdf.columns) == [
-        "sampleset",
-        "IID",
-        "DENOM",
-        "PGS001229_hmPOS_GRCh38_SUM",
-    ]
-    assert outdf.shape == (929, 4)
+    assert list(outdf.columns) == ["sampleset", "IID", "DENOM", "PGS", "SUM"]
+    assert outdf.shape == (929, 5)

pgscatalog.calclib/pyproject.toml (+5)

@@ -24,3 +24,8 @@ pytest = "^8.0.0"
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+minversion = "6.0"
+addopts = "-ra -q --doctest-modules"
+filterwarnings = ["error"]

pgscatalog.calclib/src/pgscatalog/calclib/polygenicscore.py (+4, -7)

@@ -2,9 +2,6 @@
 
 import pandas as pd
 
-import reprlib
-
-
 
 class PolygenicScore:
     """Represents the output of plink2 --score written to a file
@@ -13,7 +10,7 @@ class PolygenicScore:
     >>> score1 = Config.ROOT_DIR / "tests" / "cineca_22_additive_0.sscore.zst"
     >>> pgs1 = PolygenicScore(sampleset="test", path=score1) # doctest: +ELLIPSIS
     >>> pgs1
-    PolygenicScore(sampleset='test', path=PosixPath('.../cineca_22_additive_0.sscore.zst'), df=None)
+    PolygenicScore(sampleset='test', path=PosixPath('.../cineca_22_additive_0.sscore.zst'))
     >>> pgs2 = PolygenicScore(sampleset="test", path=score1)
     >>> pgs1.read().to_dict() # doctest: +ELLIPSIS
     {'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 0.54502, ('test', 'HG00097'): 0.674401, ('test', 'HG00099'): 0.63727, ('test', 'HG00100'): 0.863944, ...}}
@@ -22,7 +19,7 @@ class PolygenicScore:
 
     >>> aggregated_score = pgs1 + pgs2
     >>> aggregated_score # doctest: +ELLIPSIS
-    PolygenicScore(sampleset='test', path=None, df={'DENOM': ...}, 'PGS001229_22_SUM': {('test', 'HG00096'): 1.09004, ('test', 'HG00097'): 1.348802, ('test', 'HG00099'): 1.27454, ('test', 'HG00100'): 1.727888, ...}})
+    PolygenicScore(sampleset='test', path=None)
 
     Once a score has been fully aggregated it can be helpful to recalculate an average:
 
@@ -45,7 +42,7 @@ class PolygenicScore:
     ['test_pgs.txt.gz']
     """
 
-    def __init__(self, *, sampleset, path=None, df=None):
+    def __init__(self, *, path=None, df=None, sampleset=None):
         match (path, df):
             case (None, None):
                 raise ValueError("init with path or df")
@@ -164,6 +161,7 @@ def _select_agg_cols(cols):
         if (x.endswith("_SUM") and (x != "NAMED_ALLELE_DOSAGE_SUM")) or (x in keep_cols)
     ]
 
+
 def _melt(df, value_name):
     """Melt the score dataframe from wide format to long format"""
     df = df.melt(
@@ -175,4 +173,3 @@ def _melt(df, value_name):
     # e.g. PGS000822_SUM -> PGS000822
     df["PGS"] = df["PGS"].str.replace(f"_{value_name}", "")
     return df
-
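The trimmed reprs above drop the df field (which is why reprlib goes away), and the test assertions earlier reflect _melt: aggregated output is now long format, one row per (sample, score) pair. A minimal sketch of that reshape with pandas, using a single made-up score column:

    import pandas as pd

    df = pd.DataFrame(
        {"sampleset": ["test"], "IID": ["HG00096"], "DENOM": [2],
         "PGS001229_22_SUM": [0.545]}
    )
    # Wide -> long: score columns become rows, then the _SUM suffix is
    # stripped so the PGS column holds bare accessions.
    long = df.melt(
        id_vars=["sampleset", "IID", "DENOM"], var_name="PGS", value_name="SUM"
    )
    long["PGS"] = long["PGS"].str.replace("_SUM", "")
    # Columns are now: sampleset, IID, DENOM, PGS, SUM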

pgscatalog.combineapp/pyproject.toml (+5, -3)

@@ -25,7 +25,9 @@ build-backend = "poetry.core.masonry.api"
 pgscatalog-combine = 'pgscatalog.combineapp.cli:run'
 
 [tool.pytest.ini_options]
-pythonpath = [
-    "src"
+minversion = "6.0"
+addopts = "-ra -q"
+testpaths = [
+    "tests"
 ]
-
+filterwarnings = ["error"]

pgscatalog.corelib/pyproject.toml (+5)

@@ -27,3 +27,8 @@ pyarrow = ["pyarrow"]
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+minversion = "6.0"
+addopts = "-ra -q --doctest-modules"
+filterwarnings = ["error"]

pgscatalog.corelib/src/pgscatalog/corelib/scorefiles.py (+3, -4)

@@ -291,7 +291,7 @@ def _init_from_accession(self, accession, target_build):
 
     def _init_from_path(self, target_build=None):
         logger.debug(f"Instantiating ScoringFile from {self.local_path=}")
-
+
         if target_build is not None:
             raise ValueError(
                 "target_build must be None for local files. "
@@ -660,8 +660,6 @@ def __init__(self, *args, target_build=None, **kwargs):
                         f"{arg.target_build=} doesn't match {target_build=}"
                     )
                 case _ if pathlib.Path(arg).is_file() and target_build is None:
-                    scorefiles.append(ScoringFile(arg))
-                case _ if pathlib.Path(arg).is_file() and target_build is not None:
                     logger.info(f"Local path: {arg}, no target build is OK")
                     scorefiles.append(ScoringFile(arg))
                 case _ if pathlib.Path(arg).is_file() and target_build is not None:
@@ -794,7 +792,8 @@ class NormalisedScoringFile:
 
     def __init__(self, path):
         try:
-            xopen(path)
+            with xopen(path):
+                pass
         except TypeError:
             self.path = False
         else:
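Wrapping xopen in a context manager closes the probe handle immediately. That plausibly matters under the new strict config: CPython emits an "unclosed file" ResourceWarning when a forgotten handle is garbage-collected, and with filterwarnings = ["error"] that warning fails the suite. A minimal sketch of the pattern, with the builtin open standing in for xopen:

    def is_readable_path(path):
        # Hypothetical helper mirroring NormalisedScoringFile.__init__:
        # probe the path by opening it; the context manager releases the
        # handle so no ResourceWarning surfaces later at collection time.
        try:
            with open(path):
                pass
        except TypeError:
            return False
        return True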

pgscatalog.downloadapp/pyproject.toml (+5, -4)

@@ -25,8 +25,9 @@ build-backend = "poetry.core.masonry.api"
 pgscatalog-download = 'pgscatalog.downloadapp.cli:run'
 
 [tool.pytest.ini_options]
-
-pythonpath = [
-    "src"
+minversion = "6.0"
+addopts = "-ra -q"
+testpaths = [
+    "tests"
 ]
-
+filterwarnings = ["error"]

pgscatalog.matchapp/pyproject.toml (+8)

@@ -25,3 +25,11 @@ pgscatalog-matchmerge = 'pgscatalog.matchapp.merge_cli:run_merge'
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+minversion = "6.0"
+addopts = "-ra -q"
+testpaths = [
+    "tests"
+]
+filterwarnings = ["error"]

pgscatalog.matchlib/pyproject.toml (+5)

@@ -21,3 +21,8 @@ pytest = "^8.0.0"
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+minversion = "6.0"
+addopts = "-ra -q --doctest-modules"
+filterwarnings = ["error"]

pgscatalog.matchlib/src/pgscatalog/matchlib/_match/label.py (+4, -4)

@@ -52,12 +52,12 @@ def _encode_match_priority(df: pl.LazyFrame) -> pl.LazyFrame:
     return (
         df.with_columns(
             # set false best match to not_best
-            match_priority=pl.col("best_match").apply(
+            match_priority=pl.col("best_match").map_elements(
                 lambda x: {None: 0, True: 1, False: 3}[x]
             )
         )
         .with_columns(
-            excluded_match_priority=pl.col("exclude").apply(
+            excluded_match_priority=pl.col("exclude").map_elements(
                 lambda x: {None: 0, True: 2, False: 0}[x]
            )
         )
@@ -66,7 +66,7 @@ def _encode_match_priority(df: pl.LazyFrame) -> pl.LazyFrame:
         )
         .with_columns(
             match_status=pl.col("max")
-            .apply(
+            .map_elements(
                 lambda x: {0: "unmatched", 1: "matched", 2: "excluded", 3: "not_best"}[
                     x
                 ]
@@ -140,7 +140,7 @@ def _label_duplicate_best_match(df: pl.LazyFrame) -> pl.LazyFrame:
         .otherwise(pl.lit(False))
     )
     .drop("count")
-    .with_row_count(
+    .with_row_index(
         name="temp_row_nr"
     )  # add temporary row count to get first variant
     .with_columns(
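Expr.apply and with_row_count were deprecated by polars in favour of Expr.map_elements and with_row_index; under the strict warning filter those DeprecationWarnings become hard failures. A minimal sketch of the renamed calls on made-up data, assuming a polars version where both replacements exist (roughly >= 0.20):

    import polars as pl

    df = pl.DataFrame({"best_match": [True, False, True]})
    out = df.with_columns(
        # map_elements replaces the deprecated Expr.apply; same lambda.
        match_priority=pl.col("best_match").map_elements(
            lambda x: {True: 1, False: 3}[x], return_dtype=pl.Int64
        )
    ).with_row_index(name="temp_row_nr")  # replaces with_row_count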

pgscatalog.matchlib/src/pgscatalog/matchlib/_match/log.py (+4, -2)

@@ -43,7 +43,8 @@ def make_summary_log(
             pl.col("match_status").fill_null(value="unmatched"), dataset=pl.lit(dataset)
         )  # fill in unmatched variants
         .group_by(cols)
-        .count()
+        .len()
+        .rename({"len": "count"})
         .join(filter_summary, how="left", on="accession")
         .pipe(_prettify_summary)
     )
@@ -55,7 +56,8 @@ def check_log_count(scorefile: pl.LazyFrame, summary_log: pl.LazyFrame):
 
     log_count: pl.DataFrame = (
         scorefile.group_by("accession")
-        .count()
+        .len()
+        .rename({"len": "count"})
         .join(summary_count, on="accession")
         .collect()
     )
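Newer polars renamed GroupBy.count() to GroupBy.len(), and the output column is named "len" rather than "count", so a rename keeps the downstream column name stable. A minimal sketch, assuming a polars version with GroupBy.len():

    import polars as pl

    df = pl.DataFrame({"accession": ["PGS000822", "PGS000822", "PGS001229"]})
    # .len() counts rows per group into a "len" column; rename it back
    # to "count" so joins and summaries downstream are unchanged.
    counts = df.group_by("accession").len().rename({"len": "count"})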

pgscatalog.matchlib/src/pgscatalog/matchlib/_plinkframe.py (+2, -2)

@@ -28,8 +28,8 @@ def __repr__(self):
 
     def split_pivot(self):
         """Splitting scoring files is helpful for split - apply - combine on big data"""
-        dfs = self.df.collect().partition_by("chr_name", as_dict=True)
-        return {k: v.pipe(pivot_score) for k, v in dfs.items()}
+        dfs = self.df.collect().partition_by(["chr_name"], as_dict=True)
+        return {k[0]: v.pipe(pivot_score) for k, v in dfs.items()}
 
     def pivot_wide(self):
         """Pivoting wide is important to enable parallel score calculation"""
