From 9409a3023bb3643ccbbe5804e559c1b8d32774cd Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Tue, 22 Feb 2022 11:05:54 -0500
Subject: [PATCH 01/10] add `ignore_nan` arg to `utils.matrix_transform`

---
 scprep/utils.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/scprep/utils.py b/scprep/utils.py
index 23a174b4..18462ab2 100644
--- a/scprep/utils.py
+++ b/scprep/utils.py
@@ -378,7 +378,7 @@ def matrix_transform(data, fun, *args, **kwargs):
     return data
 
 
-def matrix_sum(data, axis=None):
+def matrix_sum(data, axis=None, ignore_nan=False):
     """Get the column-wise, row-wise, or total sum of values in a matrix.
 
     Parameters
@@ -388,37 +388,40 @@ def matrix_sum(data, axis=None):
     axis : int or None, optional (default: None)
         Axis across which to sum. axis=0 gives column sums,
         axis=1 gives row sums. None gives the total sum.
+    ignore_nan : bool, optional (default: False)
+        If True, uses `np.nansum` instead of `np.sum`
 
     Returns
     -------
     sums : array-like or float
         Sums along desired axis.
     """
+    sum_fn = np.nansum if ignore_nan else np.sum
     if axis not in [0, 1, None]:
         raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis))
     if isinstance(data, pd.DataFrame):
         if is_SparseDataFrame(data):
             if axis is None:
-                sums = data.to_coo().sum()
+                sums = sum_fn(data.to_coo())
             else:
                 index = data.index if axis == 1 else data.columns
                 sums = pd.Series(
-                    np.array(data.to_coo().sum(axis)).flatten(), index=index
+                    np.array(sum_fn(data.to_coo(), axis)).flatten(), index=index
                 )
         elif is_sparse_dataframe(data):
             if axis is None:
-                sums = data.sparse.to_coo().sum()
+                sums = sum_fn(data.sparse.to_coo())
             else:
                 index = data.index if axis == 1 else data.columns
                 sums = pd.Series(
-                    np.array(data.sparse.to_coo().sum(axis)).flatten(), index=index
+                    np.array(sum_fn(data.sparse.to_coo(), axis)).flatten(), index=index
                 )
         elif axis is None:
-            sums = data.to_numpy().sum()
+            sums = sum_fn(data.to_numpy())
         else:
-            sums = data.sum(axis)
+            sums = sum_fn(data, axis)
     else:
-        sums = np.sum(data, axis=axis)
+        sums = sum_fn(data, axis=axis)
         if isinstance(sums, np.matrix):
             sums = np.array(sums).flatten()
     return sums

From 8245536f01ea58a3e922220cdbb4c850eec73f5b Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Tue, 22 Feb 2022 11:08:43 -0500
Subject: [PATCH 02/10] Add `ignore_nan` to `pairwise_correlation`

---
 scprep/stats.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/scprep/stats.py b/scprep/stats.py
index 7f4f4faa..e2228088 100644
--- a/scprep/stats.py
+++ b/scprep/stats.py
@@ -50,7 +50,7 @@ def EMD(x, y):
     return stats.wasserstein_distance(x, y)
 
 
-def pairwise_correlation(X, Y):
+def pairwise_correlation(X, Y, ignore_nan=False):
     """Compute pairwise Pearson correlation between columns of two matrices.
 
     From https://stackoverflow.com/a/33651442/3996580
@@ -61,6 +61,8 @@ def pairwise_correlation(X, Y):
         Input data
     Y : array-like, shape=[n_samples, p_features]
         Input data
+    ignore_nan : bool, optional (default: False)
+        If True, ignore NaNs, computing correlation over remaining values
 
     Returns
     -------
@@ -78,18 +80,22 @@ def pairwise_correlation(X, Y):
     if sparse.issparse(Y) and not sparse.issparse(X):
         X = sparse.csr_matrix(X)
     # Store columnw-wise in X and Y, as they would be used at few places
-    X_colsums = utils.matrix_sum(X, axis=0)
-    Y_colsums = utils.matrix_sum(Y, axis=0)
+    X_colsums = utils.matrix_sum(X, axis=0, ignore_nan=ignore_nan)
+    Y_colsums = utils.matrix_sum(Y, axis=0, ignore_nan=ignore_nan)
     # Basically there are four parts in the formula. We would compute them
     # one-by-one
     N_times_sum_xy = utils.toarray(N * Y.T.dot(X))
     sum_x_times_sum_y = X_colsums * Y_colsums[:, None]
-    var_x = N * utils.matrix_sum(utils.matrix_transform(X, np.power, 2), axis=0) - (
-        X_colsums ** 2
-    )
-    var_y = N * utils.matrix_sum(utils.matrix_transform(Y, np.power, 2), axis=0) - (
-        Y_colsums ** 2
-    )
+    var_x = N * utils.matrix_sum(
+        utils.matrix_transform(X, np.power, 2),
+        axis=0,
+        ignore_nan=ignore_nan
+    ) - X_colsums ** 2
+    var_y = N * utils.matrix_sum(
+        utils.matrix_transform(Y, np.power, 2),
+        axis=0,
+        ignore_nan=ignore_nan
+    ) - Y_colsums ** 2
     # Finally compute Pearson Correlation Coefficient as 2D array
     cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None])
     return cor.T

From 4a9159edf6431071fb766c6af7d1adde25d89ab6 Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Tue, 22 Feb 2022 11:13:55 -0500
Subject: [PATCH 03/10] Test `ignore_nan`

---
 test/test_stats.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/test_stats.py b/test/test_stats.py
index 58455caf..4510a967 100644
--- a/test/test_stats.py
+++ b/test/test_stats.py
@@ -126,6 +126,15 @@ def test_fun(X, *args, **kwargs):
     )
 
 
+def test_pairwise_correlation_nan():
+    D = np.array([np.arange(10), np.arange(0, 20, 2)])
+    D[:,3] = np.nan
+    assert np.all(np.isnan(scprep.stats.pairwise_correlation(D, D)))
+    C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True)
+    assert not np.any(np.isnan(C))
+    np.testing.assert_equal(C, 1)
+
+
 def shan_entropy(c):
     c_normalized = c / float(np.sum(c))
     return stats.entropy(c_normalized[c_normalized != 0])

From 2b6bb3bdb5bad57b7208a023d9eb3e5ff80ef7c2 Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Tue, 22 Feb 2022 11:17:45 -0500
Subject: [PATCH 04/10] Trigger actions workflow

---
 test/test_stats.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_stats.py b/test/test_stats.py
index 4510a967..0622e0e8 100644
--- a/test/test_stats.py
+++ b/test/test_stats.py
@@ -129,7 +129,8 @@ def test_fun(X, *args, **kwargs):
 def test_pairwise_correlation_nan():
     D = np.array([np.arange(10), np.arange(0, 20, 2)])
     D[:,3] = np.nan
-    assert np.all(np.isnan(scprep.stats.pairwise_correlation(D, D)))
+    C = scprep.stats.pairwise_correlation(D, D)
+    assert np.all(np.isnan(C))
     C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True)
     assert not np.any(np.isnan(C))
     np.testing.assert_equal(C, 1)

From 8ada936518a0008b2e7fab565208d34f56979366 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Tue, 22 Feb 2022 16:19:07 +0000
Subject: [PATCH 05/10] pre-commit

---
 scprep/stats.py    | 24 ++++++++++++++----------
 test/test_stats.py |  2 +-
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/scprep/stats.py b/scprep/stats.py
index e2228088..c394eaad 100644
--- a/scprep/stats.py
+++ b/scprep/stats.py
@@ -86,16 +86,20 @@ def pairwise_correlation(X, Y, ignore_nan=False):
     # one-by-one
     N_times_sum_xy = utils.toarray(N * Y.T.dot(X))
     sum_x_times_sum_y = X_colsums * Y_colsums[:, None]
-    var_x = N * utils.matrix_sum(
-        utils.matrix_transform(X, np.power, 2),
-        axis=0,
-        ignore_nan=ignore_nan
-    ) - X_colsums ** 2
-    var_y = N * utils.matrix_sum(
-        utils.matrix_transform(Y, np.power, 2),
-        axis=0,
-        ignore_nan=ignore_nan
-    ) - Y_colsums ** 2
+    var_x = (
+        N
+        * utils.matrix_sum(
+            utils.matrix_transform(X, np.power, 2), axis=0, ignore_nan=ignore_nan
+        )
+        - X_colsums ** 2
+    )
+    var_y = (
+        N
+        * utils.matrix_sum(
+            utils.matrix_transform(Y, np.power, 2), axis=0, ignore_nan=ignore_nan
+        )
+        - Y_colsums ** 2
+    )
     # Finally compute Pearson Correlation Coefficient as 2D array
     cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None])
     return cor.T
diff --git a/test/test_stats.py b/test/test_stats.py
index 0622e0e8..71f5d614 100644
--- a/test/test_stats.py
+++ b/test/test_stats.py
@@ -128,7 +128,7 @@ def test_fun(X, *args, **kwargs):
 
 def test_pairwise_correlation_nan():
     D = np.array([np.arange(10), np.arange(0, 20, 2)])
-    D[:,3] = np.nan
+    D[:, 3] = np.nan
     C = scprep.stats.pairwise_correlation(D, D)
     assert np.all(np.isnan(C))
     C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True)

From 24c4d94f693ce2c862727a64e093557a689390ac Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Tue, 22 Feb 2022 16:24:35 +0000
Subject: [PATCH 06/10] pre-commit

---
 scprep/normalize.py     | 2 +-
 scprep/plot/marker.py   | 4 ++--
 scprep/plot/scree.py    | 2 +-
 scprep/run/slingshot.py | 2 +-
 scprep/run/splatter.py  | 2 +-
 scprep/stats.py         | 2 +-
 scprep/utils.py         | 4 ++--
 test/test_filter.py     | 4 ++--
 8 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/scprep/normalize.py b/scprep/normalize.py
index 90ee56e4..ce39a40c 100644
--- a/scprep/normalize.py
+++ b/scprep/normalize.py
@@ -76,7 +76,7 @@ def library_size_normalize(data, rescale=10000, return_library_size=False):
             # dense data
             data = data.to_numpy()
 
-    calc_libsize = sparse.issparse(data) and (return_library_size or data.nnz > 2**31)
+    calc_libsize = sparse.issparse(data) and (return_library_size or data.nnz > 2 ** 31)
     rescale, libsize = _get_scaled_libsize(data, rescale, calc_libsize)
 
     if libsize is not None:
diff --git a/scprep/plot/marker.py b/scprep/plot/marker.py
index ea38ea44..0ccbf6ad 100644
--- a/scprep/plot/marker.py
+++ b/scprep/plot/marker.py
@@ -79,7 +79,7 @@ def _cluster_tissues(tissue_names, cluster_names, tissue_labels, cluster_labels,
         tissue_features.append(np.concatenate(tissue_data))
     tissue_features = np.array(tissue_features)
     # normalize
-    tissue_features = tissue_features / np.sqrt(np.sum(tissue_features**2))
+    tissue_features = tissue_features / np.sqrt(np.sum(tissue_features ** 2))
     tissues_order = hierarchy.leaves_list(hierarchy.linkage(tissue_features))
     return tissues_order
 
@@ -103,7 +103,7 @@ def _cluster_markers(
                 marker_features.append(np.concatenate([s[marker_idx], c[marker_idx]]))
             marker_features = np.array(marker_features)
             # normalize
-            marker_features = marker_features / np.sqrt(np.sum(marker_features**2))
+            marker_features = marker_features / np.sqrt(np.sum(marker_features ** 2))
             marker_group_order = hierarchy.leaves_list(
                 hierarchy.linkage(marker_features)
             )
diff --git a/scprep/plot/scree.py b/scprep/plot/scree.py
index 339067ee..12c72358 100644
--- a/scprep/plot/scree.py
+++ b/scprep/plot/scree.py
@@ -63,7 +63,7 @@ def scree_plot(
     >>> scprep.plot.scree_plot(singular_values, cumulative=True)
     """
     with temp_fontsize(fontsize):
-        explained_variance = singular_values**2
+        explained_variance = singular_values ** 2
         explained_variance = explained_variance / explained_variance.sum() * 100
         if cumulative:
             explained_variance = np.cumsum(explained_variance)
diff --git a/scprep/run/slingshot.py b/scprep/run/slingshot.py
index 08bd7e84..2ee8247c 100644
--- a/scprep/run/slingshot.py
+++ b/scprep/run/slingshot.py
@@ -212,7 +212,7 @@ def Slingshot(
     ...     ax.plot(curve[:,0], curve[:,1], c='black')
     """
     if seed is None:
-        seed = np.random.randint(2**16 - 1)
+        seed = np.random.randint(2 ** 16 - 1)
     if distance is not None:
         raise NotImplementedError("distance argument not currently implemented")
     np.random.seed(seed)
diff --git a/scprep/run/splatter.py b/scprep/run/splatter.py
index 9b81cf0c..e3137e5c 100644
--- a/scprep/run/splatter.py
+++ b/scprep/run/splatter.py
@@ -282,7 +282,7 @@ def SplatSimulate(
         dropout : Logical matrix showing which values have been dropped in which cells.
     """
     if seed is None:
-        seed = np.random.randint(2**16 - 1)
+        seed = np.random.randint(2 ** 16 - 1)
     if dropout_type == "binomial":
         dropout_type = "none"
     else:
diff --git a/scprep/stats.py b/scprep/stats.py
index 5305eb7e..c394eaad 100644
--- a/scprep/stats.py
+++ b/scprep/stats.py
@@ -455,7 +455,7 @@ def t_statistic(X, Y):
     X, Y = _preprocess_test_matrices(X, Y)
     X_std = utils.matrix_std(X, axis=0)
     Y_std = utils.matrix_std(Y, axis=0)
-    paired_std = np.sqrt(X_std**2 / X.shape[0] + Y_std**2 / Y.shape[0])
+    paired_std = np.sqrt(X_std ** 2 / X.shape[0] + Y_std ** 2 / Y.shape[0])
     return mean_difference(X, Y) / paired_std
 
 
diff --git a/scprep/utils.py b/scprep/utils.py
index 77e89630..18462ab2 100644
--- a/scprep/utils.py
+++ b/scprep/utils.py
@@ -459,7 +459,7 @@ def matrix_std(data, axis=None):
             if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)):
                 data = data.tocoo()
             data_sq = data.copy()
-            data_sq.data = data_sq.data**2
+            data_sq.data = data_sq.data ** 2
             variance = data_sq.mean() - data.mean() ** 2
             std = np.sqrt(variance)
         else:
@@ -475,7 +475,7 @@ def matrix_std(data, axis=None):
             for i in range(N):
                 col = next_fn(i)
                 col_sq = col.copy()
-                col_sq.data = col_sq.data**2
+                col_sq.data = col_sq.data ** 2
                 variance = col_sq.mean() - col.mean() ** 2
                 std.append(np.sqrt(variance))
             std = np.array(std)
diff --git a/test/test_filter.py b/test/test_filter.py
index 8bb95d6f..a251e778 100644
--- a/test/test_filter.py
+++ b/test/test_filter.py
@@ -407,13 +407,13 @@ def test_large_sparse_dataframe_library_size():
     if matrix._pandas_0:
         matrix._ignore_pandas_sparse_warning()
         X = pd.SparseDataFrame(
-            sparse.coo_matrix((10**7, 2 * 10**4)), default_fill_value=0.0
+            sparse.coo_matrix((10 ** 7, 2 * 10 ** 4)), default_fill_value=0.0
         )
         cell_sums = scprep.measure.library_size(X)
         assert cell_sums.shape[0] == X.shape[0]
         matrix._reset_warnings()
     X = matrix.SparseDataFrame(
-        sparse.coo_matrix((10**7, 2 * 10**4)), default_fill_value=0.0
+        sparse.coo_matrix((10 ** 7, 2 * 10 ** 4)), default_fill_value=0.0
     )
     cell_sums = scprep.measure.library_size(X)
     assert cell_sums.shape[0] == X.shape[0]

From 03b22d42f1d37706099394054c55ef76bd565d93 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 9 May 2022 18:29:48 +0000
Subject: [PATCH 07/10] pre-commit

---
 scprep/normalize.py     | 2 +-
 scprep/plot/marker.py   | 4 ++--
 scprep/plot/scree.py    | 2 +-
 scprep/run/slingshot.py | 2 +-
 scprep/run/splatter.py  | 2 +-
 scprep/stats.py         | 6 +++---
 scprep/utils.py         | 4 ++--
 test/test_filter.py     | 4 ++--
 8 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/scprep/normalize.py b/scprep/normalize.py
index ce39a40c..90ee56e4 100644
--- a/scprep/normalize.py
+++ b/scprep/normalize.py
@@ -76,7 +76,7 @@ def library_size_normalize(data, rescale=10000, return_library_size=False):
             # dense data
             data = data.to_numpy()
 
-    calc_libsize = sparse.issparse(data) and (return_library_size or data.nnz > 2 ** 31)
+    calc_libsize = sparse.issparse(data) and (return_library_size or data.nnz > 2**31)
     rescale, libsize = _get_scaled_libsize(data, rescale, calc_libsize)
 
     if libsize is not None:
diff --git a/scprep/plot/marker.py b/scprep/plot/marker.py
index 0ccbf6ad..ea38ea44 100644
--- a/scprep/plot/marker.py
+++ b/scprep/plot/marker.py
@@ -79,7 +79,7 @@ def _cluster_tissues(tissue_names, cluster_names, tissue_labels, cluster_labels,
         tissue_features.append(np.concatenate(tissue_data))
     tissue_features = np.array(tissue_features)
     # normalize
-    tissue_features = tissue_features / np.sqrt(np.sum(tissue_features ** 2))
+    tissue_features = tissue_features / np.sqrt(np.sum(tissue_features**2))
     tissues_order = hierarchy.leaves_list(hierarchy.linkage(tissue_features))
     return tissues_order
 
@@ -103,7 +103,7 @@ def _cluster_markers(
                 marker_features.append(np.concatenate([s[marker_idx], c[marker_idx]]))
             marker_features = np.array(marker_features)
             # normalize
-            marker_features = marker_features / np.sqrt(np.sum(marker_features ** 2))
+            marker_features = marker_features / np.sqrt(np.sum(marker_features**2))
             marker_group_order = hierarchy.leaves_list(
                 hierarchy.linkage(marker_features)
             )
diff --git a/scprep/plot/scree.py b/scprep/plot/scree.py
index 12c72358..339067ee 100644
--- a/scprep/plot/scree.py
+++ b/scprep/plot/scree.py
@@ -63,7 +63,7 @@ def scree_plot(
     >>> scprep.plot.scree_plot(singular_values, cumulative=True)
     """
     with temp_fontsize(fontsize):
-        explained_variance = singular_values ** 2
+        explained_variance = singular_values**2
         explained_variance = explained_variance / explained_variance.sum() * 100
         if cumulative:
             explained_variance = np.cumsum(explained_variance)
diff --git a/scprep/run/slingshot.py b/scprep/run/slingshot.py
index 2ee8247c..08bd7e84 100644
--- a/scprep/run/slingshot.py
+++ b/scprep/run/slingshot.py
@@ -212,7 +212,7 @@ def Slingshot(
     ...     ax.plot(curve[:,0], curve[:,1], c='black')
     """
     if seed is None:
-        seed = np.random.randint(2 ** 16 - 1)
+        seed = np.random.randint(2**16 - 1)
     if distance is not None:
         raise NotImplementedError("distance argument not currently implemented")
     np.random.seed(seed)
diff --git a/scprep/run/splatter.py b/scprep/run/splatter.py
index e3137e5c..9b81cf0c 100644
--- a/scprep/run/splatter.py
+++ b/scprep/run/splatter.py
@@ -282,7 +282,7 @@ def SplatSimulate(
         dropout : Logical matrix showing which values have been dropped in which cells.
     """
     if seed is None:
-        seed = np.random.randint(2 ** 16 - 1)
+        seed = np.random.randint(2**16 - 1)
     if dropout_type == "binomial":
         dropout_type = "none"
     else:
diff --git a/scprep/stats.py b/scprep/stats.py
index c394eaad..a35ebae7 100644
--- a/scprep/stats.py
+++ b/scprep/stats.py
@@ -91,14 +91,14 @@ def pairwise_correlation(X, Y, ignore_nan=False):
         * utils.matrix_sum(
             utils.matrix_transform(X, np.power, 2), axis=0, ignore_nan=ignore_nan
         )
-        - X_colsums ** 2
+        - X_colsums**2
     )
     var_y = (
         N
         * utils.matrix_sum(
             utils.matrix_transform(Y, np.power, 2), axis=0, ignore_nan=ignore_nan
         )
-        - Y_colsums ** 2
+        - Y_colsums**2
     )
     # Finally compute Pearson Correlation Coefficient as 2D array
     cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None])
@@ -455,7 +455,7 @@ def t_statistic(X, Y):
     X, Y = _preprocess_test_matrices(X, Y)
     X_std = utils.matrix_std(X, axis=0)
     Y_std = utils.matrix_std(Y, axis=0)
-    paired_std = np.sqrt(X_std ** 2 / X.shape[0] + Y_std ** 2 / Y.shape[0])
+    paired_std = np.sqrt(X_std**2 / X.shape[0] + Y_std**2 / Y.shape[0])
     return mean_difference(X, Y) / paired_std
 
 
diff --git a/scprep/utils.py b/scprep/utils.py
index 18462ab2..77e89630 100644
--- a/scprep/utils.py
+++ b/scprep/utils.py
@@ -459,7 +459,7 @@ def matrix_std(data, axis=None):
             if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)):
                 data = data.tocoo()
             data_sq = data.copy()
-            data_sq.data = data_sq.data ** 2
+            data_sq.data = data_sq.data**2
             variance = data_sq.mean() - data.mean() ** 2
             std = np.sqrt(variance)
         else:
@@ -475,7 +475,7 @@ def matrix_std(data, axis=None):
             for i in range(N):
                 col = next_fn(i)
                 col_sq = col.copy()
-                col_sq.data = col_sq.data ** 2
+                col_sq.data = col_sq.data**2
                 variance = col_sq.mean() - col.mean() ** 2
                 std.append(np.sqrt(variance))
             std = np.array(std)
diff --git a/test/test_filter.py b/test/test_filter.py
index a251e778..8bb95d6f 100644
--- a/test/test_filter.py
+++ b/test/test_filter.py
@@ -407,13 +407,13 @@ def test_large_sparse_dataframe_library_size():
     if matrix._pandas_0:
         matrix._ignore_pandas_sparse_warning()
         X = pd.SparseDataFrame(
-            sparse.coo_matrix((10 ** 7, 2 * 10 ** 4)), default_fill_value=0.0
+            sparse.coo_matrix((10**7, 2 * 10**4)), default_fill_value=0.0
         )
         cell_sums = scprep.measure.library_size(X)
         assert cell_sums.shape[0] == X.shape[0]
         matrix._reset_warnings()
     X = matrix.SparseDataFrame(
-        sparse.coo_matrix((10 ** 7, 2 * 10 ** 4)), default_fill_value=0.0
+        sparse.coo_matrix((10**7, 2 * 10**4)), default_fill_value=0.0
     )
     cell_sums = scprep.measure.library_size(X)
     assert cell_sums.shape[0] == X.shape[0]

From bb37a49e0480279f8c84d34282afe8aba8bdf6cd Mon Sep 17 00:00:00 2001
From: Scott Gigante <scott.gigante@immunai.com>
Date: Mon, 9 May 2022 16:54:30 -0400
Subject: [PATCH 08/10] test with float

---
 test/test_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_stats.py b/test/test_stats.py
index 71f5d614..5c6d0347 100644
--- a/test/test_stats.py
+++ b/test/test_stats.py
@@ -127,7 +127,7 @@ def test_fun(X, *args, **kwargs):
 
 
 def test_pairwise_correlation_nan():
-    D = np.array([np.arange(10), np.arange(0, 20, 2)])
+    D = np.array([np.arange(10), np.arange(0, 20, 2)]).astype(float)
     D[:, 3] = np.nan
     C = scprep.stats.pairwise_correlation(D, D)
     assert np.all(np.isnan(C))

From 0c34556169a54f404f62f5648dca01bc9438c841 Mon Sep 17 00:00:00 2001
From: Scott Gigante <scott.gigante@immunai.com>
Date: Wed, 11 May 2022 12:14:27 -0400
Subject: [PATCH 09/10] fix tests

---
 scprep/stats.py    | 32 +++++++++++++++++---------------
 scprep/utils.py    | 27 ++++++++++++++++++++++++++-
 test/test_stats.py | 36 ++++++++++++++++++++++++++++--------
 3 files changed, 71 insertions(+), 24 deletions(-)

diff --git a/scprep/stats.py b/scprep/stats.py
index a35ebae7..45807d5e 100644
--- a/scprep/stats.py
+++ b/scprep/stats.py
@@ -1,3 +1,5 @@
+import scipy.sparse
+
 from . import plot
 from . import select
 from . import utils
@@ -84,24 +86,24 @@ def pairwise_correlation(X, Y, ignore_nan=False):
     Y_colsums = utils.matrix_sum(Y, axis=0, ignore_nan=ignore_nan)
     # Basically there are four parts in the formula. We would compute them
     # one-by-one
-    N_times_sum_xy = utils.toarray(N * Y.T.dot(X))
-    sum_x_times_sum_y = X_colsums * Y_colsums[:, None]
-    var_x = (
-        N
-        * utils.matrix_sum(
-            utils.matrix_transform(X, np.power, 2), axis=0, ignore_nan=ignore_nan
-        )
-        - X_colsums**2
+    X_sq_colsums = utils.matrix_sum(
+        utils.matrix_transform(X, np.power, 2), axis=0, ignore_nan=ignore_nan
     )
-    var_y = (
-        N
-        * utils.matrix_sum(
-            utils.matrix_transform(Y, np.power, 2), axis=0, ignore_nan=ignore_nan
-        )
-        - Y_colsums**2
+    Y_sq_colsums = utils.matrix_sum(
+        utils.matrix_transform(Y, np.power, 2), axis=0, ignore_nan=ignore_nan
     )
+    var_x = N * X_sq_colsums - X_colsums**2
+    var_y = N * Y_sq_colsums - Y_colsums**2
+    if ignore_nan:
+        # now that we have the variance computed we can fill in the NaNs
+        X = utils.fillna(X, 0)
+        Y = utils.fillna(Y, 0)
+    N_times_sum_xy = utils.toarray(N * Y.T.dot(X))
+    sum_x_times_sum_y = X_colsums * Y_colsums[:, None]
     # Finally compute Pearson Correlation Coefficient as 2D array
-    cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None])
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", message="invalid value encountered in true_divide", category=RuntimeWarning)
+        cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None])
     return cor.T
 
 
diff --git a/scprep/utils.py b/scprep/utils.py
index 77e89630..c9dd4b4b 100644
--- a/scprep/utils.py
+++ b/scprep/utils.py
@@ -378,6 +378,31 @@ def matrix_transform(data, fun, *args, **kwargs):
     return data
 
 
+def fillna(data, fill, copy=True):
+    return_cls = None
+    if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)):
+        return_cls = type(data)
+        assert copy, f"Cannot fillna in-place for {return_cls.__name__}"
+        data = data.tocsr()
+    elif copy:
+        data = data.copy()
+    if sparse.issparse(data):
+        data.data[np.isnan(data.data)] = fill
+        if return_cls is not None:
+            data = return_cls(data)
+    else:
+        data[np.isnan(data)] = fill
+    return data
+
+
+
+def _nansum(data, axis=None):
+    if sparse.issparse(data):
+        return np.sum(fillna(data, 0), axis=axis)
+    else:
+        return np.nansum(data, axis=axis)
+
+
 def matrix_sum(data, axis=None, ignore_nan=False):
     """Get the column-wise, row-wise, or total sum of values in a matrix.
 
@@ -396,7 +421,7 @@ def matrix_sum(data, axis=None, ignore_nan=False):
     sums : array-like or float
         Sums along desired axis.
     """
-    sum_fn = np.nansum if ignore_nan else np.sum
+    sum_fn = _nansum if ignore_nan else np.sum
     if axis not in [0, 1, None]:
         raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis))
     if isinstance(data, pd.DataFrame):
diff --git a/test/test_stats.py b/test/test_stats.py
index 5c6d0347..6b6ef4af 100644
--- a/test/test_stats.py
+++ b/test/test_stats.py
@@ -1,4 +1,6 @@
 from functools import partial
+
+import scipy.sparse
 from parameterized import parameterized
 from scipy import stats
 from tools import data
@@ -93,7 +95,7 @@ def test_fun(X, *args, **kwargs):
         )
 
     D = data.generate_positive_sparse_matrix(shape=(500, 100), seed=42, poisson_mean=5)
-    Y = test_fun(D)
+    Y = np.corrcoef(D.T)[:,:10]
     assert Y.shape == (D.shape[1], 10)
     assert np.allclose(Y[(np.arange(10), np.arange(10))], 1, atol=0)
     matrix.test_all_matrix_types(
@@ -127,13 +129,31 @@ def test_fun(X, *args, **kwargs):
 
 
 def test_pairwise_correlation_nan():
-    D = np.array([np.arange(10), np.arange(0, 20, 2)]).astype(float)
-    D[:, 3] = np.nan
-    C = scprep.stats.pairwise_correlation(D, D)
-    assert np.all(np.isnan(C))
-    C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True)
-    assert not np.any(np.isnan(C))
-    np.testing.assert_equal(C, 1)
+    D = np.array([np.arange(10), np.arange(0, 20, 2), np.zeros(10)]).astype(float).T
+    D[3,:] = np.nan
+
+    def test_with_nan(D):
+        C = scprep.stats.pairwise_correlation(D, D)
+        assert np.all(np.isnan(C))
+
+    matrix.test_all_matrix_types(
+        D,
+        test_with_nan,
+    )
+
+    def test_with_ignore_nan(D):
+        C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True)
+        # should still be NaN on samples that have no variance
+        assert np.all(np.isnan(C[-1]))
+        assert np.all(np.isnan(C[:,-1]))
+        # but shouldn't be NaN on samples that have some NaNs
+        assert not np.any(np.isnan(C[:2][:,:2]))
+        np.testing.assert_equal(C[:2][:,:2], 1)
+
+    matrix.test_all_matrix_types(
+        D,
+        test_with_ignore_nan,
+    )
 
 
 def shan_entropy(c):

From bae8a82f3af9f3c998e3a2a83d7a43cbc2e787e9 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 11 May 2022 16:15:46 +0000
Subject: [PATCH 10/10] pre-commit

---
 scprep/stats.py    |  9 ++++++---
 scprep/utils.py    |  1 -
 test/test_stats.py | 13 ++++++-------
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/scprep/stats.py b/scprep/stats.py
index 45807d5e..72a29c3e 100644
--- a/scprep/stats.py
+++ b/scprep/stats.py
@@ -1,5 +1,3 @@
-import scipy.sparse
-
 from . import plot
 from . import select
 from . import utils
@@ -13,6 +11,7 @@
 import numbers
 import numpy as np
 import pandas as pd
+import scipy.sparse
 import warnings
 
 plt = matplotlib.pyplot
@@ -102,7 +101,11 @@ def pairwise_correlation(X, Y, ignore_nan=False):
     sum_x_times_sum_y = X_colsums * Y_colsums[:, None]
     # Finally compute Pearson Correlation Coefficient as 2D array
     with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", message="invalid value encountered in true_divide", category=RuntimeWarning)
+        warnings.filterwarnings(
+            "ignore",
+            message="invalid value encountered in true_divide",
+            category=RuntimeWarning,
+        )
         cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None])
     return cor.T
 
diff --git a/scprep/utils.py b/scprep/utils.py
index c9dd4b4b..9cb865e4 100644
--- a/scprep/utils.py
+++ b/scprep/utils.py
@@ -395,7 +395,6 @@ def fillna(data, fill, copy=True):
     return data
 
 
-
 def _nansum(data, axis=None):
     if sparse.issparse(data):
         return np.sum(fillna(data, 0), axis=axis)
diff --git a/test/test_stats.py b/test/test_stats.py
index 6b6ef4af..ae93a952 100644
--- a/test/test_stats.py
+++ b/test/test_stats.py
@@ -1,6 +1,4 @@
 from functools import partial
-
-import scipy.sparse
 from parameterized import parameterized
 from scipy import stats
 from tools import data
@@ -9,6 +7,7 @@
 
 import numpy as np
 import os
+import scipy.sparse
 import scprep
 import warnings
 
@@ -95,7 +94,7 @@ def test_fun(X, *args, **kwargs):
         )
 
     D = data.generate_positive_sparse_matrix(shape=(500, 100), seed=42, poisson_mean=5)
-    Y = np.corrcoef(D.T)[:,:10]
+    Y = np.corrcoef(D.T)[:, :10]
     assert Y.shape == (D.shape[1], 10)
     assert np.allclose(Y[(np.arange(10), np.arange(10))], 1, atol=0)
     matrix.test_all_matrix_types(
@@ -130,7 +129,7 @@ def test_fun(X, *args, **kwargs):
 
 def test_pairwise_correlation_nan():
     D = np.array([np.arange(10), np.arange(0, 20, 2), np.zeros(10)]).astype(float).T
-    D[3,:] = np.nan
+    D[3, :] = np.nan
 
     def test_with_nan(D):
         C = scprep.stats.pairwise_correlation(D, D)
@@ -145,10 +144,10 @@ def test_with_ignore_nan(D):
         C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True)
         # should still be NaN on samples that have no variance
         assert np.all(np.isnan(C[-1]))
-        assert np.all(np.isnan(C[:,-1]))
+        assert np.all(np.isnan(C[:, -1]))
         # but shouldn't be NaN on samples that have some NaNs
-        assert not np.any(np.isnan(C[:2][:,:2]))
-        np.testing.assert_equal(C[:2][:,:2], 1)
+        assert not np.any(np.isnan(C[:2][:, :2]))
+        np.testing.assert_equal(C[:2][:, :2], 1)
 
     matrix.test_all_matrix_types(
         D,