From 9409a3023bb3643ccbbe5804e559c1b8d32774cd Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 22 Feb 2022 11:05:54 -0500 Subject: [PATCH 01/10] add `ignore_nan` arg to `utils.matrix_transform` --- scprep/utils.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/scprep/utils.py b/scprep/utils.py index 23a174b4..18462ab2 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -378,7 +378,7 @@ def matrix_transform(data, fun, *args, **kwargs): return data -def matrix_sum(data, axis=None): +def matrix_sum(data, axis=None, ignore_nan=False): """Get the column-wise, row-wise, or total sum of values in a matrix. Parameters @@ -388,37 +388,40 @@ def matrix_sum(data, axis=None): axis : int or None, optional (default: None) Axis across which to sum. axis=0 gives column sums, axis=1 gives row sums. None gives the total sum. + ignore_nan : bool, optional (default: False) + If True, uses `np.nansum` instead of `np.sum` Returns ------- sums : array-like or float Sums along desired axis. """ + sum_fn = np.nansum if ignore_nan else np.sum if axis not in [0, 1, None]: raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis)) if isinstance(data, pd.DataFrame): if is_SparseDataFrame(data): if axis is None: - sums = data.to_coo().sum() + sums = sum_fn(data.to_coo()) else: index = data.index if axis == 1 else data.columns sums = pd.Series( - np.array(data.to_coo().sum(axis)).flatten(), index=index + np.array(sum_fn(data.to_coo(), axis)).flatten(), index=index ) elif is_sparse_dataframe(data): if axis is None: - sums = data.sparse.to_coo().sum() + sums = sum_fn(data.sparse.to_coo()) else: index = data.index if axis == 1 else data.columns sums = pd.Series( - np.array(data.sparse.to_coo().sum(axis)).flatten(), index=index + np.array(sum_fn(data.sparse.to_coo(), axis)).flatten(), index=index ) elif axis is None: - sums = data.to_numpy().sum() + sums = sum_fn(data.to_numpy()) else: - sums = data.sum(axis) + sums = sum_fn(data, axis) else: - sums = np.sum(data, axis=axis) + sums = sum_fn(data, axis=axis) if isinstance(sums, np.matrix): sums = np.array(sums).flatten() return sums From 8245536f01ea58a3e922220cdbb4c850eec73f5b Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 22 Feb 2022 11:08:43 -0500 Subject: [PATCH 02/10] Add `ignore_nan` to `pairwise_correlation` --- scprep/stats.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/scprep/stats.py b/scprep/stats.py index 7f4f4faa..e2228088 100644 --- a/scprep/stats.py +++ b/scprep/stats.py @@ -50,7 +50,7 @@ def EMD(x, y): return stats.wasserstein_distance(x, y) -def pairwise_correlation(X, Y): +def pairwise_correlation(X, Y, ignore_nan=False): """Compute pairwise Pearson correlation between columns of two matrices. From https://stackoverflow.com/a/33651442/3996580 @@ -61,6 +61,8 @@ def pairwise_correlation(X, Y): Input data Y : array-like, shape=[n_samples, p_features] Input data + ignore_nan : bool, optional (default: False) + If True, ignore NaNs, computing correlation over remaining values Returns ------- @@ -78,18 +80,22 @@ def pairwise_correlation(X, Y): if sparse.issparse(Y) and not sparse.issparse(X): X = sparse.csr_matrix(X) # Store columnw-wise in X and Y, as they would be used at few places - X_colsums = utils.matrix_sum(X, axis=0) - Y_colsums = utils.matrix_sum(Y, axis=0) + X_colsums = utils.matrix_sum(X, axis=0, ignore_nan=ignore_nan) + Y_colsums = utils.matrix_sum(Y, axis=0, ignore_nan=ignore_nan) # Basically there are four parts in the formula. We would compute them # one-by-one N_times_sum_xy = utils.toarray(N * Y.T.dot(X)) sum_x_times_sum_y = X_colsums * Y_colsums[:, None] - var_x = N * utils.matrix_sum(utils.matrix_transform(X, np.power, 2), axis=0) - ( - X_colsums ** 2 - ) - var_y = N * utils.matrix_sum(utils.matrix_transform(Y, np.power, 2), axis=0) - ( - Y_colsums ** 2 - ) + var_x = N * utils.matrix_sum( + utils.matrix_transform(X, np.power, 2), + axis=0, + ignore_nan=ignore_nan + ) - X_colsums ** 2 + var_y = N * utils.matrix_sum( + utils.matrix_transform(Y, np.power, 2), + axis=0, + ignore_nan=ignore_nan + ) - Y_colsums ** 2 # Finally compute Pearson Correlation Coefficient as 2D array cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None]) return cor.T From 4a9159edf6431071fb766c6af7d1adde25d89ab6 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 22 Feb 2022 11:13:55 -0500 Subject: [PATCH 03/10] Test `ignore_nan` --- test/test_stats.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/test_stats.py b/test/test_stats.py index 58455caf..4510a967 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -126,6 +126,15 @@ def test_fun(X, *args, **kwargs): ) +def test_pairwise_correlation_nan(): + D = np.array([np.arange(10), np.arange(0, 20, 2)]) + D[:,3] = np.nan + assert np.all(np.isnan(scprep.stats.pairwise_correlation(D, D))) + C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True) + assert not np.any(np.isnan(C)) + np.testing.assert_equal(C, 1) + + def shan_entropy(c): c_normalized = c / float(np.sum(c)) return stats.entropy(c_normalized[c_normalized != 0]) From 2b6bb3bdb5bad57b7208a023d9eb3e5ff80ef7c2 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 22 Feb 2022 11:17:45 -0500 Subject: [PATCH 04/10] Trigger actions workflow --- test/test_stats.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_stats.py b/test/test_stats.py index 4510a967..0622e0e8 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -129,7 +129,8 @@ def test_fun(X, *args, **kwargs): def test_pairwise_correlation_nan(): D = np.array([np.arange(10), np.arange(0, 20, 2)]) D[:,3] = np.nan - assert np.all(np.isnan(scprep.stats.pairwise_correlation(D, D))) + C = scprep.stats.pairwise_correlation(D, D) + assert np.all(np.isnan(C)) C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True) assert not np.any(np.isnan(C)) np.testing.assert_equal(C, 1) From 8ada936518a0008b2e7fab565208d34f56979366 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 22 Feb 2022 16:19:07 +0000 Subject: [PATCH 05/10] pre-commit --- scprep/stats.py | 24 ++++++++++++++---------- test/test_stats.py | 2 +- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/scprep/stats.py b/scprep/stats.py index e2228088..c394eaad 100644 --- a/scprep/stats.py +++ b/scprep/stats.py @@ -86,16 +86,20 @@ def pairwise_correlation(X, Y, ignore_nan=False): # one-by-one N_times_sum_xy = utils.toarray(N * Y.T.dot(X)) sum_x_times_sum_y = X_colsums * Y_colsums[:, None] - var_x = N * utils.matrix_sum( - utils.matrix_transform(X, np.power, 2), - axis=0, - ignore_nan=ignore_nan - ) - X_colsums ** 2 - var_y = N * utils.matrix_sum( - utils.matrix_transform(Y, np.power, 2), - axis=0, - ignore_nan=ignore_nan - ) - Y_colsums ** 2 + var_x = ( + N + * utils.matrix_sum( + utils.matrix_transform(X, np.power, 2), axis=0, ignore_nan=ignore_nan + ) + - X_colsums ** 2 + ) + var_y = ( + N + * utils.matrix_sum( + utils.matrix_transform(Y, np.power, 2), axis=0, ignore_nan=ignore_nan + ) + - Y_colsums ** 2 + ) # Finally compute Pearson Correlation Coefficient as 2D array cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None]) return cor.T diff --git a/test/test_stats.py b/test/test_stats.py index 0622e0e8..71f5d614 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -128,7 +128,7 @@ def test_fun(X, *args, **kwargs): def test_pairwise_correlation_nan(): D = np.array([np.arange(10), np.arange(0, 20, 2)]) - D[:,3] = np.nan + D[:, 3] = np.nan C = scprep.stats.pairwise_correlation(D, D) assert np.all(np.isnan(C)) C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True) From 24c4d94f693ce2c862727a64e093557a689390ac Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 22 Feb 2022 16:24:35 +0000 Subject: [PATCH 06/10] pre-commit --- scprep/normalize.py | 2 +- scprep/plot/marker.py | 4 ++-- scprep/plot/scree.py | 2 +- scprep/run/slingshot.py | 2 +- scprep/run/splatter.py | 2 +- scprep/stats.py | 2 +- scprep/utils.py | 4 ++-- test/test_filter.py | 4 ++-- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/scprep/normalize.py b/scprep/normalize.py index 90ee56e4..ce39a40c 100644 --- a/scprep/normalize.py +++ b/scprep/normalize.py @@ -76,7 +76,7 @@ def library_size_normalize(data, rescale=10000, return_library_size=False): # dense data data = data.to_numpy() - calc_libsize = sparse.issparse(data) and (return_library_size or data.nnz > 2**31) + calc_libsize = sparse.issparse(data) and (return_library_size or data.nnz > 2 ** 31) rescale, libsize = _get_scaled_libsize(data, rescale, calc_libsize) if libsize is not None: diff --git a/scprep/plot/marker.py b/scprep/plot/marker.py index ea38ea44..0ccbf6ad 100644 --- a/scprep/plot/marker.py +++ b/scprep/plot/marker.py @@ -79,7 +79,7 @@ def _cluster_tissues(tissue_names, cluster_names, tissue_labels, cluster_labels, tissue_features.append(np.concatenate(tissue_data)) tissue_features = np.array(tissue_features) # normalize - tissue_features = tissue_features / np.sqrt(np.sum(tissue_features**2)) + tissue_features = tissue_features / np.sqrt(np.sum(tissue_features ** 2)) tissues_order = hierarchy.leaves_list(hierarchy.linkage(tissue_features)) return tissues_order @@ -103,7 +103,7 @@ def _cluster_markers( marker_features.append(np.concatenate([s[marker_idx], c[marker_idx]])) marker_features = np.array(marker_features) # normalize - marker_features = marker_features / np.sqrt(np.sum(marker_features**2)) + marker_features = marker_features / np.sqrt(np.sum(marker_features ** 2)) marker_group_order = hierarchy.leaves_list( hierarchy.linkage(marker_features) ) diff --git a/scprep/plot/scree.py b/scprep/plot/scree.py index 339067ee..12c72358 100644 --- a/scprep/plot/scree.py +++ b/scprep/plot/scree.py @@ -63,7 +63,7 @@ def scree_plot( >>> scprep.plot.scree_plot(singular_values, cumulative=True) """ with temp_fontsize(fontsize): - explained_variance = singular_values**2 + explained_variance = singular_values ** 2 explained_variance = explained_variance / explained_variance.sum() * 100 if cumulative: explained_variance = np.cumsum(explained_variance) diff --git a/scprep/run/slingshot.py b/scprep/run/slingshot.py index 08bd7e84..2ee8247c 100644 --- a/scprep/run/slingshot.py +++ b/scprep/run/slingshot.py @@ -212,7 +212,7 @@ def Slingshot( ... ax.plot(curve[:,0], curve[:,1], c='black') """ if seed is None: - seed = np.random.randint(2**16 - 1) + seed = np.random.randint(2 ** 16 - 1) if distance is not None: raise NotImplementedError("distance argument not currently implemented") np.random.seed(seed) diff --git a/scprep/run/splatter.py b/scprep/run/splatter.py index 9b81cf0c..e3137e5c 100644 --- a/scprep/run/splatter.py +++ b/scprep/run/splatter.py @@ -282,7 +282,7 @@ def SplatSimulate( dropout : Logical matrix showing which values have been dropped in which cells. """ if seed is None: - seed = np.random.randint(2**16 - 1) + seed = np.random.randint(2 ** 16 - 1) if dropout_type == "binomial": dropout_type = "none" else: diff --git a/scprep/stats.py b/scprep/stats.py index 5305eb7e..c394eaad 100644 --- a/scprep/stats.py +++ b/scprep/stats.py @@ -455,7 +455,7 @@ def t_statistic(X, Y): X, Y = _preprocess_test_matrices(X, Y) X_std = utils.matrix_std(X, axis=0) Y_std = utils.matrix_std(Y, axis=0) - paired_std = np.sqrt(X_std**2 / X.shape[0] + Y_std**2 / Y.shape[0]) + paired_std = np.sqrt(X_std ** 2 / X.shape[0] + Y_std ** 2 / Y.shape[0]) return mean_difference(X, Y) / paired_std diff --git a/scprep/utils.py b/scprep/utils.py index 77e89630..18462ab2 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -459,7 +459,7 @@ def matrix_std(data, axis=None): if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)): data = data.tocoo() data_sq = data.copy() - data_sq.data = data_sq.data**2 + data_sq.data = data_sq.data ** 2 variance = data_sq.mean() - data.mean() ** 2 std = np.sqrt(variance) else: @@ -475,7 +475,7 @@ def matrix_std(data, axis=None): for i in range(N): col = next_fn(i) col_sq = col.copy() - col_sq.data = col_sq.data**2 + col_sq.data = col_sq.data ** 2 variance = col_sq.mean() - col.mean() ** 2 std.append(np.sqrt(variance)) std = np.array(std) diff --git a/test/test_filter.py b/test/test_filter.py index 8bb95d6f..a251e778 100644 --- a/test/test_filter.py +++ b/test/test_filter.py @@ -407,13 +407,13 @@ def test_large_sparse_dataframe_library_size(): if matrix._pandas_0: matrix._ignore_pandas_sparse_warning() X = pd.SparseDataFrame( - sparse.coo_matrix((10**7, 2 * 10**4)), default_fill_value=0.0 + sparse.coo_matrix((10 ** 7, 2 * 10 ** 4)), default_fill_value=0.0 ) cell_sums = scprep.measure.library_size(X) assert cell_sums.shape[0] == X.shape[0] matrix._reset_warnings() X = matrix.SparseDataFrame( - sparse.coo_matrix((10**7, 2 * 10**4)), default_fill_value=0.0 + sparse.coo_matrix((10 ** 7, 2 * 10 ** 4)), default_fill_value=0.0 ) cell_sums = scprep.measure.library_size(X) assert cell_sums.shape[0] == X.shape[0] From 03b22d42f1d37706099394054c55ef76bd565d93 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 9 May 2022 18:29:48 +0000 Subject: [PATCH 07/10] pre-commit --- scprep/normalize.py | 2 +- scprep/plot/marker.py | 4 ++-- scprep/plot/scree.py | 2 +- scprep/run/slingshot.py | 2 +- scprep/run/splatter.py | 2 +- scprep/stats.py | 6 +++--- scprep/utils.py | 4 ++-- test/test_filter.py | 4 ++-- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/scprep/normalize.py b/scprep/normalize.py index ce39a40c..90ee56e4 100644 --- a/scprep/normalize.py +++ b/scprep/normalize.py @@ -76,7 +76,7 @@ def library_size_normalize(data, rescale=10000, return_library_size=False): # dense data data = data.to_numpy() - calc_libsize = sparse.issparse(data) and (return_library_size or data.nnz > 2 ** 31) + calc_libsize = sparse.issparse(data) and (return_library_size or data.nnz > 2**31) rescale, libsize = _get_scaled_libsize(data, rescale, calc_libsize) if libsize is not None: diff --git a/scprep/plot/marker.py b/scprep/plot/marker.py index 0ccbf6ad..ea38ea44 100644 --- a/scprep/plot/marker.py +++ b/scprep/plot/marker.py @@ -79,7 +79,7 @@ def _cluster_tissues(tissue_names, cluster_names, tissue_labels, cluster_labels, tissue_features.append(np.concatenate(tissue_data)) tissue_features = np.array(tissue_features) # normalize - tissue_features = tissue_features / np.sqrt(np.sum(tissue_features ** 2)) + tissue_features = tissue_features / np.sqrt(np.sum(tissue_features**2)) tissues_order = hierarchy.leaves_list(hierarchy.linkage(tissue_features)) return tissues_order @@ -103,7 +103,7 @@ def _cluster_markers( marker_features.append(np.concatenate([s[marker_idx], c[marker_idx]])) marker_features = np.array(marker_features) # normalize - marker_features = marker_features / np.sqrt(np.sum(marker_features ** 2)) + marker_features = marker_features / np.sqrt(np.sum(marker_features**2)) marker_group_order = hierarchy.leaves_list( hierarchy.linkage(marker_features) ) diff --git a/scprep/plot/scree.py b/scprep/plot/scree.py index 12c72358..339067ee 100644 --- a/scprep/plot/scree.py +++ b/scprep/plot/scree.py @@ -63,7 +63,7 @@ def scree_plot( >>> scprep.plot.scree_plot(singular_values, cumulative=True) """ with temp_fontsize(fontsize): - explained_variance = singular_values ** 2 + explained_variance = singular_values**2 explained_variance = explained_variance / explained_variance.sum() * 100 if cumulative: explained_variance = np.cumsum(explained_variance) diff --git a/scprep/run/slingshot.py b/scprep/run/slingshot.py index 2ee8247c..08bd7e84 100644 --- a/scprep/run/slingshot.py +++ b/scprep/run/slingshot.py @@ -212,7 +212,7 @@ def Slingshot( ... ax.plot(curve[:,0], curve[:,1], c='black') """ if seed is None: - seed = np.random.randint(2 ** 16 - 1) + seed = np.random.randint(2**16 - 1) if distance is not None: raise NotImplementedError("distance argument not currently implemented") np.random.seed(seed) diff --git a/scprep/run/splatter.py b/scprep/run/splatter.py index e3137e5c..9b81cf0c 100644 --- a/scprep/run/splatter.py +++ b/scprep/run/splatter.py @@ -282,7 +282,7 @@ def SplatSimulate( dropout : Logical matrix showing which values have been dropped in which cells. """ if seed is None: - seed = np.random.randint(2 ** 16 - 1) + seed = np.random.randint(2**16 - 1) if dropout_type == "binomial": dropout_type = "none" else: diff --git a/scprep/stats.py b/scprep/stats.py index c394eaad..a35ebae7 100644 --- a/scprep/stats.py +++ b/scprep/stats.py @@ -91,14 +91,14 @@ def pairwise_correlation(X, Y, ignore_nan=False): * utils.matrix_sum( utils.matrix_transform(X, np.power, 2), axis=0, ignore_nan=ignore_nan ) - - X_colsums ** 2 + - X_colsums**2 ) var_y = ( N * utils.matrix_sum( utils.matrix_transform(Y, np.power, 2), axis=0, ignore_nan=ignore_nan ) - - Y_colsums ** 2 + - Y_colsums**2 ) # Finally compute Pearson Correlation Coefficient as 2D array cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None]) @@ -455,7 +455,7 @@ def t_statistic(X, Y): X, Y = _preprocess_test_matrices(X, Y) X_std = utils.matrix_std(X, axis=0) Y_std = utils.matrix_std(Y, axis=0) - paired_std = np.sqrt(X_std ** 2 / X.shape[0] + Y_std ** 2 / Y.shape[0]) + paired_std = np.sqrt(X_std**2 / X.shape[0] + Y_std**2 / Y.shape[0]) return mean_difference(X, Y) / paired_std diff --git a/scprep/utils.py b/scprep/utils.py index 18462ab2..77e89630 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -459,7 +459,7 @@ def matrix_std(data, axis=None): if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)): data = data.tocoo() data_sq = data.copy() - data_sq.data = data_sq.data ** 2 + data_sq.data = data_sq.data**2 variance = data_sq.mean() - data.mean() ** 2 std = np.sqrt(variance) else: @@ -475,7 +475,7 @@ def matrix_std(data, axis=None): for i in range(N): col = next_fn(i) col_sq = col.copy() - col_sq.data = col_sq.data ** 2 + col_sq.data = col_sq.data**2 variance = col_sq.mean() - col.mean() ** 2 std.append(np.sqrt(variance)) std = np.array(std) diff --git a/test/test_filter.py b/test/test_filter.py index a251e778..8bb95d6f 100644 --- a/test/test_filter.py +++ b/test/test_filter.py @@ -407,13 +407,13 @@ def test_large_sparse_dataframe_library_size(): if matrix._pandas_0: matrix._ignore_pandas_sparse_warning() X = pd.SparseDataFrame( - sparse.coo_matrix((10 ** 7, 2 * 10 ** 4)), default_fill_value=0.0 + sparse.coo_matrix((10**7, 2 * 10**4)), default_fill_value=0.0 ) cell_sums = scprep.measure.library_size(X) assert cell_sums.shape[0] == X.shape[0] matrix._reset_warnings() X = matrix.SparseDataFrame( - sparse.coo_matrix((10 ** 7, 2 * 10 ** 4)), default_fill_value=0.0 + sparse.coo_matrix((10**7, 2 * 10**4)), default_fill_value=0.0 ) cell_sums = scprep.measure.library_size(X) assert cell_sums.shape[0] == X.shape[0] From bb37a49e0480279f8c84d34282afe8aba8bdf6cd Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 9 May 2022 16:54:30 -0400 Subject: [PATCH 08/10] test with float --- test/test_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_stats.py b/test/test_stats.py index 71f5d614..5c6d0347 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -127,7 +127,7 @@ def test_fun(X, *args, **kwargs): def test_pairwise_correlation_nan(): - D = np.array([np.arange(10), np.arange(0, 20, 2)]) + D = np.array([np.arange(10), np.arange(0, 20, 2)]).astype(float) D[:, 3] = np.nan C = scprep.stats.pairwise_correlation(D, D) assert np.all(np.isnan(C)) From 0c34556169a54f404f62f5648dca01bc9438c841 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 11 May 2022 12:14:27 -0400 Subject: [PATCH 09/10] fix tests --- scprep/stats.py | 32 +++++++++++++++++--------------- scprep/utils.py | 27 ++++++++++++++++++++++++++- test/test_stats.py | 36 ++++++++++++++++++++++++++++-------- 3 files changed, 71 insertions(+), 24 deletions(-) diff --git a/scprep/stats.py b/scprep/stats.py index a35ebae7..45807d5e 100644 --- a/scprep/stats.py +++ b/scprep/stats.py @@ -1,3 +1,5 @@ +import scipy.sparse + from . import plot from . import select from . import utils @@ -84,24 +86,24 @@ def pairwise_correlation(X, Y, ignore_nan=False): Y_colsums = utils.matrix_sum(Y, axis=0, ignore_nan=ignore_nan) # Basically there are four parts in the formula. We would compute them # one-by-one - N_times_sum_xy = utils.toarray(N * Y.T.dot(X)) - sum_x_times_sum_y = X_colsums * Y_colsums[:, None] - var_x = ( - N - * utils.matrix_sum( - utils.matrix_transform(X, np.power, 2), axis=0, ignore_nan=ignore_nan - ) - - X_colsums**2 + X_sq_colsums = utils.matrix_sum( + utils.matrix_transform(X, np.power, 2), axis=0, ignore_nan=ignore_nan ) - var_y = ( - N - * utils.matrix_sum( - utils.matrix_transform(Y, np.power, 2), axis=0, ignore_nan=ignore_nan - ) - - Y_colsums**2 + Y_sq_colsums = utils.matrix_sum( + utils.matrix_transform(Y, np.power, 2), axis=0, ignore_nan=ignore_nan ) + var_x = N * X_sq_colsums - X_colsums**2 + var_y = N * Y_sq_colsums - Y_colsums**2 + if ignore_nan: + # now that we have the variance computed we can fill in the NaNs + X = utils.fillna(X, 0) + Y = utils.fillna(Y, 0) + N_times_sum_xy = utils.toarray(N * Y.T.dot(X)) + sum_x_times_sum_y = X_colsums * Y_colsums[:, None] # Finally compute Pearson Correlation Coefficient as 2D array - cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None]) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="invalid value encountered in true_divide", category=RuntimeWarning) + cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None]) return cor.T diff --git a/scprep/utils.py b/scprep/utils.py index 77e89630..c9dd4b4b 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -378,6 +378,31 @@ def matrix_transform(data, fun, *args, **kwargs): return data +def fillna(data, fill, copy=True): + return_cls = None + if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)): + return_cls = type(data) + assert copy, f"Cannot fillna in-place for {return_cls.__name__}" + data = data.tocsr() + elif copy: + data = data.copy() + if sparse.issparse(data): + data.data[np.isnan(data.data)] = fill + if return_cls is not None: + data = return_cls(data) + else: + data[np.isnan(data)] = fill + return data + + + +def _nansum(data, axis=None): + if sparse.issparse(data): + return np.sum(fillna(data, 0), axis=axis) + else: + return np.nansum(data, axis=axis) + + def matrix_sum(data, axis=None, ignore_nan=False): """Get the column-wise, row-wise, or total sum of values in a matrix. @@ -396,7 +421,7 @@ def matrix_sum(data, axis=None, ignore_nan=False): sums : array-like or float Sums along desired axis. """ - sum_fn = np.nansum if ignore_nan else np.sum + sum_fn = _nansum if ignore_nan else np.sum if axis not in [0, 1, None]: raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis)) if isinstance(data, pd.DataFrame): diff --git a/test/test_stats.py b/test/test_stats.py index 5c6d0347..6b6ef4af 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -1,4 +1,6 @@ from functools import partial + +import scipy.sparse from parameterized import parameterized from scipy import stats from tools import data @@ -93,7 +95,7 @@ def test_fun(X, *args, **kwargs): ) D = data.generate_positive_sparse_matrix(shape=(500, 100), seed=42, poisson_mean=5) - Y = test_fun(D) + Y = np.corrcoef(D.T)[:,:10] assert Y.shape == (D.shape[1], 10) assert np.allclose(Y[(np.arange(10), np.arange(10))], 1, atol=0) matrix.test_all_matrix_types( @@ -127,13 +129,31 @@ def test_fun(X, *args, **kwargs): def test_pairwise_correlation_nan(): - D = np.array([np.arange(10), np.arange(0, 20, 2)]).astype(float) - D[:, 3] = np.nan - C = scprep.stats.pairwise_correlation(D, D) - assert np.all(np.isnan(C)) - C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True) - assert not np.any(np.isnan(C)) - np.testing.assert_equal(C, 1) + D = np.array([np.arange(10), np.arange(0, 20, 2), np.zeros(10)]).astype(float).T + D[3,:] = np.nan + + def test_with_nan(D): + C = scprep.stats.pairwise_correlation(D, D) + assert np.all(np.isnan(C)) + + matrix.test_all_matrix_types( + D, + test_with_nan, + ) + + def test_with_ignore_nan(D): + C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True) + # should still be NaN on samples that have no variance + assert np.all(np.isnan(C[-1])) + assert np.all(np.isnan(C[:,-1])) + # but shouldn't be NaN on samples that have some NaNs + assert not np.any(np.isnan(C[:2][:,:2])) + np.testing.assert_equal(C[:2][:,:2], 1) + + matrix.test_all_matrix_types( + D, + test_with_ignore_nan, + ) def shan_entropy(c): From bae8a82f3af9f3c998e3a2a83d7a43cbc2e787e9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 11 May 2022 16:15:46 +0000 Subject: [PATCH 10/10] pre-commit --- scprep/stats.py | 9 ++++++--- scprep/utils.py | 1 - test/test_stats.py | 13 ++++++------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/scprep/stats.py b/scprep/stats.py index 45807d5e..72a29c3e 100644 --- a/scprep/stats.py +++ b/scprep/stats.py @@ -1,5 +1,3 @@ -import scipy.sparse - from . import plot from . import select from . import utils @@ -13,6 +11,7 @@ import numbers import numpy as np import pandas as pd +import scipy.sparse import warnings plt = matplotlib.pyplot @@ -102,7 +101,11 @@ def pairwise_correlation(X, Y, ignore_nan=False): sum_x_times_sum_y = X_colsums * Y_colsums[:, None] # Finally compute Pearson Correlation Coefficient as 2D array with warnings.catch_warnings(): - warnings.filterwarnings("ignore", message="invalid value encountered in true_divide", category=RuntimeWarning) + warnings.filterwarnings( + "ignore", + message="invalid value encountered in true_divide", + category=RuntimeWarning, + ) cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None]) return cor.T diff --git a/scprep/utils.py b/scprep/utils.py index c9dd4b4b..9cb865e4 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -395,7 +395,6 @@ def fillna(data, fill, copy=True): return data - def _nansum(data, axis=None): if sparse.issparse(data): return np.sum(fillna(data, 0), axis=axis) diff --git a/test/test_stats.py b/test/test_stats.py index 6b6ef4af..ae93a952 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -1,6 +1,4 @@ from functools import partial - -import scipy.sparse from parameterized import parameterized from scipy import stats from tools import data @@ -9,6 +7,7 @@ import numpy as np import os +import scipy.sparse import scprep import warnings @@ -95,7 +94,7 @@ def test_fun(X, *args, **kwargs): ) D = data.generate_positive_sparse_matrix(shape=(500, 100), seed=42, poisson_mean=5) - Y = np.corrcoef(D.T)[:,:10] + Y = np.corrcoef(D.T)[:, :10] assert Y.shape == (D.shape[1], 10) assert np.allclose(Y[(np.arange(10), np.arange(10))], 1, atol=0) matrix.test_all_matrix_types( @@ -130,7 +129,7 @@ def test_fun(X, *args, **kwargs): def test_pairwise_correlation_nan(): D = np.array([np.arange(10), np.arange(0, 20, 2), np.zeros(10)]).astype(float).T - D[3,:] = np.nan + D[3, :] = np.nan def test_with_nan(D): C = scprep.stats.pairwise_correlation(D, D) @@ -145,10 +144,10 @@ def test_with_ignore_nan(D): C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True) # should still be NaN on samples that have no variance assert np.all(np.isnan(C[-1])) - assert np.all(np.isnan(C[:,-1])) + assert np.all(np.isnan(C[:, -1])) # but shouldn't be NaN on samples that have some NaNs - assert not np.any(np.isnan(C[:2][:,:2])) - np.testing.assert_equal(C[:2][:,:2], 1) + assert not np.any(np.isnan(C[:2][:, :2])) + np.testing.assert_equal(C[:2][:, :2], 1) matrix.test_all_matrix_types( D,