Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ignore_nan arg to scprep.stats.pairwise_correlation #123

Merged
merged 13 commits into from
May 11, 2022
35 changes: 25 additions & 10 deletions scprep/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import numbers
import numpy as np
import pandas as pd
import scipy.sparse
import warnings

plt = matplotlib.pyplot
Expand Down Expand Up @@ -50,7 +51,7 @@ def EMD(x, y):
return stats.wasserstein_distance(x, y)


def pairwise_correlation(X, Y):
def pairwise_correlation(X, Y, ignore_nan=False):
"""Compute pairwise Pearson correlation between columns of two matrices.

From https://stackoverflow.com/a/33651442/3996580
Expand All @@ -61,6 +62,8 @@ def pairwise_correlation(X, Y):
Input data
Y : array-like, shape=[n_samples, p_features]
Input data
ignore_nan : bool, optional (default: False)
If True, ignore NaNs, computing correlation over remaining values

Returns
-------
Expand All @@ -78,20 +81,32 @@ def pairwise_correlation(X, Y):
if sparse.issparse(Y) and not sparse.issparse(X):
X = sparse.csr_matrix(X)
# Store columnw-wise in X and Y, as they would be used at few places
X_colsums = utils.matrix_sum(X, axis=0)
Y_colsums = utils.matrix_sum(Y, axis=0)
X_colsums = utils.matrix_sum(X, axis=0, ignore_nan=ignore_nan)
Y_colsums = utils.matrix_sum(Y, axis=0, ignore_nan=ignore_nan)
# Basically there are four parts in the formula. We would compute them
# one-by-one
N_times_sum_xy = utils.toarray(N * Y.T.dot(X))
sum_x_times_sum_y = X_colsums * Y_colsums[:, None]
var_x = N * utils.matrix_sum(utils.matrix_transform(X, np.power, 2), axis=0) - (
X_colsums**2
X_sq_colsums = utils.matrix_sum(
utils.matrix_transform(X, np.power, 2), axis=0, ignore_nan=ignore_nan
)
var_y = N * utils.matrix_sum(utils.matrix_transform(Y, np.power, 2), axis=0) - (
Y_colsums**2
Y_sq_colsums = utils.matrix_sum(
utils.matrix_transform(Y, np.power, 2), axis=0, ignore_nan=ignore_nan
)
var_x = N * X_sq_colsums - X_colsums**2
var_y = N * Y_sq_colsums - Y_colsums**2
if ignore_nan:
# now that we have the variance computed we can fill in the NaNs
X = utils.fillna(X, 0)
Y = utils.fillna(Y, 0)
N_times_sum_xy = utils.toarray(N * Y.T.dot(X))
sum_x_times_sum_y = X_colsums * Y_colsums[:, None]
# Finally compute Pearson Correlation Coefficient as 2D array
cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None])
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message="invalid value encountered in true_divide",
category=RuntimeWarning,
)
cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None])
return cor.T


Expand Down
43 changes: 35 additions & 8 deletions scprep/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,31 @@ def matrix_transform(data, fun, *args, **kwargs):
return data


def matrix_sum(data, axis=None):
def fillna(data, fill, copy=True):
return_cls = None
if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)):
return_cls = type(data)
assert copy, f"Cannot fillna in-place for {return_cls.__name__}"
data = data.tocsr()
elif copy:
data = data.copy()
if sparse.issparse(data):
data.data[np.isnan(data.data)] = fill
if return_cls is not None:
data = return_cls(data)
else:
data[np.isnan(data)] = fill
return data


def _nansum(data, axis=None):
if sparse.issparse(data):
return np.sum(fillna(data, 0), axis=axis)
else:
return np.nansum(data, axis=axis)


def matrix_sum(data, axis=None, ignore_nan=False):
"""Get the column-wise, row-wise, or total sum of values in a matrix.

Parameters
Expand All @@ -388,37 +412,40 @@ def matrix_sum(data, axis=None):
axis : int or None, optional (default: None)
Axis across which to sum. axis=0 gives column sums,
axis=1 gives row sums. None gives the total sum.
ignore_nan : bool, optional (default: False)
If True, uses `np.nansum` instead of `np.sum`

Returns
-------
sums : array-like or float
Sums along desired axis.
"""
sum_fn = _nansum if ignore_nan else np.sum
if axis not in [0, 1, None]:
raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis))
if isinstance(data, pd.DataFrame):
if is_SparseDataFrame(data):
if axis is None:
sums = data.to_coo().sum()
sums = sum_fn(data.to_coo())
else:
index = data.index if axis == 1 else data.columns
sums = pd.Series(
np.array(data.to_coo().sum(axis)).flatten(), index=index
np.array(sum_fn(data.to_coo(), axis)).flatten(), index=index
)
elif is_sparse_dataframe(data):
if axis is None:
sums = data.sparse.to_coo().sum()
sums = sum_fn(data.sparse.to_coo())
else:
index = data.index if axis == 1 else data.columns
sums = pd.Series(
np.array(data.sparse.to_coo().sum(axis)).flatten(), index=index
np.array(sum_fn(data.sparse.to_coo(), axis)).flatten(), index=index
)
elif axis is None:
sums = data.to_numpy().sum()
sums = sum_fn(data.to_numpy())
else:
sums = data.sum(axis)
sums = sum_fn(data, axis)
else:
sums = np.sum(data, axis=axis)
sums = sum_fn(data, axis=axis)
if isinstance(sums, np.matrix):
sums = np.array(sums).flatten()
return sums
Expand Down
31 changes: 30 additions & 1 deletion test/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import numpy as np
import os
import scipy.sparse
import scprep
import warnings

Expand Down Expand Up @@ -93,7 +94,7 @@ def test_fun(X, *args, **kwargs):
)

D = data.generate_positive_sparse_matrix(shape=(500, 100), seed=42, poisson_mean=5)
Y = test_fun(D)
Y = np.corrcoef(D.T)[:, :10]
assert Y.shape == (D.shape[1], 10)
assert np.allclose(Y[(np.arange(10), np.arange(10))], 1, atol=0)
matrix.test_all_matrix_types(
Expand Down Expand Up @@ -126,6 +127,34 @@ def test_fun(X, *args, **kwargs):
)


def test_pairwise_correlation_nan():
D = np.array([np.arange(10), np.arange(0, 20, 2), np.zeros(10)]).astype(float).T
D[3, :] = np.nan

def test_with_nan(D):
C = scprep.stats.pairwise_correlation(D, D)
assert np.all(np.isnan(C))

matrix.test_all_matrix_types(
D,
test_with_nan,
)

def test_with_ignore_nan(D):
C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True)
# should still be NaN on samples that have no variance
assert np.all(np.isnan(C[-1]))
assert np.all(np.isnan(C[:, -1]))
# but shouldn't be NaN on samples that have some NaNs
assert not np.any(np.isnan(C[:2][:, :2]))
np.testing.assert_equal(C[:2][:, :2], 1)

matrix.test_all_matrix_types(
D,
test_with_ignore_nan,
)


def shan_entropy(c):
c_normalized = c / float(np.sum(c))
return stats.entropy(c_normalized[c_normalized != 0])
Expand Down