-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathSparCC.R
143 lines (143 loc) · 5.12 KB
/
SparCC.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
################################################################################
# File: SparCC.R
# Aim : SparCC
#-------------------------------------------------------------------------------
# Author: Fang Huaying (Peking University)
# Email : hyfang@pku.edu.cn
# Date : 11/12/2014
#-------------------------------------------------------------------------------
# SparCC for counts known
# function: SparCC.count
# input:
# x ------ nxp count data matrix, row is sample, col is variable
# imax ------ resampling times from posterior distribution. default 20
# kmax ------ max iteration steps for SparCC. default is 10
# alpha ------ the threshold for strong correlation. default is 0.1
# Vmin ------ minimal variance if negative variance appears. default is 1e-4
# output: a list structure
# cov.w ------ covariance estimation
# cor.w ------ correlation estimation
#
require(gtools);
SparCC.count <- function(x, imax = 20, kmax = 10, alpha = 0.1, Vmin = 1e-4) {
# dimension for w (latent variables)
p <- ncol(x);
n <- nrow(x);
# posterior distribution (alpha)
x <- x + 1;
# store generate data
y <- matrix(0, n, p);
# store covariance/correlation matrix
cov.w <- cor.w <- matrix(0, p, p);
indLow <- lower.tri(cov.w, diag = T);
# store covariance/correlation for several posterior samples
covs <- cors <- matrix(0, p * (p + 1) / 2, imax);
for(i in 1:imax) {
# generate fractions from posterior distribution
y <- t(apply(x, 1, function(x)
gtools::rdirichlet(n = 1, alpha = x)));
# estimate covariance/correlation
cov_cor <- SparCC.frac(x = y, kmax = kmax, alpha = alpha, Vmin = Vmin);
# store variance/correlation only low triangle
covs[, i] <- cov_cor$cov.w[indLow];
cors[, i] <- cov_cor$cor.w[indLow];
}
# calculate median for several posterior samples
cov.w[indLow] <- apply(covs, 1, median);
cor.w[indLow] <- apply(cors, 1, median);
#
cov.w <- cov.w + t(cov.w);
diag(cov.w) <- diag(cov.w) / 2;
cor.w <- cor.w + t(cor.w);
diag(cor.w) <- 1;
#
return(list(cov.w = cov.w, cor.w = cor.w));
}
#-------------------------------------------------------------------------------
# SparCC for fractions known
# function: SparCC.frac
# input:
# x ------ nxp fraction data matrix, row is sample, col is variable
# kmax ------ max iteration steps for SparCC. default is 10
# alpha ------ the threshold for strong correlation. default is 0.1
# Vmin ------ minimal variance if negative variance appears. default is 1e-4
# output: a list structure
# cov.w ------ covariance estimation
# cor.w ------ correlation estimation
SparCC.frac <- function(x, kmax = 10, alpha = 0.1, Vmin = 1e-4) {
# Log transformation
x <- log(x);
p <- ncol(x);
# T0 = var(log(xi/xj)) variation matrix
TT <- stats::var(x);
T0 <- diag(TT) + rep(diag(TT), each = p) - 2 * TT;
# Variance and correlation coefficients for Basic SparCC
rowT0 <- rowSums(T0);
var.w <- (rowT0 - sum(rowT0) / (2 * p - 2))/(p - 2);
var.w[var.w < Vmin] <- Vmin;
#cor.w <- (outer(var.w, var.w, "+") - T0 ) /
# sqrt(outer(var.w, var.w, "*")) / 2;
Is <- sqrt(1/var.w);
cor.w <- (var.w + rep(var.w, each = p) - T0) * Is * rep(Is, each = p) * 0.5;
# Truncated correlation in [-1, 1]
cor.w[cor.w <= - 1] <- - 1;
cor.w[cor.w >= 1] <- 1;
# Left matrix of estimation equation
Lmat <- diag(rep(p - 2, p)) + 1;
# Remove pairs
rp <- NULL;
# Left components
cp <- rep(TRUE, p);
# Do loops until max iteration or only 3 components left
k <- 0;
while(k < kmax && sum(cp) > 3) {
# Left T0 = var(log(xi/xj)) after removing pairs
T02 <- T0;
# Store current correlation to find the strongest pair
curr_cor.w <- cor.w;
# Remove diagonal
diag(curr_cor.w) <- 0;
# Remove removed pairs
if(!is.null(rp)) {
curr_cor.w[rp] <- 0;
}
# Find the strongest pair in vector form
n_rp <- which.max(abs(curr_cor.w));
# Remove the pair if geater than alpha
if(abs(curr_cor.w[n_rp]) >= alpha) {
# Which pair in matrix form
t_id <- c(arrayInd(n_rp, .dim = c(p, p)));
Lmat[t_id, t_id] <- Lmat[t_id, t_id] - 1;
# Update remove pairs
n_rp <- c(n_rp, (p + 1) * sum(t_id) - 2 * p - n_rp);
rp <- c(rp, n_rp);
# Update T02
T02[rp] <- 0;
# Which component left
cp <- (diag(Lmat) > 0);
# Update variance and truncated lower by Vmin
var.w[cp] <- solve(Lmat[cp, cp], rowSums(T02[cp, cp]));
var.w[var.w <= Vmin] <- Vmin;
# Update correlation matrix and truncated by [-1, 1]
#cor.w <- (outer(var.w, var.w, "+") - T0 ) /
# sqrt(outer(var.w, var.w, "*")) / 2;
Is <- sqrt(1/var.w);
cor.w <- (var.w + rep(var.w, each = p) - T0) *
Is * rep(Is, each = p) * 0.5;
# Truncated correlation in [-1, 1]
cor.w[cor.w <= - 1] <- - 1;
cor.w[cor.w >= 1] <- 1;
}
else {
break;
}
#
k <- k + 1;
}
# Covariance
Is <- sqrt(var.w);
cov.w <- cor.w * Is * rep(Is, each = p);
#
return(list(cov.w = cov.w, cor.w = cor.w));
}
#-------------------------------------------------------------------------------