Skip to content

Commit

Permalink
Added support for sparse data matrices. (#25)
Browse files Browse the repository at this point in the history
  • Loading branch information
ararslan authored Oct 3, 2017
1 parent c6c46f5 commit f3d7cf1
Show file tree
Hide file tree
Showing 2 changed files with 163 additions and 0 deletions.
115 changes: 115 additions & 0 deletions src/GLMNet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,37 @@ function glmnet!(X::Matrix{Float64}, y::Vector{Float64},

@check_and_return
end
# Fit a least-squares elastic net path on a sparse design matrix by calling
# glmnet's Fortran routine spelnet_ (sparse analogue of the dense
# Matrix{Float64}/Normal method above). The CSC internals (nzval/colptr/rowval)
# map directly onto the Fortran compressed-column storage format.
# NOTE(review): mutates its arguments in place (Fortran side may standardize);
# callers should pass copies they own.
function glmnet!(X::SparseMatrixCSC{Float64,Int32}, y::Vector{Float64},
                 family::Normal=Normal();
                 weights::Vector{Float64}=ones(length(y)),
                 naivealgorithm::Bool=(size(X, 2) >= 500), alpha::Real=1.0,
                 penalty_factor::Vector{Float64}=ones(size(X, 2)),
                 constraints::Array{Float64, 2}=[x for x in (-Inf, Inf), y in 1:size(X, 2)],
                 dfmax::Int=size(X, 2), pmax::Int=min(dfmax*2+20, size(X, 2)), nlambda::Int=100,
                 lambda_min_ratio::Real=(length(y) < size(X, 2) ? 1e-2 : 1e-4),
                 lambda::Vector{Float64}=Float64[], tol::Real=1e-7, standardize::Bool=true,
                 intercept::Bool=true, maxit::Int=1000000)
    @validate_and_init

    ccall((:spelnet_, libglmnet), Void,
          (Ref{Int32}, Ref{Float64}, Ref{Int32}, Ref{Int32}, Ptr{Float64}, Ptr{Int32},
           Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ref{Int32}, Ptr{Float64}, Ptr{Float64},
           Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ptr{Float64}, Ref{Float64},
           Ref{Int32}, Ref{Int32}, Ref{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64},
           Ptr{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ptr{Int32}, Ptr{Int32}),
          (naivealgorithm ? 2 : 1), alpha, size(X, 1), size(X, 2), X.nzval, X.colptr,
          X.rowval, y, weights, 0, penalty_factor, constraints, dfmax, pmax, nlambda,
          lambda_min_ratio, lambda, tol, standardize, intercept, maxit, lmu, a0, ca, ia,
          nin, fdev, alm, nlp, jerr)

    # Null deviance for the Gaussian family: total squared deviation of the
    # responses about their mean. (BUG FIX: the previous code accumulated
    # abs2(null_dev - mu), which never referenced y[i] at all.)
    null_dev = 0.0
    mu = mean(y)
    for i = 1:length(y)
        null_dev += abs2(y[i] - mu)
    end

    @check_and_return
end

function glmnet!(X::Matrix{Float64}, y::Matrix{Float64},
family::Binomial;
Expand Down Expand Up @@ -330,6 +361,54 @@ function glmnet!(X::Matrix{Float64}, y::Matrix{Float64},
null_dev = null_dev[1]
@check_and_return
end
# Fit a logistic (two-class) elastic net path on a sparse design matrix by
# calling glmnet's Fortran routine splognet_ (sparse analogue of the dense
# Matrix{Float64}/Binomial method above). NOTE: mutates y in place below
# (columns swapped and scaled by weights), so callers must pass a copy.
function glmnet!(X::SparseMatrixCSC{Float64,Int32}, y::Matrix{Float64},
                 family::Binomial;
                 offsets::Union{Vector{Float64},Void}=nothing,
                 weights::Vector{Float64}=ones(size(y, 1)),
                 alpha::Real=1.0,
                 penalty_factor::Vector{Float64}=ones(size(X, 2)),
                 constraints::Array{Float64, 2}=[x for x in (-Inf, Inf), y in 1:size(X, 2)],
                 dfmax::Int=size(X, 2), pmax::Int=min(dfmax*2+20, size(X, 2)), nlambda::Int=100,
                 lambda_min_ratio::Real=(length(y) < size(X, 2) ? 1e-2 : 1e-4),
                 lambda::Vector{Float64}=Float64[], tol::Real=1e-7, standardize::Bool=true,
                 intercept::Bool=true, maxit::Int=1000000, algorithm::Symbol=:newtonraphson)
    @validate_and_init
    # The logistic model takes an n-by-2 matrix of response counts,
    # not a vector of class labels.
    size(y, 2) == 2 || error("glmnet for logistic models requires a two-column matrix with "*
                             "counts of negative responses in the first column and positive "*
                             "responses in the second")
    # Translate the algorithm symbol into splognet_'s integer kopt flag.
    kopt = algorithm == :newtonraphson ? 0 :
           algorithm == :modifiednewtonraphson ? 1 :
           algorithm == :nzsame ? 2 : error("unknown algorithm ")
    # Default to zero offsets; copy a user-supplied vector so the caller's
    # array is not aliased by the ccall below.
    offsets::Vector{Float64} = isa(offsets, @compat Void) ? zeros(size(y, 1)) : copy(offsets)
    length(offsets) == size(y, 1) || error("length of offsets must match length of y")

    # One-element output buffer: splognet_ writes the null deviance here.
    null_dev = Vector{Float64}(1)

    # The Fortran code expects positive responses in first column, but
    # this convention is evidently unacceptable to the authors of the R
    # code, and, apparently, to us
    for i = 1:size(y, 1)
        a = y[i, 1]
        b = y[i, 2]
        y[i, 1] = b*weights[i]
        y[i, 2] = a*weights[i]
    end

    ccall((:splognet_, libglmnet), Void,
          (Ref{Float64}, Ref{Int32}, Ref{Int32}, Ref{Int32}, Ptr{Float64}, Ptr{Int32},
           Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ref{Int32}, Ptr{Float64}, Ptr{Float64},
           Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ptr{Float64}, Ref{Float64},
           Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Int32}, Ptr{Int32}, Ptr{Float64},
           Ptr{Float64}, Ptr{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64},
           Ptr{Int32}, Ptr{Int32}),
          alpha, size(X, 1), size(X, 2), 1, X.nzval, X.colptr, X.rowval, y, copy(offsets),
          0, penalty_factor, constraints, dfmax, pmax, nlambda, lambda_min_ratio, lambda,
          tol, standardize, intercept, maxit, kopt, lmu, a0, ca, ia, nin, null_dev, fdev,
          alm, nlp, jerr)

    # Unwrap the scalar null deviance for @check_and_return.
    null_dev = null_dev[1]
    @check_and_return
end

function glmnet!(X::Matrix{Float64}, y::Vector{Float64},
family::Poisson;
Expand Down Expand Up @@ -361,13 +440,49 @@ function glmnet!(X::Matrix{Float64}, y::Vector{Float64},
null_dev = null_dev[1]
@check_and_return
end
# Fit a Poisson-regression elastic net path on a sparse design matrix by
# calling glmnet's Fortran routine spfishnet_ (sparse analogue of the dense
# Matrix{Float64}/Poisson method above). The CSC internals (nzval/colptr/
# rowval) map directly onto the Fortran compressed-column storage format.
function glmnet!(X::SparseMatrixCSC{Float64,Int32}, y::Vector{Float64},
                 family::Poisson;
                 offsets::Union{Vector{Float64},Void}=nothing,
                 weights::Vector{Float64}=ones(length(y)),
                 alpha::Real=1.0,
                 penalty_factor::Vector{Float64}=ones(size(X, 2)),
                 constraints::Array{Float64, 2}=[x for x in (-Inf, Inf), y in 1:size(X, 2)],
                 dfmax::Int=size(X, 2), pmax::Int=min(dfmax*2+20, size(X, 2)), nlambda::Int=100,
                 lambda_min_ratio::Real=(length(y) < size(X, 2) ? 1e-2 : 1e-4),
                 lambda::Vector{Float64}=Float64[], tol::Real=1e-7, standardize::Bool=true,
                 intercept::Bool=true, maxit::Int=1000000)
    @validate_and_init
    # One-element output buffer: spfishnet_ writes the null deviance here.
    null_dev = Vector{Float64}(1)

    # Default to zero offsets; copy a user-supplied vector so the caller's
    # array is not aliased by the ccall below.
    offsets::Vector{Float64} = isa(offsets, Void) ? zeros(length(y)) : copy(offsets)
    length(offsets) == length(y) || error("length of offsets must match length of y")

    ccall((:spfishnet_, libglmnet), Void,
          (Ref{Float64}, Ref{Int32}, Ref{Int32}, Ptr{Float64}, Ptr{Int32}, Ptr{Int32},
           Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Int32}, Ptr{Float64}, Ptr{Float64},
           Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ptr{Float64}, Ref{Float64},
           Ref{Int32}, Ref{Int32}, Ref{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64},
           Ptr{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ptr{Int32},
           Ptr{Int32}),
          alpha, size(X, 1), size(X, 2), X.nzval, X.colptr, X.rowval, y, offsets, weights,
          0, penalty_factor, constraints, dfmax, pmax, nlambda, lambda_min_ratio, lambda,
          tol, standardize, intercept, maxit, lmu, a0, ca, ia, nin, null_dev, fdev, alm,
          nlp, jerr)

    # Unwrap the scalar null deviance for @check_and_return.
    null_dev = null_dev[1]
    @check_and_return
end

# Non-mutating user-facing wrappers: always hand glmnet! a private copy of the
# data, because glmnet! mutates its arguments (e.g. the Binomial method swaps
# and reweights the response columns in place).
glmnet(X::Matrix{Float64}, y::Vector{Float64}, family::Distribution=Normal(); kw...) =
    glmnet!(copy(X), copy(y), family; kw...)
glmnet(X::AbstractMatrix, y::AbstractVector, family::Distribution=Normal(); kw...) =
    glmnet(convert(Matrix{Float64}, X), convert(Vector{Float64}, y), family; kw...)
# BUG FIX: copy before convert on the sparse paths. convert is a no-op
# (returns the argument aliased) when the input already has the target type,
# which previously let glmnet! mutate the caller's X and y.
glmnet(X::SparseMatrixCSC, y::AbstractVector, family::Distribution=Normal(); kw...) =
    glmnet!(convert(SparseMatrixCSC{Float64,Int32}, copy(X)),
            convert(Vector{Float64}, copy(y)), family; kw...)
glmnet(X::Matrix{Float64}, y::Matrix{Float64}, family::Binomial; kw...) =
    glmnet!(copy(X), copy(y), family; kw...)
glmnet(X::SparseMatrixCSC, y::AbstractMatrix, family::Binomial; kw...) =
    glmnet!(convert(SparseMatrixCSC{Float64,Int32}, copy(X)),
            convert(Matrix{Float64}, copy(y)), family; kw...)
glmnet(X::Matrix, y::Matrix, family::Binomial; kw...) =
    glmnet(convert(Matrix{Float64}, X), convert(Matrix{Float64}, y), family; kw...)

Expand Down
48 changes: 48 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,22 @@ path = glmnet(X, y)
69.396354069847447,12.253877034216755,81.104769545494065,
17.808632244707766]

# sparse: the sparse path must reproduce the dense reference results exactly
# (extraction had dropped the Unicode ≈ operators; restored below)
path = glmnet(sparse(X), y)
@test nactive(path.betas) == df_true
@test path.dev_ratio ≈ dev_ratio
@test path.lambda ≈ lambda
@test path.a0 ≈ a0
@test path.betas[:, models] ≈ betas
@test predict(path, X, 16) ≈ [25.67451533276341,62.52776614976348,78.11952611080198,
                              69.61492976841734,20.00478443784032,58.98418434043655,
                              64.65391523535965,25.67451533276341,77.41080974893660,
                              14.33505354291723]
@test predict(path, X, 62) ≈ [9.688463955335449,65.513664866328625,89.537586892649699,
                              84.299096349896985,5.711287928399321,61.686267113123805,
                              69.396354069847447,12.253877034216755,81.104769545494065,
                              17.808632244707766]

# Cross-validation
cv = glmnetcv(X, y; folds=[1,1,1,1,2,2,2,3,3,3])
@test cv.meanloss [1196.1831818915,1054.30217435069,882.722957995572,741.473677317198,
Expand Down Expand Up @@ -211,6 +227,22 @@ path = glmnet(X, convert(Matrix{Float64}, yl), Binomial())
6.514278565882654,-5.352339338181535,10.448596815251006,
-8.571939893775767]

# sparse: logistic fit on a sparse design must match the dense reference
# (extraction had dropped the Unicode ≈ operators; restored below)
path = glmnet(sparse(X), yl, Binomial())
@test nactive(path.betas) == df_true
@test path.dev_ratio ≈ dev_ratio
@test path.lambda ≈ lambda
@test path.a0 ≈ a0
@test path.betas[:, models] ≈ betas
@test predict(path, X, 16) ≈ [-1.315519391606169,1.722425698139390,3.007710159185589,
                              2.306645907705844,-1.782895559259332,1.430315593356164,
                              1.897691761009327,-1.315519391606169,2.949288138228943,
                              -2.250271726912495]
@test predict(path, X, 62) ≈ [-5.328152634222764,5.936301834664929,10.640103977849353,
                              8.017225144937120,-6.891307125813680,5.068602350662909,
                              6.514278565882654,-5.352339338181535,10.448596815251006,
                              -8.571939893775767]

# Cross-validation
cv = glmnetcv(X, yl, Binomial(); folds=[1,1,1,1,2,2,2,3,3,3])
@test cv.meanloss [1.49234494227531,1.38615874813194,1.21206306885012,
Expand Down Expand Up @@ -356,6 +388,22 @@ path = glmnet(X, y, Poisson())
path = glmnet([1 1; 2 2; 3 4], [0, 0, 1], Poisson())
@test !any(isnan, GLMNet.loss(path, [1 1; 2 2; 4 4], [0, 0, 1]))

# sparse: Poisson fit on a sparse design must match the dense reference
# (extraction had dropped the Unicode ≈ operators; restored below)
path = glmnet(sparse(X), y, Poisson())
@test nactive(path.betas) == df_true
@test path.dev_ratio ≈ dev_ratio
@test path.lambda ≈ lambda
@test path.a0 ≈ a0
@test path.betas[:, models] ≈ betas
@test predict(path, X, 16) ≈ [3.195043339285695,4.063275663211828,4.430604723334422,
                              4.230243417813007,3.061469135604752,3.979791785911238,
                              4.113365989592181,3.195043339285695,4.413907947874304,
                              2.927894931923808]
@test predict(path, X, 62) ≈ [2.108607974403907,4.125962319203899,4.481867295351227,
                              4.492300995443095,2.410180465556811,4.082152789977005,
                              4.183424906852268,2.381251991983247,4.446875861428943,
                              2.829218161240957]

# Cross-validation
cv = glmnetcv(X, y, Poisson(); folds=[1,1,1,1,2,2,2,3,3,3])
@test cv.meanloss [29.7515432302351,26.9128224177608,23.2024175329887,
Expand Down

0 comments on commit f3d7cf1

Please sign in to comment.