From f3d7cf19e7684986f64bead9940fb553d13a6b69 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Mon, 2 Oct 2017 21:38:21 -0700 Subject: [PATCH] Added support for sparse data matrices. (#25) --- src/GLMNet.jl | 115 +++++++++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 48 ++++++++++++++++++++ 2 files changed, 163 insertions(+) diff --git a/src/GLMNet.jl b/src/GLMNet.jl index 40fca84..8151345 100644 --- a/src/GLMNet.jl +++ b/src/GLMNet.jl @@ -283,6 +283,37 @@ function glmnet!(X::Matrix{Float64}, y::Vector{Float64}, @check_and_return end +function glmnet!(X::SparseMatrixCSC{Float64,Int32}, y::Vector{Float64}, + family::Normal=Normal(); + weights::Vector{Float64}=ones(length(y)), + naivealgorithm::Bool=(size(X, 2) >= 500), alpha::Real=1.0, + penalty_factor::Vector{Float64}=ones(size(X, 2)), + constraints::Array{Float64, 2}=[x for x in (-Inf, Inf), y in 1:size(X, 2)], + dfmax::Int=size(X, 2), pmax::Int=min(dfmax*2+20, size(X, 2)), nlambda::Int=100, + lambda_min_ratio::Real=(length(y) < size(X, 2) ? 1e-2 : 1e-4), + lambda::Vector{Float64}=Float64[], tol::Real=1e-7, standardize::Bool=true, + intercept::Bool=true, maxit::Int=1000000) + @validate_and_init + + ccall((:spelnet_, libglmnet), Void, + (Ref{Int32}, Ref{Float64}, Ref{Int32}, Ref{Int32}, Ptr{Float64}, Ptr{Int32}, + Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ref{Int32}, Ptr{Float64}, Ptr{Float64}, + Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ptr{Float64}, Ref{Float64}, + Ref{Int32}, Ref{Int32}, Ref{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, + Ptr{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ptr{Int32}, Ptr{Int32}), + (naivealgorithm ? 
2 : 1), alpha, size(X, 1), size(X, 2), X.nzval, X.colptr,
+          X.rowval, y, weights, 0, penalty_factor, constraints, dfmax, pmax, nlambda,
+          lambda_min_ratio, lambda, tol, standardize, intercept, maxit, lmu, a0, ca, ia,
+          nin, fdev, alm, nlp, jerr)
+
+    null_dev = 0.0
+    mu = mean(y)
+    for i = 1:length(y)
+        null_dev += abs2(y[i]-mu)
+    end
+
+    @check_and_return
+end
 
 function glmnet!(X::Matrix{Float64}, y::Matrix{Float64},
                  family::Binomial;
@@ -330,6 +361,54 @@ function glmnet!(X::Matrix{Float64}, y::Matrix{Float64},
     null_dev = null_dev[1]
     @check_and_return
 end
+function glmnet!(X::SparseMatrixCSC{Float64,Int32}, y::Matrix{Float64},
+                 family::Binomial;
+                 offsets::Union{Vector{Float64},Void}=nothing,
+                 weights::Vector{Float64}=ones(size(y, 1)),
+                 alpha::Real=1.0,
+                 penalty_factor::Vector{Float64}=ones(size(X, 2)),
+                 constraints::Array{Float64, 2}=[x for x in (-Inf, Inf), y in 1:size(X, 2)],
+                 dfmax::Int=size(X, 2), pmax::Int=min(dfmax*2+20, size(X, 2)), nlambda::Int=100,
+                 lambda_min_ratio::Real=(length(y) < size(X, 2) ? 1e-2 : 1e-4),
+                 lambda::Vector{Float64}=Float64[], tol::Real=1e-7, standardize::Bool=true,
+                 intercept::Bool=true, maxit::Int=1000000, algorithm::Symbol=:newtonraphson)
+    @validate_and_init
+    size(y, 2) == 2 || error("glmnet for logistic models requires a two-column matrix with "*
+                             "counts of negative responses in the first column and positive "*
+                             "responses in the second")
+    kopt = algorithm == :newtonraphson ? 0 :
+           algorithm == :modifiednewtonraphson ? 1 :
+           algorithm == :nzsame ? 2 : error("unknown algorithm ")
+    offsets::Vector{Float64} = isa(offsets, @compat Void) ? 
zeros(size(y, 1)) : copy(offsets) + length(offsets) == size(y, 1) || error("length of offsets must match length of y") + + null_dev = Vector{Float64}(1) + + # The Fortran code expects positive responses in first column, but + # this convention is evidently unacceptable to the authors of the R + # code, and, apparently, to us + for i = 1:size(y, 1) + a = y[i, 1] + b = y[i, 2] + y[i, 1] = b*weights[i] + y[i, 2] = a*weights[i] + end + + ccall((:splognet_, libglmnet), Void, + (Ref{Float64}, Ref{Int32}, Ref{Int32}, Ref{Int32}, Ptr{Float64}, Ptr{Int32}, + Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ref{Int32}, Ptr{Float64}, Ptr{Float64}, + Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ptr{Float64}, Ref{Float64}, + Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Int32}, Ptr{Int32}, Ptr{Float64}, + Ptr{Float64}, Ptr{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, + Ptr{Int32}, Ptr{Int32}), + alpha, size(X, 1), size(X, 2), 1, X.nzval, X.colptr, X.rowval, y, copy(offsets), + 0, penalty_factor, constraints, dfmax, pmax, nlambda, lambda_min_ratio, lambda, + tol, standardize, intercept, maxit, kopt, lmu, a0, ca, ia, nin, null_dev, fdev, + alm, nlp, jerr) + + null_dev = null_dev[1] + @check_and_return +end function glmnet!(X::Matrix{Float64}, y::Vector{Float64}, family::Poisson; @@ -361,13 +440,49 @@ function glmnet!(X::Matrix{Float64}, y::Vector{Float64}, null_dev = null_dev[1] @check_and_return end +function glmnet!(X::SparseMatrixCSC{Float64,Int32}, y::Vector{Float64}, + family::Poisson; + offsets::Union{Vector{Float64},Void}=nothing, + weights::Vector{Float64}=ones(length(y)), + alpha::Real=1.0, + penalty_factor::Vector{Float64}=ones(size(X, 2)), + constraints::Array{Float64, 2}=[x for x in (-Inf, Inf), y in 1:size(X, 2)], + dfmax::Int=size(X, 2), pmax::Int=min(dfmax*2+20, size(X, 2)), nlambda::Int=100, + lambda_min_ratio::Real=(length(y) < size(X, 2) ? 
1e-2 : 1e-4), + lambda::Vector{Float64}=Float64[], tol::Real=1e-7, standardize::Bool=true, + intercept::Bool=true, maxit::Int=1000000) + @validate_and_init + null_dev = Vector{Float64}(1) + + offsets::Vector{Float64} = isa(offsets, Void) ? zeros(length(y)) : copy(offsets) + length(offsets) == length(y) || error("length of offsets must match length of y") + + ccall((:spfishnet_, libglmnet), Void, + (Ref{Float64}, Ref{Int32}, Ref{Int32}, Ptr{Float64}, Ptr{Int32}, Ptr{Int32}, + Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Int32}, Ptr{Float64}, Ptr{Float64}, + Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ptr{Float64}, Ref{Float64}, + Ref{Int32}, Ref{Int32}, Ref{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, + Ptr{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ptr{Int32}, + Ptr{Int32}), + alpha, size(X, 1), size(X, 2), X.nzval, X.colptr, X.rowval, y, offsets, weights, + 0, penalty_factor, constraints, dfmax, pmax, nlambda, lambda_min_ratio, lambda, + tol, standardize, intercept, maxit, lmu, a0, ca, ia, nin, null_dev, fdev, alm, + nlp, jerr) + + null_dev = null_dev[1] + @check_and_return +end glmnet(X::Matrix{Float64}, y::Vector{Float64}, family::Distribution=Normal(); kw...) = glmnet!(copy(X), copy(y), family; kw...) glmnet(X::AbstractMatrix, y::AbstractVector, family::Distribution=Normal(); kw...) = glmnet(convert(Matrix{Float64}, X), convert(Vector{Float64}, y), family; kw...) +glmnet(X::SparseMatrixCSC, y::AbstractVector, family::Distribution=Normal(); kw...) = + glmnet!(convert(SparseMatrixCSC{Float64,Int32}, X), convert(Vector{Float64}, y), family; kw...) glmnet(X::Matrix{Float64}, y::Matrix{Float64}, family::Binomial; kw...) = glmnet!(copy(X), copy(y), family; kw...) +glmnet(X::SparseMatrixCSC, y::AbstractMatrix, family::Binomial; kw...) = + glmnet!(convert(SparseMatrixCSC{Float64,Int32}, X), convert(Matrix{Float64}, y), family; kw...) glmnet(X::Matrix, y::Matrix, family::Binomial; kw...) 
= glmnet(convert(Matrix{Float64}, X), convert(Matrix{Float64}, y), family; kw...) diff --git a/test/runtests.jl b/test/runtests.jl index 5c8d363..9d6579d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -85,6 +85,22 @@ path = glmnet(X, y) 69.396354069847447,12.253877034216755,81.104769545494065, 17.808632244707766] +# sparse +path = glmnet(sparse(X), y) +@test nactive(path.betas) == df_true +@test path.dev_ratio ≈ dev_ratio +@test path.lambda ≈ lambda +@test path.a0 ≈ a0 +@test path.betas[:, models] ≈ betas +@test predict(path, X, 16) ≈ [25.67451533276341,62.52776614976348,78.11952611080198, + 69.61492976841734,20.00478443784032,58.98418434043655, + 64.65391523535965,25.67451533276341,77.41080974893660, + 14.33505354291723] +@test predict(path, X, 62) ≈ [9.688463955335449,65.513664866328625,89.537586892649699, + 84.299096349896985,5.711287928399321,61.686267113123805, + 69.396354069847447,12.253877034216755,81.104769545494065, + 17.808632244707766] + # Cross-validation cv = glmnetcv(X, y; folds=[1,1,1,1,2,2,2,3,3,3]) @test cv.meanloss ≈ [1196.1831818915,1054.30217435069,882.722957995572,741.473677317198, @@ -211,6 +227,22 @@ path = glmnet(X, convert(Matrix{Float64}, yl), Binomial()) 6.514278565882654,-5.352339338181535,10.448596815251006, -8.571939893775767] +# sparse +path = glmnet(sparse(X), yl, Binomial()) +@test nactive(path.betas) == df_true +@test path.dev_ratio ≈ dev_ratio +@test path.lambda ≈ lambda +@test path.a0 ≈ a0 +@test path.betas[:, models] ≈ betas +@test predict(path, X, 16) ≈ [-1.315519391606169,1.722425698139390,3.007710159185589, + 2.306645907705844,-1.782895559259332,1.430315593356164, + 1.897691761009327,-1.315519391606169,2.949288138228943, + -2.250271726912495] +@test predict(path, X, 62) ≈ [-5.328152634222764,5.936301834664929,10.640103977849353, + 8.017225144937120,-6.891307125813680,5.068602350662909, + 6.514278565882654,-5.352339338181535,10.448596815251006, + -8.571939893775767] + # Cross-validation cv = glmnetcv(X, yl, 
Binomial(); folds=[1,1,1,1,2,2,2,3,3,3]) @test cv.meanloss ≈ [1.49234494227531,1.38615874813194,1.21206306885012, @@ -356,6 +388,22 @@ path = glmnet(X, y, Poisson()) path = glmnet([1 1; 2 2; 3 4], [0, 0, 1], Poisson()) @test !any(isnan, GLMNet.loss(path, [1 1; 2 2; 4 4], [0, 0, 1])) +# sparse +path = glmnet(sparse(X), y, Poisson()) +@test nactive(path.betas) == df_true +@test path.dev_ratio ≈ dev_ratio +@test path.lambda ≈ lambda +@test path.a0 ≈ a0 +@test path.betas[:, models] ≈ betas +@test predict(path, X, 16) ≈ [3.195043339285695,4.063275663211828,4.430604723334422, + 4.230243417813007,3.061469135604752,3.979791785911238, + 4.113365989592181,3.195043339285695,4.413907947874304, + 2.927894931923808] +@test predict(path, X, 62) ≈ [2.108607974403907,4.125962319203899,4.481867295351227, + 4.492300995443095,2.410180465556811,4.082152789977005, + 4.183424906852268,2.381251991983247,4.446875861428943, + 2.829218161240957] + # Cross-validation cv = glmnetcv(X, y, Poisson(); folds=[1,1,1,1,2,2,2,3,3,3]) @test cv.meanloss ≈ [29.7515432302351,26.9128224177608,23.2024175329887,