support rank deficiency in prediction (#676)

* support rank deficiency in prediction * add NEWS * oops * Update NEWS.md
JuliaStats · Apr 12, 2023 · 251ef8a · 251ef8a · palday · Apr 12, 2023
1 parent e5d8a6a
commit 251ef8a
Show file tree

Hide file tree

Showing 3 changed files with 74 additions and 7 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,6 @@
 MixedModels v4.10.0 Release Notes
 ==============================
+* Rank deficiency in prediction is now supported, both when the original model was fit to rank-deficient data and when the new data are rank deficient. The behavior is consistent but may be surprising when both old and new data are rank deficient. See the `predict` docstring for an example. [#676]
 * Multithreading in `parametricbootstrap` with `use_threads` is now deprecated and a noop. With improvements in BLAS threading, multithreading at the Julia level did not help performance and sometimes hurt it. [#674]
 
 MixedModels v4.9.0 Release Notes
@@ -405,3 +406,4 @@ Package dependencies
 [#665]: https://github.com/JuliaStats/MixedModels.jl/issues/665
 [#667]: https://github.com/JuliaStats/MixedModels.jl/issues/667
 [#674]: https://github.com/JuliaStats/MixedModels.jl/issues/674
+[#676]: https://github.com/JuliaStats/MixedModels.jl/issues/676
diff --git a/src/predict.jl b/src/predict.jl
@@ -18,13 +18,20 @@ Predict response for new data.
     `missing` is not valid because `missing` data are dropped before
     constructing the model matrices.
 
-!!! warning
-    Models are assumed to be full rank.
-
 !!! warning
     These methods construct an entire MixedModel behind the scenes and
     as such may use a large amount of memory when `newdata` is large.
 
+!!! warning
+    Rank-deficiency can lead to surprising but consistent behavior.
+    For example, if there are two perfectly collinear predictors `A`
+    and `B` (e.g. constant multiples of each other), then it is possible
+    that `A` will be pivoted out in the fitted model and thus the
+    associated coefficient is set to zero. If predictions are then
+    generated on new data where `B` has been set to zero but `A` has
+    not, then there will no contribution from neither `A` nor `B`
+    in the resulting predictions.
+
 The keyword argument `new_re_levels` specifies how previously unobserved
 values of the grouping variable are handled. Possible values are:
 
@@ -63,7 +70,7 @@ Similarly, offsets are also not supported for `GeneralizedLinearMixedModel`.
 function StatsAPI.predict(
     m::LinearMixedModel, newdata::Tables.ColumnTable; new_re_levels=:missing
 )
-    return _predict(m, newdata, m.β; new_re_levels)
+    return _predict(m, newdata, coef(m)[m.feterm.piv]; new_re_levels)
 end
 
 function StatsAPI.predict(
@@ -73,13 +80,14 @@ function StatsAPI.predict(
     type=:response,
 )
     type in (:linpred, :response) || throw(ArgumentError("Invalid value for type: $(type)"))
-
-    y = _predict(m.LMM, newdata, m.β; new_re_levels)
+    # want pivoted but not truncated
+    y = _predict(m.LMM, newdata, coef(m)[m.feterm.piv]; new_re_levels)
 
     return type == :linpred ? y : broadcast!(Base.Fix1(linkinv, Link(m)), y, y)
 end
 
 # β is separated out here because m.β != m.LMM.β depending on how β is estimated for GLMM
+# also β should already be pivoted but NOT truncated in the rank deficient case
 function _predict(m::MixedModel{T}, newdata, β; new_re_levels) where {T}
     new_re_levels in (:population, :missing, :error) ||
         throw(ArgumentError("Invalid value for new_re_levels: $(new_re_levels)"))
@@ -118,8 +126,9 @@ function _predict(m::MixedModel{T}, newdata, β; new_re_levels) where {T}
         ytemp, lmm
     end
 
+    pivotmatch = mnew.feterm.piv[m.feterm.piv]
     grps = fnames(m)
-    mul!(y, mnew.X, β)
+    mul!(y, view(mnew.X, :, pivotmatch), β)
     # mnew.reterms for the correct Z matrices
     # ranef(m) for the BLUPs from the original fit
 

diff --git a/test/predict.jl b/test/predict.jl
@@ -81,6 +81,61 @@ end
         @test_throws ArgumentError predict(m, slpm)
         fill!(slpm.reaction, missing)
         @test_throws ArgumentError predict(m, slpm)
+
+    end
+
+    @testset "rank deficiency" begin
+        @testset "in original fit" begin
+            refvals = predict(first(models(:sleepstudy)), slp)
+
+            slprd = transform(slp, :days => ByRow(x -> 2x) => :days2)
+            m = fit(MixedModel, @formula(reaction ~ 1 + days + days2 + (1|subj)), slprd; progress=false)
+            # predict assumes that X is the correct length and stored pivoted
+            # so these first two tests will fail if we change that storage detail
+            @test size(m.X) == (180, 3)
+            @test all(2 .* view(m.X, :, m.feterm.piv[2]) .== view(m.X, :, m.feterm.piv[3]))
+            @test predict(m, slprd) == refvals
+
+            slprd0 = transform(slp, :days => zero => :days0)
+            m = fit(MixedModel, @formula(reaction ~ 1 + days0 + days + (1|subj)), slprd0; progress=false)
+            @test predict(m, slprd0) == refvals
+            # change the formula order slightly so that the original ordering and hence the
+            # permutation vector for pivoting isn't identical
+            m = fit(MixedModel, @formula(reaction ~ 1 + days + days0 + (1|subj)), slprd0; progress=false)
+            @test predict(m, slprd0) == refvals
+        end
+
+        @testset "in newdata" begin
+            mref = first(models(:sleepstudy))
+            # remove days
+            refvals = fitted(mref) .- view(mref.X, :, 2) * mref.β[2]
+            slp0 = transform(slp, :days => zero => :days)
+            vals = predict(mref, slp0)
+            @test all(refvals .≈ vals)
+        end
+
+        @testset "in both" begin
+            # now what happens when old and new are rank deficient
+            mref = first(models(:sleepstudy))
+            # remove days
+            refvals = fitted(mref) .- view(mref.X, :, 2) * mref.β[2]
+            # days gets pivoted out
+            slprd = transform(slp, :days => ByRow(x -> 2x) => :days2)
+            m = fit(MixedModel, @formula(reaction ~ 1 + days + days2 + (1|subj)), slprd; progress=false)
+            # days2 gets pivoted out
+            slp0 = transform(slp, :days => zero => :days2)
+            vals = predict(m, slp0)
+            # but in the original fit, days gets pivoted out, so its coef is zero
+            # and now we have a column of zeros for days2
+            # leaving us with only the intercept
+            # this is consistent behavior
+            @test all(refvals .≈ vals)
+
+            slp1 = transform(slp, :days => ByRow(one) => :days2)
+            vals = predict(m, slp1)
+            refvals = fitted(mref) .- view(mref.X, :, 2) * mref.β[2] .+ last(fixef(m))
+            @test all(refvals .≈ vals)
+        end
     end
 
     @testset "transformed response" begin
@@ -124,4 +179,5 @@ end
             @test predict(gm0, contra; type=:response) ≈ gm0.resp.mu rtol=0.01
         end
     end
+
 end