Skip to content

Commit 9516c0f

Browse files
authored
⚠ Recover panics per default (#2905)
* Recover panics per default Signed-off-by: Stefan Büringer buringerst@vmware.com * fix review findings --------- Signed-off-by: Stefan Büringer buringerst@vmware.com
1 parent 250454b commit 9516c0f

File tree

10 files changed

+158
-24
lines changed

10 files changed

+158
-24
lines changed

pkg/builder/webhook.go

+24-7
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ type WebhookBuilder struct {
4242
gvk schema.GroupVersionKind
4343
mgr manager.Manager
4444
config *rest.Config
45-
recoverPanic bool
45+
recoverPanic *bool
4646
logConstructor func(base logr.Logger, req *admission.Request) logr.Logger
4747
err error
4848
}
@@ -84,8 +84,9 @@ func (blder *WebhookBuilder) WithLogConstructor(logConstructor func(base logr.Lo
8484
}
8585

8686
// RecoverPanic indicates whether panics caused by the webhook should be recovered.
87-
func (blder *WebhookBuilder) RecoverPanic() *WebhookBuilder {
88-
blder.recoverPanic = true
87+
// Defaults to true.
88+
func (blder *WebhookBuilder) RecoverPanic(recoverPanic bool) *WebhookBuilder {
89+
blder.recoverPanic = &recoverPanic
8990
return blder
9091
}
9192

@@ -169,10 +170,18 @@ func (blder *WebhookBuilder) registerDefaultingWebhook() {
169170

170171
func (blder *WebhookBuilder) getDefaultingWebhook() *admission.Webhook {
171172
if defaulter := blder.customDefaulter; defaulter != nil {
172-
return admission.WithCustomDefaulter(blder.mgr.GetScheme(), blder.apiType, defaulter).WithRecoverPanic(blder.recoverPanic)
173+
w := admission.WithCustomDefaulter(blder.mgr.GetScheme(), blder.apiType, defaulter)
174+
if blder.recoverPanic != nil {
175+
w = w.WithRecoverPanic(*blder.recoverPanic)
176+
}
177+
return w
173178
}
174179
if defaulter, ok := blder.apiType.(admission.Defaulter); ok {
175-
return admission.DefaultingWebhookFor(blder.mgr.GetScheme(), defaulter).WithRecoverPanic(blder.recoverPanic)
180+
w := admission.DefaultingWebhookFor(blder.mgr.GetScheme(), defaulter)
181+
if blder.recoverPanic != nil {
182+
w = w.WithRecoverPanic(*blder.recoverPanic)
183+
}
184+
return w
176185
}
177186
log.Info(
178187
"skip registering a mutating webhook, object does not implement admission.Defaulter or WithDefaulter wasn't called",
@@ -200,10 +209,18 @@ func (blder *WebhookBuilder) registerValidatingWebhook() {
200209

201210
func (blder *WebhookBuilder) getValidatingWebhook() *admission.Webhook {
202211
if validator := blder.customValidator; validator != nil {
203-
return admission.WithCustomValidator(blder.mgr.GetScheme(), blder.apiType, validator).WithRecoverPanic(blder.recoverPanic)
212+
w := admission.WithCustomValidator(blder.mgr.GetScheme(), blder.apiType, validator)
213+
if blder.recoverPanic != nil {
214+
w = w.WithRecoverPanic(*blder.recoverPanic)
215+
}
216+
return w
204217
}
205218
if validator, ok := blder.apiType.(admission.Validator); ok {
206-
return admission.ValidatingWebhookFor(blder.mgr.GetScheme(), validator).WithRecoverPanic(blder.recoverPanic)
219+
w := admission.ValidatingWebhookFor(blder.mgr.GetScheme(), validator)
220+
if blder.recoverPanic != nil {
221+
w = w.WithRecoverPanic(*blder.recoverPanic)
222+
}
223+
return w
207224
}
208225
log.Info(
209226
"skip registering a validating webhook, object does not implement admission.Validator or WithValidator wasn't called",

pkg/builder/webhook_test.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ func runTests(admissionReviewVersion string) {
160160

161161
err = WebhookManagedBy(m).
162162
For(&TestDefaulter{Panic: true}).
163-
RecoverPanic().
163+
// RecoverPanic defaults to true.
164164
Complete()
165165
ExpectWithOffset(1, err).NotTo(HaveOccurred())
166166
svr := m.GetWebhookServer()
@@ -369,7 +369,7 @@ func runTests(admissionReviewVersion string) {
369369

370370
err = WebhookManagedBy(m).
371371
For(&TestValidator{Panic: true}).
372-
RecoverPanic().
372+
RecoverPanic(true).
373373
Complete()
374374
ExpectWithOffset(1, err).NotTo(HaveOccurred())
375375
svr := m.GetWebhookServer()

pkg/config/controller.go

+1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ type Controller struct {
4141

4242
// RecoverPanic indicates whether the panic caused by reconcile should be recovered.
4343
// Defaults to the Controller.RecoverPanic setting from the Manager if unset.
44+
// Defaults to true if Controller.RecoverPanic setting from the Manager is also unset.
4445
RecoverPanic *bool
4546

4647
// NeedLeaderElection indicates whether the controller needs to use leader election.

pkg/controller/controller.go

+1
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ type TypedOptions[request comparable] struct {
4545

4646
// RecoverPanic indicates whether the panic caused by reconcile should be recovered.
4747
// Defaults to the Controller.RecoverPanic setting from the Manager if unset.
48+
// Defaults to true if Controller.RecoverPanic setting from the Manager is also unset.
4849
RecoverPanic *bool
4950

5051
// NeedLeaderElection indicates whether the controller needs to use leader election.

pkg/internal/controller/controller.go

+8-3
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ type Controller[request comparable] struct {
8787
LogConstructor func(request *request) logr.Logger
8888

8989
// RecoverPanic indicates whether the panic caused by reconcile should be recovered.
90+
// Defaults to true.
9091
RecoverPanic *bool
9192

9293
// LeaderElected indicates whether the controller is leader elected or always running.
@@ -97,7 +98,9 @@ type Controller[request comparable] struct {
9798
func (c *Controller[request]) Reconcile(ctx context.Context, req request) (_ reconcile.Result, err error) {
9899
defer func() {
99100
if r := recover(); r != nil {
100-
if c.RecoverPanic != nil && *c.RecoverPanic {
101+
ctrlmetrics.ReconcilePanics.WithLabelValues(c.Name).Inc()
102+
103+
if c.RecoverPanic == nil || *c.RecoverPanic {
101104
for _, fn := range utilruntime.PanicHandlers {
102105
fn(ctx, r)
103106
}
@@ -269,13 +272,15 @@ const (
269272
)
270273

271274
func (c *Controller[request]) initMetrics() {
272-
ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Set(0)
273-
ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Add(0)
274275
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelError).Add(0)
275276
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeueAfter).Add(0)
276277
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeue).Add(0)
277278
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelSuccess).Add(0)
279+
ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Add(0)
280+
ctrlmetrics.TerminalReconcileErrors.WithLabelValues(c.Name).Add(0)
281+
ctrlmetrics.ReconcilePanics.WithLabelValues(c.Name).Add(0)
278282
ctrlmetrics.WorkerCount.WithLabelValues(c.Name).Set(float64(c.MaxConcurrentReconciles))
283+
ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Set(0)
279284
}
280285

281286
func (c *Controller[request]) reconcileHandler(ctx context.Context, req request) {

pkg/internal/controller/controller_test.go

+20-1
Original file line numberDiff line numberDiff line change
@@ -90,13 +90,14 @@ var _ = Describe("controller", func() {
9090
Expect(result).To(Equal(reconcile.Result{Requeue: true}))
9191
})
9292

93-
It("should not recover panic if RecoverPanic is false by default", func() {
93+
It("should not recover panic if RecoverPanic is false", func() {
9494
ctx, cancel := context.WithCancel(context.Background())
9595
defer cancel()
9696

9797
defer func() {
9898
Expect(recover()).ShouldNot(BeNil())
9999
}()
100+
ctrl.RecoverPanic = ptr.To(false)
100101
ctrl.Do = reconcile.Func(func(context.Context, reconcile.Request) (reconcile.Result, error) {
101102
var res *reconcile.Result
102103
return *res, nil
@@ -105,6 +106,24 @@ var _ = Describe("controller", func() {
105106
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
106107
})
107108

109+
It("should recover panic if RecoverPanic is true by default", func() {
110+
ctx, cancel := context.WithCancel(context.Background())
111+
defer cancel()
112+
113+
defer func() {
114+
Expect(recover()).To(BeNil())
115+
}()
116+
// RecoverPanic defaults to true.
117+
ctrl.Do = reconcile.Func(func(context.Context, reconcile.Request) (reconcile.Result, error) {
118+
var res *reconcile.Result
119+
return *res, nil
120+
})
121+
_, err := ctrl.Reconcile(ctx,
122+
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
123+
Expect(err).To(HaveOccurred())
124+
Expect(err.Error()).To(ContainSubstring("[recovered]"))
125+
})
126+
108127
It("should recover panic if RecoverPanic is true", func() {
109128
ctx, cancel := context.WithCancel(context.Background())
110129
defer cancel()

pkg/internal/controller/metrics/metrics.go

+8
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,13 @@ var (
4646
Help: "Total number of terminal reconciliation errors per controller",
4747
}, []string{"controller"})
4848

49+
// ReconcilePanics is a prometheus counter metrics which holds the total
50+
// number of panics from the Reconciler.
51+
ReconcilePanics = prometheus.NewCounterVec(prometheus.CounterOpts{
52+
Name: "controller_runtime_reconcile_panics_total",
53+
Help: "Total number of reconciliation panics per controller",
54+
}, []string{"controller"})
55+
4956
// ReconcileTime is a prometheus metric which keeps track of the duration
5057
// of reconciliations.
5158
ReconcileTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{
@@ -75,6 +82,7 @@ func init() {
7582
ReconcileTotal,
7683
ReconcileErrors,
7784
TerminalReconcileErrors,
85+
ReconcilePanics,
7886
ReconcileTime,
7987
WorkerCount,
8088
ActiveWorkers,
+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package metrics
18+
19+
import (
20+
"github.com/prometheus/client_golang/prometheus"
21+
"sigs.k8s.io/controller-runtime/pkg/metrics"
22+
)
23+
24+
var (
25+
// WebhookPanics is a prometheus counter metrics which holds the total
26+
// number of panics from webhooks.
27+
WebhookPanics = prometheus.NewCounterVec(prometheus.CounterOpts{
28+
Name: "controller_runtime_webhook_panics_total",
29+
Help: "Total number of webhook panics",
30+
}, []string{})
31+
)
32+
33+
func init() {
34+
metrics.Registry.MustRegister(
35+
WebhookPanics,
36+
)
37+
// Init metric.
38+
WebhookPanics.WithLabelValues().Add(0)
39+
}

pkg/webhook/admission/webhook.go

+23-8
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
"k8s.io/apimachinery/pkg/util/json"
3131
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
3232
"k8s.io/klog/v2"
33+
admissionmetrics "sigs.k8s.io/controller-runtime/pkg/webhook/admission/metrics"
3334

3435
logf "sigs.k8s.io/controller-runtime/pkg/log"
3536
"sigs.k8s.io/controller-runtime/pkg/webhook/internal/metrics"
@@ -123,7 +124,8 @@ type Webhook struct {
123124
Handler Handler
124125

125126
// RecoverPanic indicates whether the panic caused by webhook should be recovered.
126-
RecoverPanic bool
127+
// Defaults to true.
128+
RecoverPanic *bool
127129

128130
// WithContextFunc will allow you to take the http.Request.Context() and
129131
// add any additional information such as passing the request path or
@@ -141,8 +143,9 @@ type Webhook struct {
141143
}
142144

143145
// WithRecoverPanic takes a bool flag which indicates whether the panic caused by webhook should be recovered.
146+
// Defaults to true.
144147
func (wh *Webhook) WithRecoverPanic(recoverPanic bool) *Webhook {
145-
wh.RecoverPanic = recoverPanic
148+
wh.RecoverPanic = &recoverPanic
146149
return wh
147150
}
148151

@@ -151,25 +154,37 @@ func (wh *Webhook) WithRecoverPanic(recoverPanic bool) *Webhook {
151154
// If the webhook is validating type, it delegates the AdmissionRequest to each handler and
152155
// deny the request if anyone denies.
153156
func (wh *Webhook) Handle(ctx context.Context, req Request) (response Response) {
154-
if wh.RecoverPanic {
155-
defer func() {
156-
if r := recover(); r != nil {
157+
defer func() {
158+
if r := recover(); r != nil {
159+
admissionmetrics.WebhookPanics.WithLabelValues().Inc()
160+
161+
if wh.RecoverPanic == nil || *wh.RecoverPanic {
157162
for _, fn := range utilruntime.PanicHandlers {
158163
fn(ctx, r)
159164
}
160165
response = Errored(http.StatusInternalServerError, fmt.Errorf("panic: %v [recovered]", r))
166+
// Note: We explicitly have to set the response UID. Usually that is done via resp.Complete below,
167+
// but if we encounter a panic in wh.Handler.Handle we are never going to reach resp.Complete.
168+
response.UID = req.UID
161169
return
162170
}
163-
}()
164-
}
171+
172+
log := logf.FromContext(ctx)
173+
log.Info(fmt.Sprintf("Observed a panic in webhook: %v", r))
174+
panic(r)
175+
}
176+
}()
165177

166178
reqLog := wh.getLogger(&req)
167179
ctx = logf.IntoContext(ctx, reqLog)
168180

169181
resp := wh.Handler.Handle(ctx, req)
170182
if err := resp.Complete(req); err != nil {
171183
reqLog.Error(err, "unable to encode response")
172-
return Errored(http.StatusInternalServerError, errUnableToEncodeResponse)
184+
resp := Errored(http.StatusInternalServerError, errUnableToEncodeResponse)
185+
// Note: We explicitly have to set the response UID. Usually that is done via resp.Complete.
186+
resp.UID = req.UID
187+
return resp
173188
}
174189

175190
return resp

pkg/webhook/admission/webhook_test.go

+32-3
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
authenticationv1 "k8s.io/api/authentication/v1"
3131
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3232
machinerytypes "k8s.io/apimachinery/pkg/types"
33+
"k8s.io/utils/ptr"
3334

3435
logf "sigs.k8s.io/controller-runtime/pkg/log"
3536
"sigs.k8s.io/controller-runtime/pkg/log/zap"
@@ -199,6 +200,33 @@ var _ = Describe("Admission Webhooks", func() {
199200
})
200201

201202
Describe("panic recovery", func() {
203+
It("should recover panic if RecoverPanic is true by default", func() {
204+
panicHandler := func() *Webhook {
205+
handler := &fakeHandler{
206+
fn: func(ctx context.Context, req Request) Response {
207+
panic("fake panic test")
208+
},
209+
}
210+
webhook := &Webhook{
211+
Handler: handler,
212+
// RecoverPanic defaults to true.
213+
}
214+
215+
return webhook
216+
}
217+
218+
By("setting up a webhook with a panicking handler")
219+
webhook := panicHandler()
220+
221+
By("invoking the webhook")
222+
resp := webhook.Handle(context.Background(), Request{})
223+
224+
By("checking that it errored the request")
225+
Expect(resp.Allowed).To(BeFalse())
226+
Expect(resp.Result.Code).To(Equal(int32(http.StatusInternalServerError)))
227+
Expect(resp.Result.Message).To(Equal("panic: fake panic test [recovered]"))
228+
})
229+
202230
It("should recover panic if RecoverPanic is true", func() {
203231
panicHandler := func() *Webhook {
204232
handler := &fakeHandler{
@@ -208,7 +236,7 @@ var _ = Describe("Admission Webhooks", func() {
208236
}
209237
webhook := &Webhook{
210238
Handler: handler,
211-
RecoverPanic: true,
239+
RecoverPanic: ptr.To[bool](true),
212240
}
213241

214242
return webhook
@@ -226,15 +254,16 @@ var _ = Describe("Admission Webhooks", func() {
226254
Expect(resp.Result.Message).To(Equal("panic: fake panic test [recovered]"))
227255
})
228256

229-
It("should not recover panic if RecoverPanic is false by default", func() {
257+
It("should not recover panic if RecoverPanic is false", func() {
230258
panicHandler := func() *Webhook {
231259
handler := &fakeHandler{
232260
fn: func(ctx context.Context, req Request) Response {
233261
panic("fake panic test")
234262
},
235263
}
236264
webhook := &Webhook{
237-
Handler: handler,
265+
Handler: handler,
266+
RecoverPanic: ptr.To[bool](false),
238267
}
239268

240269
return webhook

0 commit comments

Comments
 (0)