Skip to content

Commit 304cbed

Browse files
committed
Add OOM watcher to allow graceful shutdown
Signed-off-by: Hidde Beydals <hidde@hhh.computer>
1 parent 90a03d0 commit 304cbed

File tree

3 files changed

+190
-10
lines changed

3 files changed

+190
-10
lines changed

internal/features/features.go

+7
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ const (
3838
//
3939
// Ref: https://github.com/helm/helm/security/advisories/GHSA-pwcw-6f5g-gxf8
4040
AllowDNSLookups = "AllowDNSLookups"
41+
42+
// OOMWatch enables the OOM watcher, which will gracefully shut down the controller
43+
// when the memory usage exceeds the configured limit. This is disabled by default.
44+
OOMWatch = "OOMWatch"
4145
)
4246

4347
var features = map[string]bool{
@@ -50,6 +54,9 @@ var features = map[string]bool{
5054
// AllowDNSLookups
5155
// opt-in from v0.31
5256
AllowDNSLookups: false,
57+
// OOMWatch
58+
// opt-in from v0.31
59+
OOMWatch: false,
5360
}
5461

5562
// FeatureGates contains a list of all supported feature gates and

internal/oomwatch/watch.go

+145
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
package oomwatch
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"os"
7+
"path/filepath"
8+
"strconv"
9+
"strings"
10+
"sync"
11+
"time"
12+
13+
"github.com/go-logr/logr"
14+
)
15+
16+
const (
17+
// DefaultCgroupPath is the default path to the cgroup directory. NewDefault
// joins it with MemoryMaxFile and MemoryCurrentFile to locate the files to
// read.
18+
DefaultCgroupPath = "/sys/fs/cgroup/"
19+
// MemoryMaxFile is the cgroup memory.max filename. Its value is read once
// by New and used as the memory limit.
20+
MemoryMaxFile = "memory.max"
21+
// MemoryCurrentFile is the cgroup memory.current filename. Its value is
// polled on every tick while watching for near OOM conditions.
22+
MemoryCurrentFile = "memory.current"
23+
)
24+
25+
// Watcher can be used to detect near OOM conditions. It compares the value
// polled from the memory.current file against a limit read once at
// construction time, and cancels a context when the configured percentage
// threshold is reached. Use New or NewDefault to construct one.
26+
type Watcher struct {
27+
// memoryMax is the maximum amount of memory that can be used by the system.
// It is read from the memory.max file when the Watcher is created and is
// not refreshed afterwards.
28+
memoryMax uint64
29+
// memoryCurrentPath is the cgroup memory.current filepath.
30+
memoryCurrentPath string
31+
// memoryUsagePercentThreshold is the threshold at which the system is
32+
// considered to be near OOM. New validates it against the range 1-100.
33+
memoryUsagePercentThreshold float64
34+
// interval is the interval at which to check for OOM.
35+
interval time.Duration
36+
// logger is the logger to use.
37+
logger logr.Logger
38+
39+
// ctx is the context that is canceled when OOM is detected. It is derived
// from the context passed to the first call to Watch.
40+
ctx context.Context
41+
// cancel is the function that cancels the context.
42+
cancel context.CancelFunc
43+
// once is used to ensure that Watch only sets up the context and starts
// the polling goroutine once.
44+
once sync.Once
45+
}
46+
47+
// New returns a new Watcher.
48+
func New(memoryMaxPath, memoryCurrentPath string, memoryUsagePercentThreshold float64, interval time.Duration, logger logr.Logger) (*Watcher, error) {
49+
if memoryUsagePercentThreshold < 1 || memoryUsagePercentThreshold > 100 {
50+
return nil, fmt.Errorf("memory usage percent threshold must be between 1 and 100, got %f", memoryUsagePercentThreshold)
51+
}
52+
memoryMax, err := readUintFromFile(memoryMaxPath)
53+
if err != nil {
54+
return nil, err
55+
}
56+
return &Watcher{
57+
memoryMax: memoryMax,
58+
memoryCurrentPath: memoryCurrentPath,
59+
memoryUsagePercentThreshold: memoryUsagePercentThreshold,
60+
interval: interval,
61+
logger: logger,
62+
}, nil
63+
}
64+
65+
// NewDefault returns a new Watcher with default path values.
66+
func NewDefault(memoryUsagePercentThreshold float64, interval time.Duration, logger logr.Logger) (*Watcher, error) {
67+
return New(
68+
filepath.Join(DefaultCgroupPath, MemoryMaxFile),
69+
filepath.Join(DefaultCgroupPath, MemoryCurrentFile),
70+
memoryUsagePercentThreshold,
71+
interval,
72+
logger,
73+
)
74+
}
75+
76+
// Watch returns a context that is canceled when the system reaches the
77+
// configured memory usage threshold. Calling Watch multiple times will return
78+
// the same context.
79+
func (w *Watcher) Watch(ctx context.Context) context.Context {
80+
w.once.Do(func() {
81+
w.ctx, w.cancel = context.WithCancel(ctx)
82+
go w.watchForNearOOM(ctx)
83+
})
84+
return w.ctx
85+
}
86+
87+
// watchForNearOOM polls the memory.current file on the configured interval
88+
// and cancels the context within Watcher when the system is near OOM.
89+
// It is expected that this function is called in a goroutine. Canceling
90+
// provided context will cause the goroutine to exit.
91+
func (w *Watcher) watchForNearOOM(ctx context.Context) {
92+
t := time.NewTicker(w.interval)
93+
defer t.Stop()
94+
95+
for {
96+
select {
97+
case <-ctx.Done():
98+
w.logger.Info("Shutdown signal received, stopping watch for near OOM")
99+
return
100+
case <-t.C:
101+
current, err := readUintFromFile(w.memoryCurrentPath)
102+
if err != nil {
103+
w.logger.Error(err, "Failed to read current memory usage from "+w.memoryCurrentPath)
104+
continue
105+
}
106+
107+
currentPercentage := float64(current) / float64(w.memoryMax) * 100
108+
if currentPercentage >= w.memoryUsagePercentThreshold {
109+
w.logger.Info(fmt.Sprintf("Memory usage is near OOM (%s/%s), shutting down",
110+
formatSize(current), formatSize(w.memoryMax)))
111+
w.cancel()
112+
return
113+
}
114+
w.logger.V(2).Info(fmt.Sprintf("Current memory usage %s/%s (%.2f%% out of %.2f%%)",
115+
formatSize(current), formatSize(w.memoryMax), currentPercentage, w.memoryUsagePercentThreshold))
116+
}
117+
}
118+
}
119+
120+
// readUintFromFile reads a base 10 uint64 from the file at the given path.
// Leading and trailing whitespace (such as a trailing newline) is ignored.
//
// It returns 0 and an error when the file can not be read, or when its
// contents are not a valid unsigned 64-bit integer. A parse error is wrapped
// with the path of the file, and can still be matched using errors.Is against
// strconv.ErrSyntax and strconv.ErrRange.
func readUintFromFile(path string) (uint64, error) {
	b, err := os.ReadFile(path)
	if err != nil {
		// The error from os.ReadFile already includes the path.
		return 0, err
	}
	v, err := strconv.ParseUint(strings.TrimSpace(string(b)), 10, 64)
	if err != nil {
		// strconv only reports the offending text, which does not tell the
		// reader which file was malformed.
		return 0, fmt.Errorf("parsing unsigned integer from %q: %w", path, err)
	}
	return v, nil
}
128+
129+
// formatSize renders a size in bytes as a human-readable string using binary
// (1024 based) units with one decimal, e.g. "1.5 KiB". Sizes below 1024 are
// printed as whole bytes, and zero is rendered as "-".
func formatSize(b uint64) string {
	const (
		unit     = 1024
		prefixes = "KMGTPE"
	)

	switch {
	case b == 0:
		return "-"
	case b < unit:
		return fmt.Sprintf("%d B", b)
	}

	// Determine the largest unit that does not exceed b, keeping track of
	// its divisor and the index of the matching prefix.
	divisor := uint64(unit)
	idx := 0
	for rest := b / unit; rest >= unit; rest /= unit {
		divisor *= unit
		idx++
	}
	return fmt.Sprintf("%.1f %ciB", float64(b)/float64(divisor), prefixes[idx])
}

main.go

+38-10
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package main
1818

1919
import (
2020
"fmt"
21+
"github.com/fluxcd/helm-controller/internal/oomwatch"
2122
"os"
2223
"time"
2324

@@ -84,25 +85,41 @@ func main() {
8485
aclOptions acl.Options
8586
leaderElectionOptions leaderelection.Options
8687
rateLimiterOptions helper.RateLimiterOptions
88+
oomWatchInterval time.Duration
89+
oomWatchMemoryThreshold float64
8790
)
8891

89-
flag.StringVar(&metricsAddr, "metrics-addr", ":8080", "The address the metric endpoint binds to.")
90-
flag.StringVar(&eventsAddr, "events-addr", "", "The address of the events receiver.")
91-
flag.StringVar(&healthAddr, "health-addr", ":9440", "The address the health endpoint binds to.")
92-
flag.IntVar(&concurrent, "concurrent", 4, "The number of concurrent HelmRelease reconciles.")
93-
flag.DurationVar(&requeueDependency, "requeue-dependency", 30*time.Second, "The interval at which failing dependencies are reevaluated.")
94-
flag.DurationVar(&gracefulShutdownTimeout, "graceful-shutdown-timeout", 600*time.Second, "The duration given to the reconciler to finish before forcibly stopping.")
92+
flag.StringVar(&metricsAddr, "metrics-addr", ":8080",
93+
"The address the metric endpoint binds to.")
94+
flag.StringVar(&eventsAddr, "events-addr", "",
95+
"The address of the events receiver.")
96+
flag.StringVar(&healthAddr, "health-addr", ":9440",
97+
"The address the health endpoint binds to.")
98+
flag.IntVar(&concurrent, "concurrent", 4,
99+
"The number of concurrent HelmRelease reconciles.")
100+
flag.DurationVar(&requeueDependency, "requeue-dependency", 30*time.Second,
101+
"The interval at which failing dependencies are reevaluated.")
102+
flag.DurationVar(&gracefulShutdownTimeout, "graceful-shutdown-timeout", 600*time.Second,
103+
"The duration given to the reconciler to finish before forcibly stopping.")
95104
flag.BoolVar(&watchAllNamespaces, "watch-all-namespaces", true,
96105
"Watch for custom resources in all namespaces, if set to false it will only watch the runtime namespace.")
97-
flag.IntVar(&httpRetry, "http-retry", 9, "The maximum number of retries when failing to fetch artifacts over HTTP.")
98-
flag.StringVar(&intkube.DefaultServiceAccountName, "default-service-account", "", "Default service account used for impersonation.")
106+
flag.IntVar(&httpRetry, "http-retry", 9,
107+
"The maximum number of retries when failing to fetch artifacts over HTTP.")
108+
flag.StringVar(&intkube.DefaultServiceAccountName, "default-service-account", "",
109+
"Default service account used for impersonation.")
110+
flag.Float64Var(&oomWatchMemoryThreshold, "oom-watch-memory-threshold", 95,
111+
"The memory threshold in percentage at which the OOM watcher will trigger a graceful shutdown. Requires feature gate 'OOMWatch' to be enabled.")
112+
flag.DurationVar(&oomWatchInterval, "oom-watch-interval", 500*time.Millisecond,
113+
"The interval at which the OOM watcher will check for memory usage. Requires feature gate 'OOMWatch' to be enabled.")
114+
99115
clientOptions.BindFlags(flag.CommandLine)
100116
logOptions.BindFlags(flag.CommandLine)
101117
aclOptions.BindFlags(flag.CommandLine)
102118
leaderElectionOptions.BindFlags(flag.CommandLine)
103119
rateLimiterOptions.BindFlags(flag.CommandLine)
104120
kubeConfigOpts.BindFlags(flag.CommandLine)
105121
featureGates.BindFlags(flag.CommandLine)
122+
106123
flag.Parse()
107124

108125
ctrl.SetLogger(logger.NewLogger(logOptions))
@@ -122,7 +139,7 @@ func main() {
122139
watchNamespace = os.Getenv("RUNTIME_NAMESPACE")
123140
}
124141

125-
disableCacheFor := []ctrlclient.Object{}
142+
var disableCacheFor []ctrlclient.Object
126143
shouldCache, err := features.Enabled(features.CacheSecretsAndConfigMaps)
127144
if err != nil {
128145
setupLog.Error(err, "unable to check feature gate CacheSecretsAndConfigMaps")
@@ -190,8 +207,19 @@ func main() {
190207
}
191208
// +kubebuilder:scaffold:builder
192209

210+
ctx := ctrl.SetupSignalHandler()
211+
if ok, _ := features.Enabled(features.OOMWatch); ok {
212+
setupLog.Info("setting up OOM watcher")
213+
ow, err := oomwatch.NewDefault(oomWatchMemoryThreshold, oomWatchInterval, ctrl.Log.WithName("OOMwatch"))
214+
if err != nil {
215+
setupLog.Error(err, "unable to setup OOM watcher")
216+
os.Exit(1)
217+
}
218+
ctx = ow.Watch(ctx)
219+
}
220+
193221
setupLog.Info("starting manager")
194-
if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
222+
if err := mgr.Start(ctx); err != nil {
195223
setupLog.Error(err, "problem running manager")
196224
os.Exit(1)
197225
}

0 commit comments

Comments
 (0)