Skip to content

Commit 352b7f2

Browse files
authored
Merge pull request #628 from fluxcd/oom-watcher
Introduce OOM watcher to allow graceful shutdown
2 parents 154000e + c4566a5 commit 352b7f2

File tree

4 files changed

+467
-10
lines changed

4 files changed

+467
-10
lines changed

internal/features/features.go

+7
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ const (
3838
//
3939
// Ref: https://github.com/helm/helm/security/advisories/GHSA-pwcw-6f5g-gxf8
4040
AllowDNSLookups = "AllowDNSLookups"
41+
42+
// OOMWatch enables the OOM watcher, which will gracefully shut down the controller
43+
// when the memory usage exceeds the configured limit. This is disabled by default.
44+
OOMWatch = "OOMWatch"
4145
)
4246

4347
var features = map[string]bool{
@@ -50,6 +54,9 @@ var features = map[string]bool{
5054
// AllowDNSLookups
5155
// opt-in from v0.31
5256
AllowDNSLookups: false,
57+
// OOMWatch
58+
// opt-in from v0.31
59+
OOMWatch: false,
5360
}
5461

5562
// FeatureGates contains a list of all supported feature gates and

internal/oomwatch/watch.go

+172
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
/*
2+
Copyright 2023 The Flux authors
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
// Package oomwatch provides a way to detect near OOM conditions.
18+
package oomwatch
19+
20+
import (
21+
"context"
22+
"fmt"
23+
"os"
24+
"path/filepath"
25+
"strconv"
26+
"strings"
27+
"sync"
28+
"time"
29+
30+
"github.com/go-logr/logr"
31+
)
32+
33+
const (
34+
// DefaultCgroupPath is the default path to the cgroup directory.
35+
DefaultCgroupPath = "/sys/fs/cgroup/"
36+
// MemoryMaxFile is the cgroup memory.max filename.
37+
MemoryMaxFile = "memory.max"
38+
// MemoryCurrentFile is the cgroup memory.current filename.
39+
MemoryCurrentFile = "memory.current"
40+
)
41+
42+
// Watcher can be used to detect near OOM conditions.
43+
type Watcher struct {
44+
// memoryMax is the maximum amount of memory that can be used by the system.
45+
memoryMax uint64
46+
// memoryCurrentPath is the cgroup memory.current filepath.
47+
memoryCurrentPath string
48+
// memoryUsagePercentThreshold is the threshold at which the system is
49+
// considered to be near OOM.
50+
memoryUsagePercentThreshold uint8
51+
// interval is the interval at which to check for OOM.
52+
interval time.Duration
53+
// logger is the logger to use.
54+
logger logr.Logger
55+
56+
// ctx is the context that is canceled when OOM is detected.
57+
ctx context.Context
58+
// cancel is the function that cancels the context.
59+
cancel context.CancelFunc
60+
// once is used to ensure that Watch is only called once.
61+
once sync.Once
62+
}
63+
64+
// New returns a new Watcher.
65+
func New(memoryMaxPath, memoryCurrentPath string, memoryUsagePercentThreshold uint8, interval time.Duration, logger logr.Logger) (*Watcher, error) {
66+
if memoryUsagePercentThreshold < 1 || memoryUsagePercentThreshold > 100 {
67+
return nil, fmt.Errorf("memory usage percent threshold must be between 1 and 100, got %d", memoryUsagePercentThreshold)
68+
}
69+
70+
if minInterval := 50 * time.Millisecond; interval < minInterval {
71+
return nil, fmt.Errorf("interval must be at least %s, got %s", minInterval, interval)
72+
}
73+
74+
if _, err := os.Lstat(memoryCurrentPath); err != nil {
75+
return nil, fmt.Errorf("failed to stat memory.current %q: %w", memoryCurrentPath, err)
76+
}
77+
78+
memoryMax, err := readUintFromFile(memoryMaxPath)
79+
if err != nil {
80+
return nil, fmt.Errorf("failed to read memory.max %q: %w", memoryMaxPath, err)
81+
}
82+
83+
return &Watcher{
84+
memoryMax: memoryMax,
85+
memoryCurrentPath: memoryCurrentPath,
86+
memoryUsagePercentThreshold: memoryUsagePercentThreshold,
87+
interval: interval,
88+
logger: logger,
89+
}, nil
90+
}
91+
92+
// NewDefault returns a new Watcher with default path values.
93+
func NewDefault(memoryUsagePercentThreshold uint8, interval time.Duration, logger logr.Logger) (*Watcher, error) {
94+
return New(
95+
filepath.Join(DefaultCgroupPath, MemoryMaxFile),
96+
filepath.Join(DefaultCgroupPath, MemoryCurrentFile),
97+
memoryUsagePercentThreshold,
98+
interval,
99+
logger,
100+
)
101+
}
102+
103+
// Watch returns a context that is canceled when the system reaches the
104+
// configured memory usage threshold. Calling Watch multiple times will return
105+
// the same context.
106+
func (w *Watcher) Watch(ctx context.Context) context.Context {
107+
w.once.Do(func() {
108+
w.ctx, w.cancel = context.WithCancel(ctx)
109+
go w.watchForNearOOM(ctx)
110+
})
111+
return w.ctx
112+
}
113+
114+
// watchForNearOOM polls the memory.current file on the configured interval
115+
// and cancels the context within Watcher when the system is near OOM.
116+
// It is expected that this function is called in a goroutine. Canceling
117+
// provided context will cause the goroutine to exit.
118+
func (w *Watcher) watchForNearOOM(ctx context.Context) {
119+
t := time.NewTicker(w.interval)
120+
defer t.Stop()
121+
122+
for {
123+
select {
124+
case <-ctx.Done():
125+
w.logger.Info("Shutdown signal received, stopping watch for near OOM")
126+
return
127+
case <-t.C:
128+
current, err := readUintFromFile(w.memoryCurrentPath)
129+
if err != nil {
130+
w.logger.Error(err, "Failed to read current memory usage, skipping check")
131+
continue
132+
}
133+
134+
currentPercentage := float64(current) / float64(w.memoryMax) * 100
135+
if currentPercentage >= float64(w.memoryUsagePercentThreshold) {
136+
w.logger.Info(fmt.Sprintf("Memory usage is near OOM (%s/%s), shutting down",
137+
formatSize(current), formatSize(w.memoryMax)))
138+
w.cancel()
139+
return
140+
}
141+
w.logger.V(2).Info(fmt.Sprintf("Current memory usage %s/%s (%.2f%% out of %d%%)",
142+
formatSize(current), formatSize(w.memoryMax), currentPercentage, w.memoryUsagePercentThreshold))
143+
}
144+
}
145+
}
146+
147+
// readUintFromFile reads an uint64 from the file at the given path.
148+
func readUintFromFile(path string) (uint64, error) {
149+
b, err := os.ReadFile(path)
150+
if err != nil {
151+
return 0, err
152+
}
153+
return strconv.ParseUint(strings.TrimSpace(string(b)), 10, 64)
154+
}
155+
156+
// formatSize formats the given size in bytes to a human-readable format.
157+
func formatSize(b uint64) string {
158+
if b == 0 {
159+
return "-"
160+
}
161+
const unit = 1024
162+
if b < unit {
163+
return fmt.Sprintf("%d B", b)
164+
}
165+
div, exp := uint64(unit), 0
166+
for n := b / unit; n >= unit; n /= unit {
167+
div *= unit
168+
exp++
169+
}
170+
return fmt.Sprintf("%.1f %ciB",
171+
float64(b)/float64(div), "KMGTPE"[exp])
172+
}

0 commit comments

Comments
 (0)