Skip to content

Commit 2456763

Browse files
committed
Early stall condition detection after remediation
Detect stall condition due to exhausted remediation retry right after remediating. This helps return from AtomicRelease.Reconcile() with proper stalled status condition and error. Without this, after remediation, a stalled condition detection required a new reconciliation, leaving the status of the object without any Reconciling or Stalled condition. Signed-off-by: Sunny <darkowlzz@protonmail.com>
1 parent 6921825 commit 2456763

File tree

2 files changed

+63
-1
lines changed

2 files changed

+63
-1
lines changed

internal/reconcile/atomic_release.go

+9-1
Original file line numberDiff line numberDiff line change
@@ -262,10 +262,18 @@ func (r *AtomicRelease) Reconcile(ctx context.Context, req *Request) error {
262262
"instructed to stop after running %s action reconciler %s", next.Type(), next.Name()),
263263
)
264264

265-
if remediation := req.Object.GetActiveRemediation(); remediation == nil || !remediation.RetriesExhausted(req.Object) {
265+
remediation := req.Object.GetActiveRemediation()
266+
if remediation == nil || !remediation.RetriesExhausted(req.Object) {
266267
conditions.MarkReconciling(req.Object, meta.ProgressingWithRetryReason, conditions.GetMessage(req.Object, meta.ReadyCondition))
267268
return ErrMustRequeue
268269
}
270+
// Check if retries have exhausted after remediation for early
271+
// stall condition detection.
272+
if remediation != nil && remediation.RetriesExhausted(req.Object) {
273+
conditions.MarkStalled(req.Object, "RetriesExceeded", "Failed to %s after %d attempt(s)",
274+
req.Object.Status.LastAttemptedReleaseAction, req.Object.GetActiveRemediation().GetFailureCount(req.Object))
275+
return ErrExceededMaxRetries
276+
}
269277

270278
conditions.Delete(req.Object, meta.ReconcilingCondition)
271279
return nil

internal/reconcile/atomic_release_test.go

+54
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,7 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) {
629629
release.ObservedToSnapshot(release.ObserveRelease(releases[0])),
630630
}
631631
},
632+
wantErr: ErrExceededMaxRetries,
632633
},
633634
{
634635
name: "install test failure with remediation",
@@ -654,6 +655,7 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) {
654655
snap,
655656
}
656657
},
658+
wantErr: ErrExceededMaxRetries,
657659
},
658660
{
659661
name: "install test failure with test ignore",
@@ -759,6 +761,7 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) {
759761
release.ObservedToSnapshot(release.ObserveRelease(releases[0])),
760762
}
761763
},
764+
wantErr: ErrExceededMaxRetries,
762765
},
763766
{
764767
name: "upgrade failure with uninstall remediation",
@@ -799,6 +802,7 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) {
799802
release.ObservedToSnapshot(release.ObserveRelease(releases[0])),
800803
}
801804
},
805+
wantErr: ErrExceededMaxRetries,
802806
},
803807
{
804808
name: "upgrade test failure with remediation",
@@ -841,6 +845,7 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) {
841845
release.ObservedToSnapshot(release.ObserveRelease(releases[0])),
842846
}
843847
},
848+
wantErr: ErrExceededMaxRetries,
844849
},
845850
{
846851
name: "upgrade test failure with test ignore",
@@ -928,6 +933,55 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) {
928933
},
929934
wantErr: ErrExceededMaxRetries,
930935
},
936+
{
937+
name: "upgrade remediation results in exhausted retries",
938+
releases: func(namespace string) []*helmrelease.Release {
939+
return []*helmrelease.Release{
940+
testutil.BuildRelease(&helmrelease.MockReleaseOptions{
941+
Name: mockReleaseName,
942+
Namespace: namespace,
943+
Version: 1,
944+
Chart: testutil.BuildChart(),
945+
Status: helmrelease.StatusSuperseded,
946+
}),
947+
testutil.BuildRelease(&helmrelease.MockReleaseOptions{
948+
Name: mockReleaseName,
949+
Namespace: namespace,
950+
Version: 2,
951+
Chart: testutil.BuildChart(),
952+
Status: helmrelease.StatusSuperseded,
953+
}),
954+
testutil.BuildRelease(&helmrelease.MockReleaseOptions{
955+
Name: mockReleaseName,
956+
Namespace: namespace,
957+
Version: 3,
958+
Chart: testutil.BuildChart(),
959+
Status: helmrelease.StatusFailed,
960+
}),
961+
}
962+
},
963+
spec: func(spec *v2.HelmReleaseSpec) {
964+
spec.Upgrade = &v2.Upgrade{
965+
Remediation: &v2.UpgradeRemediation{
966+
Retries: 1,
967+
},
968+
}
969+
},
970+
status: func(namespace string, releases []*helmrelease.Release) v2.HelmReleaseStatus {
971+
return v2.HelmReleaseStatus{
972+
History: v2.Snapshots{
973+
release.ObservedToSnapshot(release.ObserveRelease(releases[2])),
974+
release.ObservedToSnapshot(release.ObserveRelease(releases[1])),
975+
release.ObservedToSnapshot(release.ObserveRelease(releases[0])),
976+
},
977+
LastAttemptedReleaseAction: v2.ReleaseActionUpgrade,
978+
Failures: 2,
979+
UpgradeFailures: 2,
980+
}
981+
},
982+
chart: testutil.BuildChart(),
983+
wantErr: ErrExceededMaxRetries,
984+
},
931985
}
932986
for _, tt := range tests {
933987
t.Run(tt.name, func(t *testing.T) {

0 commit comments

Comments
 (0)