Skip to content

Commit 5db5a77

Browse files
committed
Move retry logic into pkg/libs/endpointcheck
We will need the same check in another PR.
1 parent 76f7fd4 commit 5db5a77

4 files changed

Lines changed: 213 additions & 209 deletions

File tree

pkg/libs/endpointaccessible/endpoint_accessible_controller.go

Lines changed: 7 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ import (
1212

1313
"github.com/cenkalti/backoff/v4"
1414

15-
"github.com/openshift/library-go/pkg/operator/resource/retry"
1615
apierrors "k8s.io/apimachinery/pkg/api/errors"
1716
utilerrors "k8s.io/apimachinery/pkg/util/errors"
1817
"k8s.io/apimachinery/pkg/util/wait"
@@ -22,16 +21,12 @@ import (
2221
"github.com/openshift/library-go/pkg/controller/factory"
2322
"github.com/openshift/library-go/pkg/operator/events"
2423
"github.com/openshift/library-go/pkg/operator/v1helpers"
24+
25+
"github.com/openshift/cluster-authentication-operator/pkg/libs/endpointcheck"
2526
)
2627

27-
// The following constants are put together so that
28-
// all attempts fit safely into resyncInterval.
2928
const (
3029
resyncInterval = 1 * time.Minute
31-
32-
defaultRequestTimeout = 5 * time.Second
33-
defaultRetryInterval = 2 * time.Second
34-
defaultAttemptCount = 3
3530
)
3631

3732
type endpointAccessibleController struct {
@@ -42,16 +37,10 @@ type endpointAccessibleController struct {
4237
availableConditionName string
4338
endpointCheckDisabledFunc EndpointCheckDisabledFunc
4439
// httpClient overrides the default TLS client when set; used in tests.
45-
httpClient *http.Client
46-
// requestTimeout is the per-request context timeout.
47-
// Defaults to defaultRequestTimeout when unset.
40+
httpClient *http.Client
4841
requestTimeout time.Duration
49-
// retryInterval is the sleep duration between retry attempts.
50-
// Defaults to defaultRetryInterval when unset.
51-
retryInterval time.Duration
52-
// attemptCount is the maximum number of fetch+check cycles.
53-
// Defaults to defaultAttemptCount when unset.
54-
attemptCount int
42+
retryInterval time.Duration
43+
attemptCount uint64
5544
}
5645

5746
type EndpointListFunc func() ([]string, error)
@@ -125,26 +114,12 @@ func (c *endpointAccessibleController) sync(ctx context.Context, syncCtx factory
125114
// Retry the full fetch+check cycle so that stale pod IPs from a rolling
126115
// upgrade are replaced with fresh ones as soon as the Endpoints object is
127116
// updated between attempts.
128-
attempts := c.attemptCount
129-
if attempts <= 0 {
130-
attempts = defaultAttemptCount
131-
}
132-
requestTimeout := c.requestTimeout
133-
if requestTimeout <= 0 {
134-
requestTimeout = defaultRequestTimeout
135-
}
136-
retryInterval := c.retryInterval
137-
if retryInterval <= 0 {
138-
retryInterval = defaultRetryInterval
139-
}
140-
141-
// Assemble a check function to use with backoff.Retry.
142117
var (
143118
endpoints []string
144119
endpointListErr error
145120
errs []error
146121
)
147-
checkFn := func() error {
122+
checkFn := func(ctx context.Context, requestTimeout time.Duration) error {
148123
var err error
149124
endpoints, err = c.endpointListFn()
150125
if err != nil {
@@ -199,17 +174,7 @@ func (c *endpointAccessibleController) sync(ctx context.Context, syncCtx factory
199174
return errors.New("no endpoints")
200175
}
201176

202-
// Run checkFn given number of times. Getting a timeout from checkFn causes an immediate retry,
203-
// we don't wait another retryInterval before performing the next check.
204-
skippableBoff := retry.NewSkippableBackOff(backoff.NewConstantBackOff(retryInterval))
205-
boff := backoff.WithContext(backoff.WithMaxRetries(skippableBoff, uint64(attempts-1)), ctx)
206-
_ = backoff.Retry(func() error {
207-
err := checkFn()
208-
if errors.Is(err, context.DeadlineExceeded) {
209-
skippableBoff.SkipNext()
210-
}
211-
return err
212-
}, boff)
177+
_ = endpointcheck.Check(ctx, c.requestTimeout, c.retryInterval, c.attemptCount, checkFn)
213178

214179
if err := endpointListErr; err != nil {
215180
if apierrors.IsNotFound(err) {

pkg/libs/endpointaccessible/endpoint_accessible_controller_test.go

Lines changed: 1 addition & 167 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,14 @@ import (
77
"net/http/httptest"
88
"sync/atomic"
99
"testing"
10-
"testing/synctest"
1110
"time"
1211

1312
clocktesting "k8s.io/utils/clock/testing"
1413

1514
operatorv1 "github.com/openshift/api/operator/v1"
16-
"github.com/openshift/library-go/pkg/operator/v1helpers"
17-
1815
"github.com/openshift/library-go/pkg/controller/factory"
1916
"github.com/openshift/library-go/pkg/operator/events"
17+
"github.com/openshift/library-go/pkg/operator/v1helpers"
2018
)
2119

2220
func newSyncContext(t *testing.T) factory.SyncContext {
@@ -102,73 +100,6 @@ func TestEndpointAccessibleController_sync(t *testing.T) {
102100
}
103101
}
104102

105-
// TestEndpointAccessibleController_sync_retry verifies the retry logic for
// fast (non-timeout) failures: the controller sleeps for retryInterval between
// attempts, and either recovers or gives up after attemptCount tries.
func TestEndpointAccessibleController_sync_retry(t *testing.T) {
	const attemptCount = 3

	tests := []struct {
		name       string
		failFirstN int32 // how many initial requests the server should reject with 500
		wantErr    bool
	}{
		{
			name:       "succeeds on last attempt",
			failFirstN: 2,
			wantErr:    false,
		},
		{
			name:       "fails after all attempts exhausted",
			failFirstN: 3,
			wantErr:    true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// NOTE(review): syncCtx is created outside the synctest bubble —
			// presumably for the same reason documented in the requestTimeout
			// test (factory.NewSyncContext spawns a never-exiting goroutine).
			syncCtx := newSyncContext(t)
			synctest.Test(t, func(t *testing.T) {
				c := &endpointAccessibleController{
					operatorClient: v1helpers.NewFakeOperatorClient(&operatorv1.OperatorSpec{}, &operatorv1.OperatorStatus{}, nil),
					endpointListFn: func() ([]string, error) {
						return []string{"http://example.com"}, nil
					},
					// failFastTransport returns 500 for the first failFirstN
					// requests, then 200 — fast failures, no timeouts involved.
					httpClient:     &http.Client{Transport: &failFastTransport{maxFails: tt.failFirstN}},
					requestTimeout: defaultRequestTimeout,
					retryInterval:  10 * time.Second,
					attemptCount:   attemptCount,
				}

				// Run sync in its own goroutine so the test body can drive the
				// bubble's fake clock while sync blocks in its backoff sleeps.
				start := time.Now()
				done := make(chan error, 1)
				go func() {
					done <- c.sync(context.Background(), syncCtx)
				}()

				// Advance time for each backoff sleep between attempts.
				backoffs := min(int(tt.failFirstN), attemptCount-1)
				for range backoffs {
					synctest.Wait()
					time.Sleep(c.retryInterval + time.Millisecond)
				}
				synctest.Wait()

				err := <-done
				if (err != nil) != tt.wantErr {
					t.Errorf("sync() error = %v, wantErr %v", err, tt.wantErr)
				}

				// Verify that each retry used the backoff sleep.
				elapsed := time.Since(start)
				expectedBackoff := time.Duration(backoffs) * c.retryInterval
				if elapsed < expectedBackoff {
					t.Errorf("elapsed %v < %v; backoff was skipped for fast failures", elapsed, expectedBackoff)
				}
			})
		})
	}
}
171-
172103
// TestEndpointAccessibleController_sync_retryStaleEndpoint verifies that
173104
// endpointListFn is re-invoked on each retry attempt. This covers the upgrade
174105
// scenario where Endpoints/EndpointSlices briefly contain a stale pod IP: the
@@ -212,100 +143,3 @@ func TestEndpointAccessibleController_sync_retryStaleEndpoint(t *testing.T) {
212143
}
213144
}
214145

215-
// TestEndpointAccessibleController_sync_requestTimeout verifies that the
// per-request timeout is enforced, that the retry mechanism handles timed-out
// requests correctly, and that the backoff sleep is skipped after timeouts
// (since the requestTimeout already provided sufficient delay).
func TestEndpointAccessibleController_sync_requestTimeout(t *testing.T) {
	const attemptCount = 3

	tests := []struct {
		name      string
		hangCount int32 // requests that time out before one succeeds
		wantErr   bool
	}{
		{
			name:      "succeeds after one timed-out retry",
			hangCount: 1,
			wantErr:   false,
		},
		{
			name:      "fails after all retries time out",
			hangCount: 3,
			wantErr:   true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Create the sync context outside the bubble: factory.NewSyncContext spawns
			// a background work-queue goroutine that never exits on its own, which would
			// deadlock the synctest bubble.
			syncCtx := newSyncContext(t)
			synctest.Test(t, func(t *testing.T) {
				c := &endpointAccessibleController{
					operatorClient: v1helpers.NewFakeOperatorClient(&operatorv1.OperatorSpec{}, &operatorv1.OperatorStatus{}, nil),
					endpointListFn: func() ([]string, error) {
						return []string{"http://example.com"}, nil
					},
					// hangingTransport blocks the first hangCount requests until
					// their context is canceled, then succeeds with 200 OK.
					httpClient:     &http.Client{Transport: &hangingTransport{maxHangs: tt.hangCount}},
					requestTimeout: defaultRequestTimeout,
					retryInterval:  10 * time.Second, // large — would be visible if not skipped
					attemptCount:   attemptCount,
				}

				// Run sync in its own goroutine so the test body can drive the
				// bubble's fake clock while sync blocks in hung requests.
				start := time.Now()
				done := make(chan error, 1)
				go func() {
					done <- c.sync(context.Background(), syncCtx)
				}()

				// Advance the fake clock past each hung request's deadline so
				// its per-request context expires and the next attempt starts.
				for range tt.hangCount {
					synctest.Wait()
					time.Sleep(c.requestTimeout + time.Millisecond)
				}
				synctest.Wait()

				err := <-done
				if (err != nil) != tt.wantErr {
					t.Errorf("sync() error = %v, wantErr %v", err, tt.wantErr)
				}

				// Elapsed time should be only hangCount * requestTimeout with no
				// retryInterval added — backoff is skipped after timeouts.
				elapsed := time.Since(start)
				maxExpected := time.Duration(tt.hangCount)*(c.requestTimeout+time.Millisecond) + time.Second
				if elapsed > maxExpected {
					t.Errorf("elapsed %v exceeds %v; backoff sleep was not skipped after timeout", elapsed, maxExpected)
				}
			})
		})
	}
}
284-
285-
// hangingTransport simulates a slow endpoint: the first maxHangs requests block
286-
// until their context is canceled; subsequent requests succeed immediately with 200 OK.
287-
type hangingTransport struct {
288-
count atomic.Int32
289-
maxHangs int32
290-
}
291-
292-
func (h *hangingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
293-
if h.count.Add(1) <= h.maxHangs {
294-
<-req.Context().Done()
295-
return nil, req.Context().Err()
296-
}
297-
return &http.Response{StatusCode: http.StatusOK, Body: http.NoBody}, nil
298-
}
299-
300-
// failFastTransport returns 500 for the first maxFails requests, then 200.
301-
type failFastTransport struct {
302-
count atomic.Int32
303-
maxFails int32
304-
}
305-
306-
func (f *failFastTransport) RoundTrip(req *http.Request) (*http.Response, error) {
307-
if f.count.Add(1) <= f.maxFails {
308-
return &http.Response{StatusCode: http.StatusInternalServerError, Body: http.NoBody}, nil
309-
}
310-
return &http.Response{StatusCode: http.StatusOK, Body: http.NoBody}, nil
311-
}

pkg/libs/endpointcheck/check.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
// Package endpointcheck provides a retry-with-backoff loop for endpoint
// health checks. It skips the backoff sleep after context.DeadlineExceeded
// errors (since the request timeout already provided sufficient delay) and
// stops immediately on backoff.Permanent errors.
package endpointcheck

import (
	"context"
	"errors"
	"time"

	"github.com/cenkalti/backoff/v4"

	"github.com/openshift/library-go/pkg/operator/resource/retry"
)

// Defaults applied by Check when the corresponding parameter is unset.
const (
	DefaultRequestTimeout = 5 * time.Second
	DefaultRetryInterval  = 2 * time.Second
	DefaultAttemptCount   = 3
)

// CheckFunc is called on each attempt with the parent context and the
// per-request timeout. Return nil on success, a regular error to trigger
// a retry, or backoff.Permanent(err) to abort immediately.
type CheckFunc func(ctx context.Context, requestTimeout time.Duration) error

// Check runs checkFn up to attemptCount times, sleeping retryInterval between
// attempts. Non-positive durations and a zero attemptCount fall back to their
// Default* constants. If checkFn returns context.DeadlineExceeded the next
// backoff sleep is skipped, since the request timeout already consumed enough
// wall-clock time. The error from the final attempt is returned.
func Check(ctx context.Context, requestTimeout time.Duration, retryInterval time.Duration, attemptCount uint64, checkFn CheckFunc) error {
	// Guard with <= 0 rather than == 0: a negative requestTimeout would make
	// every per-request context expire immediately, and a negative
	// retryInterval would turn the constant backoff into a busy loop. The
	// pre-extraction controller code applied its defaults with <= 0 as well.
	if requestTimeout <= 0 {
		requestTimeout = DefaultRequestTimeout
	}
	if retryInterval <= 0 {
		retryInterval = DefaultRetryInterval
	}
	if attemptCount == 0 {
		attemptCount = DefaultAttemptCount
	}

	// Run checkFn the given number of times. Getting a timeout from checkFn
	// causes an immediate retry; we don't wait another retryInterval before
	// performing the next check. attemptCount-1 is the number of *retries*
	// after the first attempt.
	skippableBoff := retry.NewSkippableBackOff(backoff.NewConstantBackOff(retryInterval))
	boff := backoff.WithContext(backoff.WithMaxRetries(skippableBoff, attemptCount-1), ctx)
	return backoff.Retry(func() error {
		err := checkFn(ctx, requestTimeout)
		if errors.Is(err, context.DeadlineExceeded) {
			skippableBoff.SkipNext()
		}
		return err
	}, boff)
}

0 commit comments

Comments
 (0)