Skip to content
This repository was archived by the owner on Apr 18, 2024. It is now read-only.

Commit a518e61

Browse files
committed
refactor: RetryAfter and CID cooldown
- human-readable retry-after errors - adjusted CID cooldown/failure cache to minimize end user impact (1m delay is not the best, but way better than 5m which is plain user hostile – context: ipni/storetheindex#1344) - easier to read and reason about durations, prepare for upstream support of RetryAfter erroris in go-libipfs/gateway
1 parent 30711cb commit a518e61

File tree

3 files changed

+30
-23
lines changed

3 files changed

+30
-23
lines changed

caboose.go

+16-8
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,10 @@ const DefaultPoolRefreshInterval = 5 * time.Minute
8383

8484
// we cool off sending requests to Saturn for a cid for a certain duration
8585
// if we've seen a certain number of failures for it already in a given duration.
86-
const DefaultMaxCidFailures = 3
87-
const DefaultCidCoolDownDuration = 5 * time.Minute
86+
// NOTE: before getting creative here, make sure you dont break end user flow
87+
// described in https://github.com/ipni/storetheindex/pull/1344
88+
const DefaultMaxCidFailures = 3 * DefaultMaxRetries // this has to fail more than DefaultMaxRetries done for a single gateway request
89+
const DefaultCidCoolDownDuration = 1 * time.Minute // how long will a sane person wait and stare at blank screen with "retry later" error before hitting F5?
8890

8991
// we cool off sending requests to a Saturn node if it returns transient errors rather than immediately downvoting it;
9092
// however, only upto a certain max number of cool-offs.
@@ -98,21 +100,27 @@ var ErrContentProviderNotFound error = errors.New("saturn failed to find content
98100
var ErrSaturnTimeout error = errors.New("saturn backend timed out")
99101

100102
type ErrSaturnTooManyRequests struct {
101-
Node string
102-
RetryAfterMs int64
103+
Node string
104+
RetryAfter time.Duration // TODO: DRY refactor after https://github.com/ipfs/go-libipfs/issues/188
103105
}
104106

105107
func (e ErrSaturnTooManyRequests) Error() string {
106-
return fmt.Sprintf("saturn node %s returned `too many requests` error, please retry after %d milliseconds", e.Node, e.RetryAfterMs)
108+
return fmt.Sprintf("saturn node %s returned Too Many Requests error, please retry after %s", e.Node, humanRetry(e.RetryAfter))
107109
}
108110

109111
type ErrCidCoolDown struct {
110-
Cid cid.Cid
111-
RetryAfterMs int64
112+
Cid cid.Cid
113+
RetryAfter time.Duration // TODO: DRY refactor after https://github.com/ipfs/go-libipfs/issues/188
112114
}
113115

114116
func (e *ErrCidCoolDown) Error() string {
115-
return fmt.Sprintf("multiple saturn retrieval failures seen for CID %s, please retry after %d milliseconds", e.Cid, e.RetryAfterMs)
117+
return fmt.Sprintf("multiple saturn retrieval failures seen for CID %s, please retry after %s", e.Cid, humanRetry(e.RetryAfter))
118+
}
119+
120+
// TODO: move this to upstream error interface in https://github.com/ipfs/go-libipfs/issues/188
121+
// and refactor ErrCidCoolDown and ErrSaturnTooManyRequests to inherit from that instead
122+
func humanRetry(d time.Duration) string {
123+
return d.Truncate(time.Second).String()
116124
}
117125

118126
type Caboose struct {

failure_test.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ import (
1616
)
1717

1818
var maxCabooseWeight = 20
19-
var retryAfterMs = 1000
19+
var expRetryAfter = 1 * time.Second
2020

2121
func TestHttp429(t *testing.T) {
2222
ctx := context.Background()
@@ -31,7 +31,7 @@ func TestHttp429(t *testing.T) {
3131
require.Error(t, err)
3232

3333
ferr := err.(*caboose.ErrSaturnTooManyRequests)
34-
require.EqualValues(t, retryAfterMs, ferr.RetryAfterMs)
34+
require.EqualValues(t, expRetryAfter, ferr.RetryAfter)
3535
}
3636

3737
func TestCabooseTransientFailures(t *testing.T) {
@@ -165,7 +165,7 @@ func (ch *CabooseHarness) fetchAndAssertCoolDownError(t *testing.T, ctx context.
165165
coolDownErr, ok := err.(*caboose.ErrCidCoolDown)
166166
require.True(t, ok)
167167
require.EqualValues(t, cid, coolDownErr.Cid)
168-
require.NotZero(t, coolDownErr.RetryAfterMs)
168+
require.NotZero(t, coolDownErr.RetryAfter)
169169
}
170170

171171
func (ch *CabooseHarness) fetchAndAssertFailure(t *testing.T, ctx context.Context, testCid cid.Cid, contains string) {

pool.go

+11-12
Original file line numberDiff line numberDiff line change
@@ -285,8 +285,8 @@ func (p *pool) fetchWith(ctx context.Context, c cid.Cid, with string) (blk block
285285

286286
expireAt := at.(time.Time)
287287
return nil, &ErrCidCoolDown{
288-
Cid: c,
289-
RetryAfterMs: time.Until(expireAt).Milliseconds(),
288+
Cid: c,
289+
RetryAfter: time.Until(expireAt),
290290
}
291291
}
292292
p.cidLk.RUnlock()
@@ -438,8 +438,8 @@ func (p *pool) fetchAndUpdate(ctx context.Context, node string, c cid.Cid, attem
438438
var errTooManyRequests ErrSaturnTooManyRequests
439439
if errors.As(err, &errTooManyRequests) {
440440
err = &ErrSaturnTooManyRequests{
441-
Node: errTooManyRequests.Node,
442-
RetryAfterMs: errTooManyRequests.RetryAfterMs,
441+
Node: errTooManyRequests.Node,
442+
RetryAfter: errTooManyRequests.RetryAfter,
443443
}
444444
if ok := p.isCoolOffLocked(node); ok {
445445
return
@@ -642,20 +642,19 @@ func (p *pool) doFetch(ctx context.Context, from string, c cid.Cid, attempt int)
642642

643643
if resp.StatusCode != http.StatusOK {
644644
if resp.StatusCode == http.StatusTooManyRequests {
645-
rs := int64(0)
646-
retryAfter := resp.Header.Get(saturnRetryAfterKey)
647-
if len(retryAfter) != 0 {
648-
seconds, err := strconv.ParseInt(retryAfter, 10, 64)
645+
var retryAfter time.Duration
646+
if strnRetryHint := resp.Header.Get(saturnRetryAfterKey); strnRetryHint != "" {
647+
seconds, err := strconv.ParseInt(strnRetryHint, 10, 64)
649648
if err == nil {
650-
rs = seconds * 1000
649+
retryAfter = time.Duration(seconds) * time.Second
651650
}
652651
}
653652

654-
if rs == 0 {
655-
rs = p.config.SaturnNodeCoolOff.Milliseconds()
653+
if retryAfter == 0 {
654+
retryAfter = p.config.SaturnNodeCoolOff
656655
}
657656

658-
return nil, fmt.Errorf("http error from strn: %d, err=%w", resp.StatusCode, ErrSaturnTooManyRequests{RetryAfterMs: rs, Node: from})
657+
return nil, fmt.Errorf("http error from strn: %d, err=%w", resp.StatusCode, ErrSaturnTooManyRequests{RetryAfter: retryAfter, Node: from})
659658
}
660659

661660
if resp.StatusCode == http.StatusGatewayTimeout {

0 commit comments

Comments
 (0)