From 7d1f7771c9961dc6607a6ac5894b9c3195c25fcf Mon Sep 17 00:00:00 2001 From: Francis Lavoie Date: Wed, 13 Jul 2022 16:15:00 -0400 Subject: reverseproxy: Implement retry count, alternative to try_duration (#4756) * reverseproxy: Implement retry count, alternative to try_duration * Add Caddyfile support for `retry_match` * Refactor to deduplicate matcher parsing logic * Fix lint --- modules/caddyhttp/reverseproxy/caddyfile.go | 25 ++++++++++ modules/caddyhttp/reverseproxy/reverseproxy.go | 66 ++++++++++++++++++-------- 2 files changed, 71 insertions(+), 20 deletions(-) (limited to 'modules/caddyhttp/reverseproxy') diff --git a/modules/caddyhttp/reverseproxy/caddyfile.go b/modules/caddyhttp/reverseproxy/caddyfile.go index 4fa4be0..a5321d1 100644 --- a/modules/caddyhttp/reverseproxy/caddyfile.go +++ b/modules/caddyhttp/reverseproxy/caddyfile.go @@ -59,8 +59,10 @@ func parseCaddyfile(h httpcaddyfile.Helper) (caddyhttp.MiddlewareHandler, error) // // # load balancing // lb_policy [] +// lb_retries // lb_try_duration // lb_try_interval +// lb_retry_match // // # active health checking // health_uri @@ -247,6 +249,19 @@ func (h *Handler) UnmarshalCaddyfile(d *caddyfile.Dispenser) error { } h.LoadBalancing.SelectionPolicyRaw = caddyconfig.JSONModuleObject(sel, "policy", name, nil) + case "lb_retries": + if !d.NextArg() { + return d.ArgErr() + } + tries, err := strconv.Atoi(d.Val()) + if err != nil { + return d.Errf("bad lb_retries number '%s': %v", d.Val(), err) + } + if h.LoadBalancing == nil { + h.LoadBalancing = new(LoadBalancing) + } + h.LoadBalancing.Retries = tries + case "lb_try_duration": if !d.NextArg() { return d.ArgErr() @@ -273,6 +288,16 @@ func (h *Handler) UnmarshalCaddyfile(d *caddyfile.Dispenser) error { } h.LoadBalancing.TryInterval = caddy.Duration(dur) + case "lb_retry_match": + matcherSet, err := caddyhttp.ParseCaddyfileNestedMatcherSet(d) + if err != nil { + return d.Errf("failed to parse lb_retry_match: %v", err) + } + if h.LoadBalancing == nil { + h.LoadBalancing = new(LoadBalancing) + } + h.LoadBalancing.RetryMatchRaw = append(h.LoadBalancing.RetryMatchRaw, matcherSet) + case "health_uri": if !d.NextArg() { return d.ArgErr() diff --git a/modules/caddyhttp/reverseproxy/reverseproxy.go b/modules/caddyhttp/reverseproxy/reverseproxy.go index 64adce2..7061275 100644 --- a/modules/caddyhttp/reverseproxy/reverseproxy.go +++ b/modules/caddyhttp/reverseproxy/reverseproxy.go @@ -430,12 +430,14 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request, next caddyht // and because we may retry some number of times, carry over the error // from previous tries because of the nuances of load balancing & retries var proxyErr error + var retries int for { var done bool - done, proxyErr = h.proxyLoopIteration(clonedReq, r, w, proxyErr, start, repl, reqHeader, reqHost, next) + done, proxyErr = h.proxyLoopIteration(clonedReq, r, w, proxyErr, start, retries, repl, reqHeader, reqHost, next) if done { break } + retries++ } if proxyErr != nil { @@ -449,7 +451,7 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request, next caddyht // that has to be passed in, we brought this into its own method so that we could run defer more easily. // It returns true when the loop is done and should break; false otherwise. The error value returned should // be assigned to the proxyErr value for the next iteration of the loop (or the error handled after break). -func (h *Handler) proxyLoopIteration(r *http.Request, origReq *http.Request, w http.ResponseWriter, proxyErr error, start time.Time, +func (h *Handler) proxyLoopIteration(r *http.Request, origReq *http.Request, w http.ResponseWriter, proxyErr error, start time.Time, retries int, repl *caddy.Replacer, reqHeader http.Header, reqHost string, next caddyhttp.Handler) (bool, error) { // get the updated list of upstreams upstreams := h.Upstreams @@ -479,7 +481,7 @@ func (h *Handler) proxyLoopIteration(r *http.Request, origReq *http.Request, w h if proxyErr == nil { proxyErr = caddyhttp.Error(http.StatusServiceUnavailable, fmt.Errorf("no upstreams available")) } - if !h.LoadBalancing.tryAgain(h.ctx, start, proxyErr, r) { + if !h.LoadBalancing.tryAgain(h.ctx, start, retries, proxyErr, r) { return true, proxyErr } return false, proxyErr @@ -542,7 +544,7 @@ func (h *Handler) proxyLoopIteration(r *http.Request, origReq *http.Request, w h h.countFailure(upstream) // if we've tried long enough, break - if !h.LoadBalancing.tryAgain(h.ctx, start, proxyErr, r) { + if !h.LoadBalancing.tryAgain(h.ctx, start, retries, proxyErr, r) { return true, proxyErr } @@ -944,16 +946,26 @@ func (h Handler) finalizeResponse( return nil } -// tryAgain takes the time that the handler was initially invoked -// as well as any error currently obtained, and the request being -// tried, and returns true if another attempt should be made at -// proxying the request. If true is returned, it has already blocked -// long enough before the next retry (i.e. no more sleeping is -// needed). If false is returned, the handler should stop trying to -// proxy the request. -func (lb LoadBalancing) tryAgain(ctx caddy.Context, start time.Time, proxyErr error, req *http.Request) bool { +// tryAgain takes the time that the handler was initially invoked, +// the amount of retries already performed, as well as any error +// currently obtained, and the request being tried, and returns +// true if another attempt should be made at proxying the request. +// If true is returned, it has already blocked long enough before +// the next retry (i.e. no more sleeping is needed). If false is +// returned, the handler should stop trying to proxy the request. +func (lb LoadBalancing) tryAgain(ctx caddy.Context, start time.Time, retries int, proxyErr error, req *http.Request) bool { + // no retries are configured + if lb.TryDuration == 0 && lb.Retries == 0 { + return false + } + // if we've tried long enough, break - if time.Since(start) >= time.Duration(lb.TryDuration) { + if lb.TryDuration > 0 && time.Since(start) >= time.Duration(lb.TryDuration) { + return false + } + + // if we've reached the retry limit, break + if lb.Retries > 0 && retries >= lb.Retries { return false } @@ -976,6 +988,11 @@ func (lb LoadBalancing) tryAgain(ctx caddy.Context, start time.Time, proxyErr er } } + // fast path; if the interval is zero, we don't need to wait + if lb.TryInterval == 0 { + return true + } + // otherwise, wait and try the next available host timer := time.NewTimer(time.Duration(lb.TryInterval)) select { @@ -1190,16 +1207,25 @@ type LoadBalancing struct { // The default policy is random selection. SelectionPolicyRaw json.RawMessage `json:"selection_policy,omitempty" caddy:"namespace=http.reverse_proxy.selection_policies inline_key=policy"` + // How many times to retry selecting available backends for each + // request if the next available host is down. If try_duration is + // also configured, then retries may stop early if the duration + // is reached. By default, retries are disabled (zero). + Retries int `json:"retries,omitempty"` + // How long to try selecting available backends for each request - // if the next available host is down. By default, this retry is - // disabled. Clients will wait for up to this long while the load - // balancer tries to find an available upstream host. + // if the next available host is down. Clients will wait for up + // to this long while the load balancer tries to find an available + // upstream host. If retries is also configured, tries may stop + // early if the maximum retries is reached. By default, retries + // are disabled (zero duration). TryDuration caddy.Duration `json:"try_duration,omitempty"` - // How long to wait between selecting the next host from the pool. Default - // is 250ms. Only relevant when a request to an upstream host fails. Be - // aware that setting this to 0 with a non-zero try_duration can cause the - // CPU to spin if all backends are down and latency is very low. + // How long to wait between selecting the next host from the pool. + // Default is 250ms if try_duration is enabled, otherwise zero. Only + // relevant when a request to an upstream host fails. Be aware that + // setting this to 0 with a non-zero try_duration can cause the CPU + // to spin if all backends are down and latency is very low. TryInterval caddy.Duration `json:"try_interval,omitempty"` // A list of matcher sets that restricts with which requests retries are -- cgit v1.2.3