summaryrefslogtreecommitdiff
path: root/modules
diff options
context:
space:
mode:
authorMatthew Holt <mholt@users.noreply.github.com>2020-12-02 13:26:28 -0700
committerMatthew Holt <mholt@users.noreply.github.com>2020-12-02 13:26:28 -0700
commit9157051f45b8c1354b6c8432457ca4930ba90d9e (patch)
treeb92f1a6bcd1057be76ca173417976a4687010811 /modules
parent4cff36d731390915649261f0e9c088be0eeafcf1 (diff)
caddyhttp: Optimize large host matchers
Diffstat (limited to 'modules')
-rw-r--r--modules/caddyhttp/matchers.go68
-rw-r--r--modules/caddyhttp/matchers_test.go27
2 files changed, 95 insertions, 0 deletions
diff --git a/modules/caddyhttp/matchers.go b/modules/caddyhttp/matchers.go
index d641cb6..c5cf21d 100644
--- a/modules/caddyhttp/matchers.go
+++ b/modules/caddyhttp/matchers.go
@@ -23,6 +23,7 @@ import (
"net/url"
"path/filepath"
"regexp"
+ "sort"
"strings"
"github.com/caddyserver/caddy/v2"
@@ -51,6 +52,8 @@ type (
//
// The wildcard can be useful for matching all subdomains, for example:
// `*.example.com` matches `foo.example.com` but not `foo.bar.example.com`.
+ //
+ // Duplicate entries will return an error.
MatchHost []string
// MatchPath matches requests by the URI's path (case-insensitive). Path
@@ -167,6 +170,40 @@ func (m *MatchHost) UnmarshalCaddyfile(d *caddyfile.Dispenser) error {
return nil
}
+// Provision sets up and validates m, including making it more efficient for large lists.
+func (m MatchHost) Provision(_ caddy.Context) error {
+ // check for duplicates; they are nonsensical and reduce efficiency
+ // (we could just remove them, but the user should know their config is erroneous)
+ seen := make(map[string]int)
+ for i, h := range m {
+ h = strings.ToLower(h)
+ if firstI, ok := seen[h]; ok {
+ return fmt.Errorf("host at index %d is repeated at index %d: %s", firstI, i, h)
+ }
+ seen[h] = i
+ }
+
+ if m.large() {
+ // sort the slice lexicographically, grouping "fuzzy" entries (wildcards and placeholders)
+ // at the front of the list; this allows us to use binary search for exact matches, which
+ // we have seen from experience is the most common kind of value in large lists; and any
+ // other kinds of values (wildcards and placeholders) are grouped in front so the linear
+ // search should find a match fairly quickly
+ sort.Slice(m, func(i, j int) bool {
+ iInexact, jInexact := m.fuzzy(m[i]), m.fuzzy(m[j])
+ if iInexact && !jInexact {
+ return true
+ }
+ if !iInexact && jInexact {
+ return false
+ }
+ return m[i] < m[j]
+ })
+ }
+
+ return nil
+}
+
// Match returns true if r matches m.
func (m MatchHost) Match(r *http.Request) bool {
reqHost, _, err := net.SplitHostPort(r.Host)
@@ -179,10 +216,31 @@ func (m MatchHost) Match(r *http.Request) bool {
reqHost = strings.TrimSuffix(reqHost, "]")
}
+ if m.large() {
+ // fast path: locate exact match using binary search (about 100-1000x faster for large lists)
+ pos := sort.Search(len(m), func(i int) bool {
+ if m.fuzzy(m[i]) {
+ return false
+ }
+ return m[i] >= reqHost
+ })
+ if pos < len(m) && m[pos] == reqHost {
+ return true
+ }
+ }
+
repl := r.Context().Value(caddy.ReplacerCtxKey).(*caddy.Replacer)
outer:
for _, host := range m {
+ // fast path: if matcher is large, we already know we don't have an exact
+ // match, so we're only looking for fuzzy match now, which should be at the
+ // front of the list; if we have reached a value that is not fuzzy, there
+ // will be no match and we can short-circuit for efficiency
+ if m.large() && !m.fuzzy(host) {
+ break
+ }
+
host = repl.ReplaceAll(host, "")
if strings.Contains(host, "*") {
patternParts := strings.Split(host, ".")
@@ -207,6 +265,15 @@ outer:
return false
}
+// fuzzy returns true if the given hostname h is not a specific
+// hostname, e.g. has placeholders or wildcards.
+func (MatchHost) fuzzy(h string) bool { return strings.ContainsAny(h, "{*") }
+
+// large returns true if m is considered to be large. Optimizing
+// the matcher for smaller lists has diminishing returns.
+// See related benchmark function in test file to conduct experiments.
+func (m MatchHost) large() bool { return len(m) > 100 }
+
// CaddyModule returns the Caddy module information.
func (MatchPath) CaddyModule() caddy.ModuleInfo {
return caddy.ModuleInfo{
@@ -909,6 +976,7 @@ const regexpPlaceholderPrefix = "http.regexp"
// Interface guards
var (
_ RequestMatcher = (*MatchHost)(nil)
+ _ caddy.Provisioner = (*MatchHost)(nil)
_ RequestMatcher = (*MatchPath)(nil)
_ RequestMatcher = (*MatchPathRE)(nil)
_ caddy.Provisioner = (*MatchPathRE)(nil)
diff --git a/modules/caddyhttp/matchers_test.go b/modules/caddyhttp/matchers_test.go
index 51db67d..ab4487d 100644
--- a/modules/caddyhttp/matchers_test.go
+++ b/modules/caddyhttp/matchers_test.go
@@ -1018,6 +1018,33 @@ func TestNotMatcher(t *testing.T) {
}
}
}
+func BenchmarkLargeHostMatcher(b *testing.B) {
+ // this benchmark simulates a large host matcher (thousands of entries) where each
+ // value is an exact hostname (not a placeholder or wildcard) - compare the results
+ // of this with and without the binary search (comment out the various fast path
+ // sections in Match) to conduct experiments
+
+ const n = 10000
+ lastHost := fmt.Sprintf("%d.example.com", n-1)
+ req := &http.Request{Host: lastHost}
+ repl := caddy.NewReplacer()
+ ctx := context.WithValue(req.Context(), caddy.ReplacerCtxKey, repl)
+ req = req.WithContext(ctx)
+
+ matcher := make(MatchHost, n)
+ for i := 0; i < n; i++ {
+ matcher[i] = fmt.Sprintf("%d.example.com", i)
+ }
+ err := matcher.Provision(caddy.Context{})
+ if err != nil {
+ b.Fatal(err)
+ }
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ matcher.Match(req)
+ }
+}
func BenchmarkHostMatcherWithoutPlaceholder(b *testing.B) {
req := &http.Request{Host: "localhost"}