summaryrefslogtreecommitdiff
path: root/modules/caddyhttp/matchers.go
diff options
context:
space:
mode:
authorMatt Holt <mholt@users.noreply.github.com>2022-08-16 08:48:57 -0600
committerGitHub <noreply@github.com>2022-08-16 08:48:57 -0600
commita479943acd70068c4b80d3a8f4b8dd7ab93ca2ba (patch)
treef50a45b3b0c8c8475b783583967c1175f5e6673c /modules/caddyhttp/matchers.go
parentdc62d468e9645f52a5e1b4f6093dff65137ab3fe (diff)
caddyhttp: Smarter path matching and rewriting (#4948)
Co-authored-by: RussellLuo <luopeng.he@gmail.com>
Diffstat (limited to 'modules/caddyhttp/matchers.go')
-rw-r--r--modules/caddyhttp/matchers.go318
1 files changed, 241 insertions, 77 deletions
diff --git a/modules/caddyhttp/matchers.go b/modules/caddyhttp/matchers.go
index bb957d6..d6bbe7e 100644
--- a/modules/caddyhttp/matchers.go
+++ b/modules/caddyhttp/matchers.go
@@ -23,7 +23,6 @@ import (
"net/textproto"
"net/url"
"path"
- "path/filepath"
"reflect"
"regexp"
"sort"
@@ -63,20 +62,51 @@ type (
// Duplicate entries will return an error.
MatchHost []string
- // MatchPath matches requests by the URI's path (case-insensitive). Path
- // matches are exact, but wildcards may be used:
+ // MatchPath case-insensitively matches requests by the URI's path. Path
+ // matching is exact, not prefix-based, giving you more control and clarity
+ // over matching. Wildcards (`*`) may be used:
//
- // - At the end, for a prefix match (`/prefix/*`)
- // - At the beginning, for a suffix match (`*.suffix`)
- // - On both sides, for a substring match (`*/contains/*`)
+ // - At the end only, for a prefix match (`/prefix/*`)
+ // - At the beginning only, for a suffix match (`*.suffix`)
+ // - On both sides only, for a substring match (`*/contains/*`)
// - In the middle, for a globular match (`/accounts/*/info`)
//
+ // Slashes are significant; i.e. `/foo*` matches `/foo`, `/foo/`, `/foo/bar`,
+ // and `/foobar`; but `/foo/*` does not match `/foo` or `/foobar`. Valid
+ // paths start with a slash `/`.
+ //
+ // Because there are, in general, multiple possible escaped forms of any
+ // path, path matchers operate in unescaped space; that is, path matchers
+ // should be written in their unescaped form to prevent ambiguities and
+ // possible security issues, as all request paths will be normalized to
+ // their unescaped forms before matcher evaluation.
+ //
+ // However, escape sequences in a match pattern are supported; they are
+ // compared with the request's raw/escaped path for those bytes only.
+ // In other words, a matcher of `/foo%2Fbar` will match a request path
+ // of precisely `/foo%2Fbar`, but not `/foo/bar`. It follows that matching
+ // the literal percent sign (%) in normalized space can be done using the
+ // escaped form, `%25`.
+ //
+ // Even though wildcards (`*`) operate in the normalized space, the special
+ // escaped wildcard (`%*`), which is not a valid escape sequence, may be
+ // used in place of a span that should NOT be decoded; that is, `/bands/%*`
+ // will match `/bands/AC%2fDC` whereas `/bands/*` will not.
+ //
+ // Even though path matching is done in normalized space, the special
+ // wildcard `%*` may be used in place of a span that should NOT be decoded;
+ // that is, `/bands/%*/` will match `/bands/AC%2fDC/` whereas `/bands/*/`
+ // will not.
+ //
// This matcher is fast, so it does not support regular expressions or
// capture groups. For slower but more powerful matching, use the
- // path_regexp matcher.
+ // path_regexp matcher. (Note that due to the special treatment of
+ // escape sequences in matcher patterns, they may perform slightly slower
+ // in high-traffic environments.)
MatchPath []string
// MatchPathRE matches requests by a regular expression on the URI's path.
+ // Path matching is performed in the unescaped (decoded) form of the path.
//
// Upon a match, it adds placeholders to the request: `{http.regexp.name.capture_group}`
// where `name` is the regular expression's name, and `capture_group` is either
@@ -303,7 +333,8 @@ outer:
// expression matchers.
//
// Example:
-// expression host('localhost')
+//
+// expression host('localhost')
func (MatchHost) CELLibrary(ctx caddy.Context) (cel.Library, error) {
return CELMatcherImpl(
"host",
@@ -342,6 +373,7 @@ func (MatchPath) CaddyModule() caddy.ModuleInfo {
// Provision lower-cases the paths in m to ensure case-insensitive matching.
func (m MatchPath) Provision(_ caddy.Context) error {
for i := range m {
+ // TODO: if m[i] == "*", put it first and delete all others (will always match)
m[i] = strings.ToLower(m[i])
}
return nil
@@ -349,77 +381,108 @@ func (m MatchPath) Provision(_ caddy.Context) error {
// Match returns true if r matches m.
func (m MatchPath) Match(r *http.Request) bool {
- // PathUnescape returns an error if the escapes aren't
- // well-formed, meaning the count % matches the RFC.
- // Return early if the escape is improper.
- unescapedPath, err := url.PathUnescape(r.URL.Path)
- if err != nil {
- return false
- }
-
- lowerPath := strings.ToLower(unescapedPath)
-
- // Clean the path, merges doubled slashes, etc.
- // This ensures maliciously crafted requests can't bypass
- // the path matcher. See #4407
- lowerPath = path.Clean(lowerPath)
-
- // see #2917; Windows ignores trailing dots and spaces
+ // Even though RFC 9110 says that path matching is case-sensitive
+ // (https://www.rfc-editor.org/rfc/rfc9110.html#section-4.2.3),
+ // we do case-insensitive matching to mitigate security issues
+ // related to differences between operating systems, applications,
+ // etc; if case-sensitive matching is needed, the regex matcher
+ // can be used instead.
+ reqPath := strings.ToLower(r.URL.Path)
+
+ // See #2917; Windows ignores trailing dots and spaces
// when accessing files (sigh), potentially causing a
// security risk (cry) if PHP files end up being served
// as static files, exposing the source code, instead of
- // being matched by *.php to be treated as PHP scripts
- lowerPath = strings.TrimRight(lowerPath, ". ")
-
- // Cleaning may remove the trailing slash, but we want to keep it
- if lowerPath != "/" && strings.HasSuffix(r.URL.Path, "/") {
- lowerPath = lowerPath + "/"
- }
+ // being matched by *.php to be treated as PHP scripts.
+ reqPath = strings.TrimRight(reqPath, ". ")
repl := r.Context().Value(caddy.ReplacerCtxKey).(*caddy.Replacer)
- for _, matchPath := range m {
- matchPath = repl.ReplaceAll(matchPath, "")
+ for _, matchPattern := range m {
+ matchPattern = repl.ReplaceAll(matchPattern, "")
// special case: whole path is wildcard; this is unnecessary
// as it matches all requests, which is the same as no matcher
- if matchPath == "*" {
+ if matchPattern == "*" {
return true
}
- // special case: first and last characters are wildcard,
- // treat it as a fast substring match
- if len(matchPath) > 1 &&
- strings.HasPrefix(matchPath, "*") &&
- strings.HasSuffix(matchPath, "*") {
- if strings.Contains(lowerPath, matchPath[1:len(matchPath)-1]) {
+ // Clean the path, merge doubled slashes, etc.
+ // This ensures maliciously crafted requests can't bypass
+ // the path matcher. See #4407. Good security posture
+ // requires that we should do all we can to reduce any
+ // funny-looking paths into "normalized" forms such that
+ // weird variants can't sneak by.
+ //
+ // How we clean the path depends on the kind of pattern:
+ // we either merge slashes or we don't. If the pattern
+ // has double slashes, we preserve them in the path.
+ //
+ // TODO: Despite the fact that the *vast* majority of path
+ // matchers have only 1 pattern, a possible optimization is
+ // to remember the cleaned form of the path for future
+ // iterations; it's just that the way we clean depends on
+ // the kind of pattern.
+
+ mergeSlashes := !strings.Contains(matchPattern, "//")
+
+ // if '%' appears in the match pattern, we interpret that to mean
+ // the intent is to compare that part of the path in raw/escaped
+ // space; i.e. "%40"=="%40", not "@", and "%2F"=="%2F", not "/"
+ if strings.Contains(matchPattern, "%") {
+ reqPathForPattern := CleanPath(r.URL.EscapedPath(), mergeSlashes)
+ if m.matchPatternWithEscapeSequence(reqPathForPattern, matchPattern) {
return true
}
+
+ // doing prefix/suffix/substring matches doesn't make sense
continue
}
- // special case: first character is a wildcard,
- // treat it as a fast suffix match
- if strings.HasPrefix(matchPath, "*") {
- if strings.HasSuffix(lowerPath, matchPath[1:]) {
+ reqPathForPattern := CleanPath(reqPath, mergeSlashes)
+
+ // for substring, prefix, and suffix matching, only perform those
+ // special, fast matches if they are the only wildcards in the pattern;
+ // otherwise we assume a globular match if any * appears in the middle
+
+ // special case: first and last characters are wildcard,
+ // treat it as a fast substring match
+ if strings.Count(matchPattern, "*") == 2 &&
+ strings.HasPrefix(matchPattern, "*") &&
+ strings.HasSuffix(matchPattern, "*") &&
+ strings.Count(matchPattern, "*") == 2 {
+ if strings.Contains(reqPathForPattern, matchPattern[1:len(matchPattern)-1]) {
return true
}
continue
}
- // special case: last character is a wildcard,
- // treat it as a fast prefix match
- if strings.HasSuffix(matchPath, "*") {
- if strings.HasPrefix(lowerPath, matchPath[:len(matchPath)-1]) {
- return true
+ // only perform prefix/suffix match if it is the only wildcard...
+ // I think that is more correct most of the time
+ if strings.Count(matchPattern, "*") == 1 {
+ // special case: first character is a wildcard,
+ // treat it as a fast suffix match
+ if strings.HasPrefix(matchPattern, "*") {
+ if strings.HasSuffix(reqPathForPattern, matchPattern[1:]) {
+ return true
+ }
+ continue
+ }
+
+ // special case: last character is a wildcard,
+ // treat it as a fast prefix match
+ if strings.HasSuffix(matchPattern, "*") {
+ if strings.HasPrefix(reqPathForPattern, matchPattern[:len(matchPattern)-1]) {
+ return true
+ }
+ continue
}
- continue
}
- // for everything else, try globular matching, which also
- // is exact matching if there are no glob/wildcard chars;
- // can ignore error here because we can't handle it anyway
- matches, _ := filepath.Match(matchPath, lowerPath)
+ // at last, use globular matching, which also is exact matching
+ // if there are no glob/wildcard chars; we ignore the error here
+ // because we can't handle it anyway
+ matches, _ := path.Match(matchPattern, reqPathForPattern)
if matches {
return true
}
@@ -427,11 +490,118 @@ func (m MatchPath) Match(r *http.Request) bool {
return false
}
+func (MatchPath) matchPatternWithEscapeSequence(escapedPath, matchPath string) bool {
+ // We would just compare the pattern against r.URL.Path,
+ // but the pattern contains %, indicating that we should
+ // compare at least some part of the path in raw/escaped
+ // space, not normalized space; so we build the string we
+ // will compare against by adding the normalized parts
+ // of the path, then switching to the escaped parts where
+ // the pattern hints to us wherever % is present.
+ var sb strings.Builder
+
+ // iterate the pattern and escaped path in lock-step;
+ // increment iPattern every time we consume a char from the pattern,
+ // increment iPath every time we consume a char from the path;
+ // iPattern and iPath are our cursors/iterator positions for each string
+ var iPattern, iPath int
+ for {
+ if iPattern >= len(matchPath) || iPath >= len(escapedPath) {
+ break
+ }
+
+ // get the next character from the request path
+
+ pathCh := string(escapedPath[iPath])
+ var escapedPathCh string
+
+ // normalize (decode) escape sequences
+ if pathCh == "%" && len(escapedPath) >= iPath+3 {
+ // hold onto this in case we find out the intent is to match in escaped space here;
+ // we lowercase it even though technically the spec says: "For consistency, URI
+ // producers and normalizers should use uppercase hexadecimal digits for all percent-
+ // encodings" (RFC 3986 section 2.1) - we lowercased the matcher pattern earlier in
+ // provisioning so we do the same here to gain case-insensitivity in equivalence;
+ // besides, this string is never shown visibly
+ escapedPathCh = strings.ToLower(escapedPath[iPath : iPath+3])
+
+ var err error
+ pathCh, err = url.PathUnescape(escapedPathCh)
+ if err != nil {
+ // should be impossible unless EscapedPath() is giving us an invalid sequence!
+ return false
+ }
+ iPath += 2 // escape sequence is 2 bytes longer than normal char
+ }
+
+ // now get the next character from the pattern
+
+ normalize := true
+ switch matchPath[iPattern] {
+ case '%':
+ // escape sequence
+
+ // if not a wildcard ("%*"), compare literally; consume next two bytes of pattern
+ if len(matchPath) >= iPattern+3 && matchPath[iPattern+1] != '*' {
+ sb.WriteString(escapedPathCh)
+ iPath++
+ iPattern += 2
+ break
+ }
+
+ // escaped wildcard sequence; consume next byte only ('*')
+ iPattern++
+ normalize = false
+
+ fallthrough
+ case '*':
+ // wildcard, so consume until next matching character
+ remaining := escapedPath[iPath:]
+ until := len(escapedPath) - iPath // go until end of string...
+ if iPattern < len(matchPath)-1 { // ...unless the * is not at the end
+ nextCh := matchPath[iPattern+1]
+ until = strings.IndexByte(remaining, nextCh)
+ if until == -1 {
+ // terminating char of wildcard span not found, so definitely no match
+ return false
+ }
+ }
+ if until == 0 {
+ // empty span; nothing to add on this iteration
+ break
+ }
+ next := remaining[:until]
+ if normalize {
+ var err error
+ next, err = url.PathUnescape(next)
+ if err != nil {
+ return false // should be impossible anyway
+ }
+ }
+ sb.WriteString(next)
+ iPath += until
+ default:
+ sb.WriteString(pathCh)
+ iPath++
+ }
+
+ iPattern++
+ }
+
+ // we can now treat rawpath globs (%*) as regular globs (*)
+ matchPath = strings.ReplaceAll(matchPath, "%*", "*")
+
+ // ignore error here because we can't handle it anyway=
+ matches, _ := path.Match(matchPath, sb.String())
+ return matches
+}
+
// CELLibrary produces options that expose this matcher for use in CEL
// expression matchers.
//
// Example:
-// expression path('*substring*', '*suffix')
+//
+// expression path('*substring*', '*suffix')
func (MatchPath) CELLibrary(ctx caddy.Context) (cel.Library, error) {
return CELMatcherImpl(
// name of the macro, this is the function name that users see when writing expressions.
@@ -477,23 +647,10 @@ func (MatchPathRE) CaddyModule() caddy.ModuleInfo {
func (m MatchPathRE) Match(r *http.Request) bool {
repl := r.Context().Value(caddy.ReplacerCtxKey).(*caddy.Replacer)
- // PathUnescape returns an error if the escapes aren't
- // well-formed, meaning the count % matches the RFC.
- // Return early if the escape is improper.
- unescapedPath, err := url.PathUnescape(r.URL.Path)
- if err != nil {
- return false
- }
-
// Clean the path, merges doubled slashes, etc.
// This ensures maliciously crafted requests can't bypass
// the path matcher. See #4407
- cleanedPath := path.Clean(unescapedPath)
-
- // Cleaning may remove the trailing slash, but we want to keep it
- if cleanedPath != "/" && strings.HasSuffix(r.URL.Path, "/") {
- cleanedPath = cleanedPath + "/"
- }
+ cleanedPath := cleanPath(r.URL.Path)
return m.MatchRegexp.Match(cleanedPath, repl)
}
@@ -502,7 +659,8 @@ func (m MatchPathRE) Match(r *http.Request) bool {
// expression matchers.
//
// Example:
-// expression path_regexp('^/bar')
+//
+// expression path_regexp('^/bar')
func (MatchPathRE) CELLibrary(ctx caddy.Context) (cel.Library, error) {
unnamedPattern, err := CELMatcherImpl(
"path_regexp",
@@ -575,7 +733,8 @@ func (m MatchMethod) Match(r *http.Request) bool {
// expression matchers.
//
// Example:
-// expression method('PUT', 'POST')
+//
+// expression method('PUT', 'POST')
func (MatchMethod) CELLibrary(_ caddy.Context) (cel.Library, error) {
return CELMatcherImpl(
"method",
@@ -661,7 +820,8 @@ func (m MatchQuery) Match(r *http.Request) bool {
// expression matchers.
//
// Example:
-// expression query({'sort': 'asc'}) || query({'foo': ['*bar*', 'baz']})
+//
+// expression query({'sort': 'asc'}) || query({'foo': ['*bar*', 'baz']})
func (MatchQuery) CELLibrary(_ caddy.Context) (cel.Library, error) {
return CELMatcherImpl(
"query",
@@ -736,8 +896,9 @@ func (m MatchHeader) Match(r *http.Request) bool {
// expression matchers.
//
// Example:
-// expression header({'content-type': 'image/png'})
-// expression header({'foo': ['bar', 'baz']}) // match bar or baz
+//
+// expression header({'content-type': 'image/png'})
+// expression header({'foo': ['bar', 'baz']}) // match bar or baz
func (MatchHeader) CELLibrary(_ caddy.Context) (cel.Library, error) {
return CELMatcherImpl(
"header",
@@ -894,7 +1055,8 @@ func (m MatchHeaderRE) Validate() error {
// expression matchers.
//
// Example:
-// expression header_regexp('foo', 'Field', 'fo+')
+//
+// expression header_regexp('foo', 'Field', 'fo+')
func (MatchHeaderRE) CELLibrary(ctx caddy.Context) (cel.Library, error) {
unnamedPattern, err := CELMatcherImpl(
"header_regexp",
@@ -978,7 +1140,8 @@ func (m *MatchProtocol) UnmarshalCaddyfile(d *caddyfile.Dispenser) error {
// expression matchers.
//
// Example:
-// expression protocol('https')
+//
+// expression protocol('https')
func (MatchProtocol) CELLibrary(_ caddy.Context) (cel.Library, error) {
return CELMatcherImpl(
"protocol",
@@ -1097,7 +1260,8 @@ func (m *MatchRemoteIP) UnmarshalCaddyfile(d *caddyfile.Dispenser) error {
// expression matchers.
//
// Example:
-// expression remote_ip('forwarded', '192.168.0.0/16', '172.16.0.0/12', '10.0.0.0/8')
+//
+// expression remote_ip('forwarded', '192.168.0.0/16', '172.16.0.0/12', '10.0.0.0/8')
func (MatchRemoteIP) CELLibrary(ctx caddy.Context) (cel.Library, error) {
return CELMatcherImpl(
// name of the macro, this is the function name that users see when writing expressions.