diff options
| author | Matt Holt <mholt@users.noreply.github.com> | 2022-08-16 08:48:57 -0600 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2022-08-16 08:48:57 -0600 | 
| commit | a479943acd70068c4b80d3a8f4b8dd7ab93ca2ba (patch) | |
| tree | f50a45b3b0c8c8475b783583967c1175f5e6673c /modules/caddyhttp/matchers.go | |
| parent | dc62d468e9645f52a5e1b4f6093dff65137ab3fe (diff) | |
caddyhttp: Smarter path matching and rewriting (#4948)
Co-authored-by: RussellLuo <luopeng.he@gmail.com>
Diffstat (limited to 'modules/caddyhttp/matchers.go')
| -rw-r--r-- | modules/caddyhttp/matchers.go | 318 | 
1 files changed, 241 insertions, 77 deletions
| diff --git a/modules/caddyhttp/matchers.go b/modules/caddyhttp/matchers.go index bb957d6..d6bbe7e 100644 --- a/modules/caddyhttp/matchers.go +++ b/modules/caddyhttp/matchers.go @@ -23,7 +23,6 @@ import (  	"net/textproto"  	"net/url"  	"path" -	"path/filepath"  	"reflect"  	"regexp"  	"sort" @@ -63,20 +62,51 @@ type (  	// Duplicate entries will return an error.  	MatchHost []string -	// MatchPath matches requests by the URI's path (case-insensitive). Path -	// matches are exact, but wildcards may be used: +	// MatchPath case-insensitively matches requests by the URI's path. Path +	// matching is exact, not prefix-based, giving you more control and clarity +	// over matching. Wildcards (`*`) may be used:  	// -	// - At the end, for a prefix match (`/prefix/*`) -	// - At the beginning, for a suffix match (`*.suffix`) -	// - On both sides, for a substring match (`*/contains/*`) +	// - At the end only, for a prefix match (`/prefix/*`) +	// - At the beginning only, for a suffix match (`*.suffix`) +	// - On both sides only, for a substring match (`*/contains/*`)  	// - In the middle, for a globular match (`/accounts/*/info`)  	// +	// Slashes are significant; i.e. `/foo*` matches `/foo`, `/foo/`, `/foo/bar`, +	// and `/foobar`; but `/foo/*` does not match `/foo` or `/foobar`. Valid +	// paths start with a slash `/`. +	// +	// Because there are, in general, multiple possible escaped forms of any +	// path, path matchers operate in unescaped space; that is, path matchers +	// should be written in their unescaped form to prevent ambiguities and +	// possible security issues, as all request paths will be normalized to +	// their unescaped forms before matcher evaluation. +	// +	// However, escape sequences in a match pattern are supported; they are +	// compared with the request's raw/escaped path for those bytes only. +	// In other words, a matcher of `/foo%2Fbar` will match a request path +	// of precisely `/foo%2Fbar`, but not `/foo/bar`. It follows that matching +	// the literal percent sign (%) in normalized space can be done using the +	// escaped form, `%25`. +	// +	// Even though wildcards (`*`) operate in the normalized space, the special +	// escaped wildcard (`%*`), which is not a valid escape sequence, may be +	// used in place of a span that should NOT be decoded; that is, `/bands/%*` +	// will match `/bands/AC%2fDC` whereas `/bands/*` will not. +	// +	// Even though path matching is done in normalized space, the special +	// wildcard `%*` may be used in place of a span that should NOT be decoded; +	// that is, `/bands/%*/` will match `/bands/AC%2fDC/` whereas `/bands/*/` +	// will not. +	//  	// This matcher is fast, so it does not support regular expressions or  	// capture groups. For slower but more powerful matching, use the -	// path_regexp matcher. +	// path_regexp matcher. (Note that due to the special treatment of +	// escape sequences in matcher patterns, they may perform slightly slower +	// in high-traffic environments.)  	MatchPath []string  	// MatchPathRE matches requests by a regular expression on the URI's path. +	// Path matching is performed in the unescaped (decoded) form of the path.  	//  	// Upon a match, it adds placeholders to the request: `{http.regexp.name.capture_group}`  	// where `name` is the regular expression's name, and `capture_group` is either @@ -303,7 +333,8 @@ outer:  // expression matchers.  //  // Example: -//    expression host('localhost') +// +//	expression host('localhost')  func (MatchHost) CELLibrary(ctx caddy.Context) (cel.Library, error) {  	return CELMatcherImpl(  		"host", @@ -342,6 +373,7 @@ func (MatchPath) CaddyModule() caddy.ModuleInfo {  // Provision lower-cases the paths in m to ensure case-insensitive matching.  func (m MatchPath) Provision(_ caddy.Context) error {  	for i := range m { +		// TODO: if m[i] == "*", put it first and delete all others (will always match)  		m[i] = strings.ToLower(m[i])  	}  	return nil @@ -349,77 +381,108 @@ func (m MatchPath) Provision(_ caddy.Context) error {  // Match returns true if r matches m.  func (m MatchPath) Match(r *http.Request) bool { -	// PathUnescape returns an error if the escapes aren't -	// well-formed, meaning the count % matches the RFC. -	// Return early if the escape is improper. -	unescapedPath, err := url.PathUnescape(r.URL.Path) -	if err != nil { -		return false -	} - -	lowerPath := strings.ToLower(unescapedPath) - -	// Clean the path, merges doubled slashes, etc. -	// This ensures maliciously crafted requests can't bypass -	// the path matcher. See #4407 -	lowerPath = path.Clean(lowerPath) - -	// see #2917; Windows ignores trailing dots and spaces +	// Even though RFC 9110 says that path matching is case-sensitive +	// (https://www.rfc-editor.org/rfc/rfc9110.html#section-4.2.3), +	// we do case-insensitive matching to mitigate security issues +	// related to differences between operating systems, applications, +	// etc; if case-sensitive matching is needed, the regex matcher +	// can be used instead. +	reqPath := strings.ToLower(r.URL.Path) + +	// See #2917; Windows ignores trailing dots and spaces  	// when accessing files (sigh), potentially causing a  	// security risk (cry) if PHP files end up being served  	// as static files, exposing the source code, instead of -	// being matched by *.php to be treated as PHP scripts -	lowerPath = strings.TrimRight(lowerPath, ". ") - -	// Cleaning may remove the trailing slash, but we want to keep it -	if lowerPath != "/" && strings.HasSuffix(r.URL.Path, "/") { -		lowerPath = lowerPath + "/" -	} +	// being matched by *.php to be treated as PHP scripts. +	reqPath = strings.TrimRight(reqPath, ". ")  	repl := r.Context().Value(caddy.ReplacerCtxKey).(*caddy.Replacer) -	for _, matchPath := range m { -		matchPath = repl.ReplaceAll(matchPath, "") +	for _, matchPattern := range m { +		matchPattern = repl.ReplaceAll(matchPattern, "")  		// special case: whole path is wildcard; this is unnecessary  		// as it matches all requests, which is the same as no matcher -		if matchPath == "*" { +		if matchPattern == "*" {  			return true  		} -		// special case: first and last characters are wildcard, -		// treat it as a fast substring match -		if len(matchPath) > 1 && -			strings.HasPrefix(matchPath, "*") && -			strings.HasSuffix(matchPath, "*") { -			if strings.Contains(lowerPath, matchPath[1:len(matchPath)-1]) { +		// Clean the path, merge doubled slashes, etc. +		// This ensures maliciously crafted requests can't bypass +		// the path matcher. See #4407. Good security posture +		// requires that we should do all we can to reduce any +		// funny-looking paths into "normalized" forms such that +		// weird variants can't sneak by. +		// +		// How we clean the path depends on the kind of pattern: +		// we either merge slashes or we don't. If the pattern +		// has double slashes, we preserve them in the path. +		// +		// TODO: Despite the fact that the *vast* majority of path +		// matchers have only 1 pattern, a possible optimization is +		// to remember the cleaned form of the path for future +		// iterations; it's just that the way we clean depends on +		// the kind of pattern. + +		mergeSlashes := !strings.Contains(matchPattern, "//") + +		// if '%' appears in the match pattern, we interpret that to mean +		// the intent is to compare that part of the path in raw/escaped +		// space; i.e. "%40"=="%40", not "@", and "%2F"=="%2F", not "/" +		if strings.Contains(matchPattern, "%") { +			reqPathForPattern := CleanPath(r.URL.EscapedPath(), mergeSlashes) +			if m.matchPatternWithEscapeSequence(reqPathForPattern, matchPattern) {  				return true  			} + +			// doing prefix/suffix/substring matches doesn't make sense  			continue  		} -		// special case: first character is a wildcard, -		// treat it as a fast suffix match -		if strings.HasPrefix(matchPath, "*") { -			if strings.HasSuffix(lowerPath, matchPath[1:]) { +		reqPathForPattern := CleanPath(reqPath, mergeSlashes) + +		// for substring, prefix, and suffix matching, only perform those +		// special, fast matches if they are the only wildcards in the pattern; +		// otherwise we assume a globular match if any * appears in the middle + +		// special case: first and last characters are wildcard, +		// treat it as a fast substring match +		if strings.Count(matchPattern, "*") == 2 && +			strings.HasPrefix(matchPattern, "*") && +			strings.HasSuffix(matchPattern, "*") && +			strings.Count(matchPattern, "*") == 2 { +			if strings.Contains(reqPathForPattern, matchPattern[1:len(matchPattern)-1]) {  				return true  			}  			continue  		} -		// special case: last character is a wildcard, -		// treat it as a fast prefix match -		if strings.HasSuffix(matchPath, "*") { -			if strings.HasPrefix(lowerPath, matchPath[:len(matchPath)-1]) { -				return true +		// only perform prefix/suffix match if it is the only wildcard... +		// I think that is more correct most of the time +		if strings.Count(matchPattern, "*") == 1 { +			// special case: first character is a wildcard, +			// treat it as a fast suffix match +			if strings.HasPrefix(matchPattern, "*") { +				if strings.HasSuffix(reqPathForPattern, matchPattern[1:]) { +					return true +				} +				continue +			} + +			// special case: last character is a wildcard, +			// treat it as a fast prefix match +			if strings.HasSuffix(matchPattern, "*") { +				if strings.HasPrefix(reqPathForPattern, matchPattern[:len(matchPattern)-1]) { +					return true +				} +				continue  			} -			continue  		} -		// for everything else, try globular matching, which also -		// is exact matching if there are no glob/wildcard chars; -		// can ignore error here because we can't handle it anyway -		matches, _ := filepath.Match(matchPath, lowerPath) +		// at last, use globular matching, which also is exact matching +		// if there are no glob/wildcard chars; we ignore the error here +		// because we can't handle it anyway +		matches, _ := path.Match(matchPattern, reqPathForPattern)  		if matches {  			return true  		} @@ -427,11 +490,118 @@ func (m MatchPath) Match(r *http.Request) bool {  	return false  } +func (MatchPath) matchPatternWithEscapeSequence(escapedPath, matchPath string) bool { +	// We would just compare the pattern against r.URL.Path, +	// but the pattern contains %, indicating that we should +	// compare at least some part of the path in raw/escaped +	// space, not normalized space; so we build the string we +	// will compare against by adding the normalized parts +	// of the path, then switching to the escaped parts where +	// the pattern hints to us wherever % is present. +	var sb strings.Builder + +	// iterate the pattern and escaped path in lock-step; +	// increment iPattern every time we consume a char from the pattern, +	// increment iPath every time we consume a char from the path; +	// iPattern and iPath are our cursors/iterator positions for each string +	var iPattern, iPath int +	for { +		if iPattern >= len(matchPath) || iPath >= len(escapedPath) { +			break +		} + +		// get the next character from the request path + +		pathCh := string(escapedPath[iPath]) +		var escapedPathCh string + +		// normalize (decode) escape sequences +		if pathCh == "%" && len(escapedPath) >= iPath+3 { +			// hold onto this in case we find out the intent is to match in escaped space here; +			// we lowercase it even though technically the spec says: "For consistency, URI +			// producers and normalizers should use uppercase hexadecimal digits for all percent- +			// encodings" (RFC 3986 section 2.1) - we lowercased the matcher pattern earlier in +			// provisioning so we do the same here to gain case-insensitivity in equivalence; +			// besides, this string is never shown visibly +			escapedPathCh = strings.ToLower(escapedPath[iPath : iPath+3]) + +			var err error +			pathCh, err = url.PathUnescape(escapedPathCh) +			if err != nil { +				// should be impossible unless EscapedPath() is giving us an invalid sequence! +				return false +			} +			iPath += 2 // escape sequence is 2 bytes longer than normal char +		} + +		// now get the next character from the pattern + +		normalize := true +		switch matchPath[iPattern] { +		case '%': +			// escape sequence + +			// if not a wildcard ("%*"), compare literally; consume next two bytes of pattern +			if len(matchPath) >= iPattern+3 && matchPath[iPattern+1] != '*' { +				sb.WriteString(escapedPathCh) +				iPath++ +				iPattern += 2 +				break +			} + +			// escaped wildcard sequence; consume next byte only ('*') +			iPattern++ +			normalize = false + +			fallthrough +		case '*': +			// wildcard, so consume until next matching character +			remaining := escapedPath[iPath:] +			until := len(escapedPath) - iPath // go until end of string... +			if iPattern < len(matchPath)-1 {  // ...unless the * is not at the end +				nextCh := matchPath[iPattern+1] +				until = strings.IndexByte(remaining, nextCh) +				if until == -1 { +					// terminating char of wildcard span not found, so definitely no match +					return false +				} +			} +			if until == 0 { +				// empty span; nothing to add on this iteration +				break +			} +			next := remaining[:until] +			if normalize { +				var err error +				next, err = url.PathUnescape(next) +				if err != nil { +					return false // should be impossible anyway +				} +			} +			sb.WriteString(next) +			iPath += until +		default: +			sb.WriteString(pathCh) +			iPath++ +		} + +		iPattern++ +	} + +	// we can now treat rawpath globs (%*) as regular globs (*) +	matchPath = strings.ReplaceAll(matchPath, "%*", "*") + +	// ignore error here because we can't handle it anyway= +	matches, _ := path.Match(matchPath, sb.String()) +	return matches +} +  // CELLibrary produces options that expose this matcher for use in CEL  // expression matchers.  //  // Example: -//    expression path('*substring*', '*suffix') +// +//	expression path('*substring*', '*suffix')  func (MatchPath) CELLibrary(ctx caddy.Context) (cel.Library, error) {  	return CELMatcherImpl(  		// name of the macro, this is the function name that users see when writing expressions. @@ -477,23 +647,10 @@ func (MatchPathRE) CaddyModule() caddy.ModuleInfo {  func (m MatchPathRE) Match(r *http.Request) bool {  	repl := r.Context().Value(caddy.ReplacerCtxKey).(*caddy.Replacer) -	// PathUnescape returns an error if the escapes aren't -	// well-formed, meaning the count % matches the RFC. -	// Return early if the escape is improper. -	unescapedPath, err := url.PathUnescape(r.URL.Path) -	if err != nil { -		return false -	} -  	// Clean the path, merges doubled slashes, etc.  	// This ensures maliciously crafted requests can't bypass  	// the path matcher. See #4407 -	cleanedPath := path.Clean(unescapedPath) - -	// Cleaning may remove the trailing slash, but we want to keep it -	if cleanedPath != "/" && strings.HasSuffix(r.URL.Path, "/") { -		cleanedPath = cleanedPath + "/" -	} +	cleanedPath := cleanPath(r.URL.Path)  	return m.MatchRegexp.Match(cleanedPath, repl)  } @@ -502,7 +659,8 @@ func (m MatchPathRE) Match(r *http.Request) bool {  // expression matchers.  //  // Example: -//    expression path_regexp('^/bar') +// +//	expression path_regexp('^/bar')  func (MatchPathRE) CELLibrary(ctx caddy.Context) (cel.Library, error) {  	unnamedPattern, err := CELMatcherImpl(  		"path_regexp", @@ -575,7 +733,8 @@ func (m MatchMethod) Match(r *http.Request) bool {  // expression matchers.  //  // Example: -//    expression method('PUT', 'POST') +// +//	expression method('PUT', 'POST')  func (MatchMethod) CELLibrary(_ caddy.Context) (cel.Library, error) {  	return CELMatcherImpl(  		"method", @@ -661,7 +820,8 @@ func (m MatchQuery) Match(r *http.Request) bool {  // expression matchers.  //  // Example: -//    expression query({'sort': 'asc'}) || query({'foo': ['*bar*', 'baz']}) +// +//	expression query({'sort': 'asc'}) || query({'foo': ['*bar*', 'baz']})  func (MatchQuery) CELLibrary(_ caddy.Context) (cel.Library, error) {  	return CELMatcherImpl(  		"query", @@ -736,8 +896,9 @@ func (m MatchHeader) Match(r *http.Request) bool {  // expression matchers.  //  // Example: -//    expression header({'content-type': 'image/png'}) -//    expression header({'foo': ['bar', 'baz']}) // match bar or baz +// +//	expression header({'content-type': 'image/png'}) +//	expression header({'foo': ['bar', 'baz']}) // match bar or baz  func (MatchHeader) CELLibrary(_ caddy.Context) (cel.Library, error) {  	return CELMatcherImpl(  		"header", @@ -894,7 +1055,8 @@ func (m MatchHeaderRE) Validate() error {  // expression matchers.  //  // Example: -//    expression header_regexp('foo', 'Field', 'fo+') +// +//	expression header_regexp('foo', 'Field', 'fo+')  func (MatchHeaderRE) CELLibrary(ctx caddy.Context) (cel.Library, error) {  	unnamedPattern, err := CELMatcherImpl(  		"header_regexp", @@ -978,7 +1140,8 @@ func (m *MatchProtocol) UnmarshalCaddyfile(d *caddyfile.Dispenser) error {  // expression matchers.  //  // Example: -//    expression protocol('https') +// +//	expression protocol('https')  func (MatchProtocol) CELLibrary(_ caddy.Context) (cel.Library, error) {  	return CELMatcherImpl(  		"protocol", @@ -1097,7 +1260,8 @@ func (m *MatchRemoteIP) UnmarshalCaddyfile(d *caddyfile.Dispenser) error {  // expression matchers.  //  // Example: -//    expression remote_ip('forwarded', '192.168.0.0/16', '172.16.0.0/12', '10.0.0.0/8') +// +//	expression remote_ip('forwarded', '192.168.0.0/16', '172.16.0.0/12', '10.0.0.0/8')  func (MatchRemoteIP) CELLibrary(ctx caddy.Context) (cel.Library, error) {  	return CELMatcherImpl(  		// name of the macro, this is the function name that users see when writing expressions. | 
