author    Matt Holt <mholt@users.noreply.github.com>    2022-08-16 08:48:57 -0600
committer GitHub <noreply@github.com>                   2022-08-16 08:48:57 -0600
commit    a479943acd70068c4b80d3a8f4b8dd7ab93ca2ba (patch)
tree      f50a45b3b0c8c8475b783583967c1175f5e6673c
parent    dc62d468e9645f52a5e1b4f6093dff65137ab3fe (diff)
caddyhttp: Smarter path matching and rewriting (#4948)
Co-authored-by: RussellLuo <luopeng.he@gmail.com>
-rw-r--r--  modules/caddyhttp/caddyhttp.go               35
-rw-r--r--  modules/caddyhttp/caddyhttp_test.go          57
-rw-r--r--  modules/caddyhttp/fileserver/staticfiles.go  12
-rw-r--r--  modules/caddyhttp/matchers.go               318
-rw-r--r--  modules/caddyhttp/matchers_test.go          153
-rw-r--r--  modules/caddyhttp/rewrite/rewrite.go        110
-rw-r--r--  modules/caddyhttp/rewrite/rewrite_test.go    41
7 files changed, 614 insertions, 112 deletions
diff --git a/modules/caddyhttp/caddyhttp.go b/modules/caddyhttp/caddyhttp.go
index 784b2b9..c9cc9e6 100644
--- a/modules/caddyhttp/caddyhttp.go
+++ b/modules/caddyhttp/caddyhttp.go
@@ -20,6 +20,7 @@ import (
"io"
"net"
"net/http"
+ "path"
"path/filepath"
"strconv"
"strings"
@@ -244,6 +245,40 @@ func SanitizedPathJoin(root, reqPath string) string {
return path
}
+// CleanPath cleans path p according to path.Clean(), but only
+// merges repeated slashes if collapseSlashes is true, and always
+// preserves trailing slashes.
+func CleanPath(p string, collapseSlashes bool) string {
+ if collapseSlashes {
+ return cleanPath(p)
+ }
+
+ // insert an invalid/impossible URI character between each pair of
+ // consecutive slashes so that empty path segments are not collapsed;
+ // then clean the path as usual, and finally remove the temporary characters.
+ const tmpCh = 0xff
+ var sb strings.Builder
+ for i, ch := range p {
+ if ch == '/' && i > 0 && p[i-1] == '/' {
+ sb.WriteByte(tmpCh)
+ }
+ sb.WriteRune(ch)
+ }
+ halfCleaned := cleanPath(sb.String())
+ halfCleaned = strings.ReplaceAll(halfCleaned, string([]byte{tmpCh}), "")
+
+ return halfCleaned
+}
+
+// cleanPath does path.Clean(p) but preserves any trailing slash.
+func cleanPath(p string) string {
+ cleaned := path.Clean(p)
+ if cleaned != "/" && strings.HasSuffix(p, "/") {
+ cleaned = cleaned + "/"
+ }
+ return cleaned
+}
+
// tlsPlaceholderWrapper is a no-op listener wrapper that marks
// where the TLS listener should be in a chain of listener wrappers.
// It should only be used if another listener wrapper must be placed
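The CleanPath/cleanPath pair added above can be exercised on its own; a minimal sketch (assuming the usual Caddy v2 import path), with expected outputs taken from the TestCleanPath cases added in this commit:

    package main

    import (
        "fmt"

        "github.com/caddyserver/caddy/v2/modules/caddyhttp"
    )

    func main() {
        // collapseSlashes=true behaves like path.Clean but keeps the trailing slash
        fmt.Println(caddyhttp.CleanPath("/foo//bar/", true)) // "/foo/bar/"

        // collapseSlashes=false keeps empty path segments ("//") intact while
        // still resolving "." and ".." elements
        fmt.Println(caddyhttp.CleanPath("//foo", false))             // "//foo"
        fmt.Println(caddyhttp.CleanPath("/foo//./..//./bar", false)) // "/foo//bar"
    }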
diff --git a/modules/caddyhttp/caddyhttp_test.go b/modules/caddyhttp/caddyhttp_test.go
index 09011fe..1bca4d6 100644
--- a/modules/caddyhttp/caddyhttp_test.go
+++ b/modules/caddyhttp/caddyhttp_test.go
@@ -92,3 +92,60 @@ func TestSanitizedPathJoin(t *testing.T) {
}
}
}
+
+func TestCleanPath(t *testing.T) {
+ for i, tc := range []struct {
+ input string
+ mergeSlashes bool
+ expect string
+ }{
+ {
+ input: "/foo",
+ expect: "/foo",
+ },
+ {
+ input: "/foo/",
+ expect: "/foo/",
+ },
+ {
+ input: "//foo",
+ expect: "//foo",
+ },
+ {
+ input: "//foo",
+ mergeSlashes: true,
+ expect: "/foo",
+ },
+ {
+ input: "/foo//bar/",
+ mergeSlashes: true,
+ expect: "/foo/bar/",
+ },
+ {
+ input: "/foo/./.././bar",
+ expect: "/bar",
+ },
+ {
+ input: "/foo//./..//./bar",
+ expect: "/foo//bar",
+ },
+ {
+ input: "/foo///./..//./bar",
+ expect: "/foo///bar",
+ },
+ {
+ input: "/foo///./..//.",
+ expect: "/foo//",
+ },
+ {
+ input: "/foo//./bar",
+ expect: "/foo//bar",
+ },
+ } {
+ actual := CleanPath(tc.input, tc.mergeSlashes)
+ if actual != tc.expect {
+ t.Errorf("Test %d [input='%s' mergeSlashes=%t]: Got '%s', expected '%s'",
+ i, tc.input, tc.mergeSlashes, actual, tc.expect)
+ }
+ }
+}
diff --git a/modules/caddyhttp/fileserver/staticfiles.go b/modules/caddyhttp/fileserver/staticfiles.go
index a745d5a..554f8d3 100644
--- a/modules/caddyhttp/fileserver/staticfiles.go
+++ b/modules/caddyhttp/fileserver/staticfiles.go
@@ -23,7 +23,6 @@ import (
weakrand "math/rand"
"mime"
"net/http"
- "net/url"
"os"
"path"
"path/filepath"
@@ -236,16 +235,7 @@ func (fsrv *FileServer) ServeHTTP(w http.ResponseWriter, r *http.Request, next c
filesToHide := fsrv.transformHidePaths(repl)
root := repl.ReplaceAll(fsrv.Root, ".")
- // PathUnescape returns an error if the escapes aren't well-formed,
- // meaning the count % matches the RFC. Return early if the escape is
- // improper.
- if _, err := url.PathUnescape(r.URL.Path); err != nil {
- fsrv.logger.Debug("improper path escape",
- zap.String("site_root", root),
- zap.String("request_path", r.URL.Path),
- zap.Error(err))
- return err
- }
+
filename := caddyhttp.SanitizedPathJoin(root, r.URL.Path)
fsrv.logger.Debug("sanitized path join",
diff --git a/modules/caddyhttp/matchers.go b/modules/caddyhttp/matchers.go
index bb957d6..d6bbe7e 100644
--- a/modules/caddyhttp/matchers.go
+++ b/modules/caddyhttp/matchers.go
@@ -23,7 +23,6 @@ import (
"net/textproto"
"net/url"
"path"
- "path/filepath"
"reflect"
"regexp"
"sort"
@@ -63,20 +62,51 @@ type (
// Duplicate entries will return an error.
MatchHost []string
- // MatchPath matches requests by the URI's path (case-insensitive). Path
- // matches are exact, but wildcards may be used:
+ // MatchPath case-insensitively matches requests by the URI's path. Path
+ // matching is exact, not prefix-based, giving you more control and clarity
+ // over matching. Wildcards (`*`) may be used:
//
- // - At the end, for a prefix match (`/prefix/*`)
- // - At the beginning, for a suffix match (`*.suffix`)
- // - On both sides, for a substring match (`*/contains/*`)
+ // - At the end only, for a prefix match (`/prefix/*`)
+ // - At the beginning only, for a suffix match (`*.suffix`)
+ // - On both sides only, for a substring match (`*/contains/*`)
// - In the middle, for a globular match (`/accounts/*/info`)
//
+ // Slashes are significant; i.e. `/foo*` matches `/foo`, `/foo/`, `/foo/bar`,
+ // and `/foobar`; but `/foo/*` does not match `/foo` or `/foobar`. Valid
+ // paths start with a slash `/`.
+ //
+ // Because there are, in general, multiple possible escaped forms of any
+ // path, path matchers operate in unescaped space; that is, path matchers
+ // should be written in their unescaped form to prevent ambiguities and
+ // possible security issues, as all request paths will be normalized to
+ // their unescaped forms before matcher evaluation.
+ //
+ // However, escape sequences in a match pattern are supported; they are
+ // compared with the request's raw/escaped path for those bytes only.
+ // In other words, a matcher of `/foo%2Fbar` will match a request path
+ // of precisely `/foo%2Fbar`, but not `/foo/bar`. It follows that matching
+ // the literal percent sign (%) in normalized space can be done using the
+ // escaped form, `%25`.
+ //
+ // Even though wildcards (`*`) operate in the normalized space, the special
+ // escaped wildcard (`%*`), which is not a valid escape sequence, may be
+ // used in place of a span that should NOT be decoded; that is,
+ // `/bands/%*/` will match `/bands/AC%2fDC/` whereas `/bands/*/` will not.
+ //
// This matcher is fast, so it does not support regular expressions or
// capture groups. For slower but more powerful matching, use the
- // path_regexp matcher.
+ // path_regexp matcher. (Note that due to the special treatment of
+ // escape sequences in matcher patterns, those patterns may be slightly
+ // slower to evaluate in high-traffic environments.)
MatchPath []string
// MatchPathRE matches requests by a regular expression on the URI's path.
+ // Path matching is performed in the unescaped (decoded) form of the path.
//
// Upon a match, it adds placeholders to the request: `{http.regexp.name.capture_group}`
// where `name` is the regular expression's name, and `capture_group` is either
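To make the wildcard and `%*` semantics documented above concrete, here is a small self-contained sketch; the matches helper is hypothetical glue written for this illustration, and the expected results mirror the path matcher tests added in this commit:

    package main

    import (
        "context"
        "fmt"
        "net/http"
        "net/url"

        "github.com/caddyserver/caddy/v2"
        "github.com/caddyserver/caddy/v2/modules/caddyhttp"
    )

    // matches provisions a single-pattern path matcher and evaluates it against
    // an escaped request path, the same way the matcher tests in this commit do.
    func matches(pattern, requestPath string) bool {
        m := caddyhttp.MatchPath{pattern}
        if err := m.Provision(caddy.Context{}); err != nil {
            return false
        }
        u, err := url.ParseRequestURI(requestPath)
        if err != nil {
            return false
        }
        req := &http.Request{URL: u}
        ctx := context.WithValue(req.Context(), caddy.ReplacerCtxKey, caddy.NewReplacer())
        return m.Match(req.WithContext(ctx))
    }

    func main() {
        fmt.Println(matches("/foo/*", "/foo"))                       // false: slashes are significant
        fmt.Println(matches("/bands/*/*", "/bands/AC%2FDC/T.N.T"))   // false: * sees the decoded path, which contains a slash
        fmt.Println(matches("/bands/%*/%*", "/bands/AC%2FDC/T.N.T")) // true: %* spans the raw, un-decoded "AC%2FDC"
        fmt.Println(matches("/bands/%*", "/bands/AC/DC"))            // false: the span contains a real slash
    }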
@@ -303,7 +333,8 @@ outer:
// expression matchers.
//
// Example:
-// expression host('localhost')
+//
+// expression host('localhost')
func (MatchHost) CELLibrary(ctx caddy.Context) (cel.Library, error) {
return CELMatcherImpl(
"host",
@@ -342,6 +373,7 @@ func (MatchPath) CaddyModule() caddy.ModuleInfo {
// Provision lower-cases the paths in m to ensure case-insensitive matching.
func (m MatchPath) Provision(_ caddy.Context) error {
for i := range m {
+ // TODO: if m[i] == "*", put it first and delete all others (will always match)
m[i] = strings.ToLower(m[i])
}
return nil
@@ -349,77 +381,108 @@ func (m MatchPath) Provision(_ caddy.Context) error {
// Match returns true if r matches m.
func (m MatchPath) Match(r *http.Request) bool {
- // PathUnescape returns an error if the escapes aren't
- // well-formed, meaning the count % matches the RFC.
- // Return early if the escape is improper.
- unescapedPath, err := url.PathUnescape(r.URL.Path)
- if err != nil {
- return false
- }
-
- lowerPath := strings.ToLower(unescapedPath)
-
- // Clean the path, merges doubled slashes, etc.
- // This ensures maliciously crafted requests can't bypass
- // the path matcher. See #4407
- lowerPath = path.Clean(lowerPath)
-
- // see #2917; Windows ignores trailing dots and spaces
+ // Even though RFC 9110 says that path matching is case-sensitive
+ // (https://www.rfc-editor.org/rfc/rfc9110.html#section-4.2.3),
+ // we do case-insensitive matching to mitigate security issues
+ // related to differences between operating systems, applications,
+ // etc; if case-sensitive matching is needed, the regex matcher
+ // can be used instead.
+ reqPath := strings.ToLower(r.URL.Path)
+
+ // See #2917; Windows ignores trailing dots and spaces
// when accessing files (sigh), potentially causing a
// security risk (cry) if PHP files end up being served
// as static files, exposing the source code, instead of
- // being matched by *.php to be treated as PHP scripts
- lowerPath = strings.TrimRight(lowerPath, ". ")
-
- // Cleaning may remove the trailing slash, but we want to keep it
- if lowerPath != "/" && strings.HasSuffix(r.URL.Path, "/") {
- lowerPath = lowerPath + "/"
- }
+ // being matched by *.php to be treated as PHP scripts.
+ reqPath = strings.TrimRight(reqPath, ". ")
repl := r.Context().Value(caddy.ReplacerCtxKey).(*caddy.Replacer)
- for _, matchPath := range m {
- matchPath = repl.ReplaceAll(matchPath, "")
+ for _, matchPattern := range m {
+ matchPattern = repl.ReplaceAll(matchPattern, "")
// special case: whole path is wildcard; this is unnecessary
// as it matches all requests, which is the same as no matcher
- if matchPath == "*" {
+ if matchPattern == "*" {
return true
}
- // special case: first and last characters are wildcard,
- // treat it as a fast substring match
- if len(matchPath) > 1 &&
- strings.HasPrefix(matchPath, "*") &&
- strings.HasSuffix(matchPath, "*") {
- if strings.Contains(lowerPath, matchPath[1:len(matchPath)-1]) {
+ // Clean the path, merge doubled slashes, etc.
+ // This ensures maliciously crafted requests can't bypass
+ // the path matcher. See #4407. Good security posture
+ // requires that we do all we can to reduce any
+ // funny-looking paths into "normalized" forms such that
+ // weird variants can't sneak by.
+ //
+ // How we clean the path depends on the kind of pattern:
+ // we either merge slashes or we don't. If the pattern
+ // has double slashes, we preserve them in the path.
+ //
+ // TODO: Despite the fact that the *vast* majority of path
+ // matchers have only 1 pattern, a possible optimization is
+ // to remember the cleaned form of the path for future
+ // iterations; it's just that the way we clean depends on
+ // the kind of pattern.
+
+ mergeSlashes := !strings.Contains(matchPattern, "//")
+
+ // if '%' appears in the match pattern, we interpret that to mean
+ // the intent is to compare that part of the path in raw/escaped
+ // space; i.e. "%40"=="%40", not "@", and "%2F"=="%2F", not "/"
+ if strings.Contains(matchPattern, "%") {
+ reqPathForPattern := CleanPath(r.URL.EscapedPath(), mergeSlashes)
+ if m.matchPatternWithEscapeSequence(reqPathForPattern, matchPattern) {
return true
}
+
+ // doing prefix/suffix/substring matches doesn't make sense
continue
}
- // special case: first character is a wildcard,
- // treat it as a fast suffix match
- if strings.HasPrefix(matchPath, "*") {
- if strings.HasSuffix(lowerPath, matchPath[1:]) {
+ reqPathForPattern := CleanPath(reqPath, mergeSlashes)
+
+ // for substring, prefix, and suffix matching, only perform those
+ // special, fast matches if they are the only wildcards in the pattern;
+ // otherwise we assume a globular match if any * appears in the middle
+
+ // special case: first and last characters are wildcard,
+ // treat it as a fast substring match
+ if strings.Count(matchPattern, "*") == 2 &&
+ strings.HasPrefix(matchPattern, "*") &&
+ strings.HasSuffix(matchPattern, "*") {
+ if strings.Contains(reqPathForPattern, matchPattern[1:len(matchPattern)-1]) {
return true
}
continue
}
- // special case: last character is a wildcard,
- // treat it as a fast prefix match
- if strings.HasSuffix(matchPath, "*") {
- if strings.HasPrefix(lowerPath, matchPath[:len(matchPath)-1]) {
- return true
+ // only perform prefix/suffix match if it is the only wildcard...
+ // I think that is more correct most of the time
+ if strings.Count(matchPattern, "*") == 1 {
+ // special case: first character is a wildcard,
+ // treat it as a fast suffix match
+ if strings.HasPrefix(matchPattern, "*") {
+ if strings.HasSuffix(reqPathForPattern, matchPattern[1:]) {
+ return true
+ }
+ continue
+ }
+
+ // special case: last character is a wildcard,
+ // treat it as a fast prefix match
+ if strings.HasSuffix(matchPattern, "*") {
+ if strings.HasPrefix(reqPathForPattern, matchPattern[:len(matchPattern)-1]) {
+ return true
+ }
+ continue
}
- continue
}
- // for everything else, try globular matching, which also
- // is exact matching if there are no glob/wildcard chars;
- // can ignore error here because we can't handle it anyway
- matches, _ := filepath.Match(matchPath, lowerPath)
+ // at last, use globular matching, which also is exact matching
+ // if there are no glob/wildcard chars; we ignore the error here
+ // because we can't handle it anyway
+ matches, _ := path.Match(matchPattern, reqPathForPattern)
if matches {
return true
}
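The mergeSlashes decision above means the pattern itself controls whether empty path segments are collapsed before comparison. Using the hypothetical matches helper from the earlier sketch, the new test cases play out as follows:

    // continuing the sketch above (same imports and matches helper)
    func exampleSlashMerging() {
        fmt.Println(matches("/foo", "//foo"))          // true: the pattern has no "//", so repeated slashes are merged
        fmt.Println(matches("//foo", "/foo"))          // false: the pattern demands an empty path segment
        fmt.Println(matches("/foo//*", "/foo/%2Fbar")) // true: "//" in the pattern preserves the empty segment, and %2F decodes to a slash
    }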
@@ -427,11 +490,118 @@ func (m MatchPath) Match(r *http.Request) bool {
return false
}
+func (MatchPath) matchPatternWithEscapeSequence(escapedPath, matchPath string) bool {
+ // We would just compare the pattern against r.URL.Path,
+ // but the pattern contains %, indicating that we should
+ // compare at least some part of the path in raw/escaped
+ // space, not normalized space; so we build the string we
+ // will compare against by adding the normalized parts
+ // of the path, then switching to the escaped parts where
+ // the pattern hints to us wherever % is present.
+ var sb strings.Builder
+
+ // iterate the pattern and escaped path in lock-step;
+ // increment iPattern every time we consume a char from the pattern,
+ // increment iPath every time we consume a char from the path;
+ // iPattern and iPath are our cursors/iterator positions for each string
+ var iPattern, iPath int
+ for {
+ if iPattern >= len(matchPath) || iPath >= len(escapedPath) {
+ break
+ }
+
+ // get the next character from the request path
+
+ pathCh := string(escapedPath[iPath])
+ var escapedPathCh string
+
+ // normalize (decode) escape sequences
+ if pathCh == "%" && len(escapedPath) >= iPath+3 {
+ // hold onto this in case we find out the intent is to match in escaped space here;
+ // we lowercase it even though technically the spec says: "For consistency, URI
+ // producers and normalizers should use uppercase hexadecimal digits for all percent-
+ // encodings" (RFC 3986 section 2.1) - we lowercased the matcher pattern earlier in
+ // provisioning so we do the same here to gain case-insensitivity in equivalence;
+ // besides, this string is never shown visibly
+ escapedPathCh = strings.ToLower(escapedPath[iPath : iPath+3])
+
+ var err error
+ pathCh, err = url.PathUnescape(escapedPathCh)
+ if err != nil {
+ // should be impossible unless EscapedPath() is giving us an invalid sequence!
+ return false
+ }
+ iPath += 2 // escape sequence is 2 bytes longer than normal char
+ }
+
+ // now get the next character from the pattern
+
+ normalize := true
+ switch matchPath[iPattern] {
+ case '%':
+ // escape sequence
+
+ // if not a wildcard ("%*"), compare literally; consume next two bytes of pattern
+ if len(matchPath) >= iPattern+3 && matchPath[iPattern+1] != '*' {
+ sb.WriteString(escapedPathCh)
+ iPath++
+ iPattern += 2
+ break
+ }
+
+ // escaped wildcard sequence; consume next byte only ('*')
+ iPattern++
+ normalize = false
+
+ fallthrough
+ case '*':
+ // wildcard, so consume until next matching character
+ remaining := escapedPath[iPath:]
+ until := len(escapedPath) - iPath // go until end of string...
+ if iPattern < len(matchPath)-1 { // ...unless the * is not at the end
+ nextCh := matchPath[iPattern+1]
+ until = strings.IndexByte(remaining, nextCh)
+ if until == -1 {
+ // terminating char of wildcard span not found, so definitely no match
+ return false
+ }
+ }
+ if until == 0 {
+ // empty span; nothing to add on this iteration
+ break
+ }
+ next := remaining[:until]
+ if normalize {
+ var err error
+ next, err = url.PathUnescape(next)
+ if err != nil {
+ return false // should be impossible anyway
+ }
+ }
+ sb.WriteString(next)
+ iPath += until
+ default:
+ sb.WriteString(pathCh)
+ iPath++
+ }
+
+ iPattern++
+ }
+
+ // we can now treat rawpath globs (%*) as regular globs (*)
+ matchPath = strings.ReplaceAll(matchPath, "%*", "*")
+
+ // ignore error here because we can't handle it anyway
+ matches, _ := path.Match(matchPath, sb.String())
+ return matches
+}
+
// CELLibrary produces options that expose this matcher for use in CEL
// expression matchers.
//
// Example:
-// expression path('*substring*', '*suffix')
+//
+// expression path('*substring*', '*suffix')
func (MatchPath) CELLibrary(ctx caddy.Context) (cel.Library, error) {
return CELMatcherImpl(
// name of the macro, this is the function name that users see when writing expressions.
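Concretely, and again using the hypothetical matches helper from the earlier sketch, the escape-sequence handling implemented by matchPatternWithEscapeSequence lines up with the new test cases:

    // continuing the sketch above (same imports and matches helper)
    func exampleEscapeSequences() {
        fmt.Println(matches("/foo%2fbar/baz", "/foo%2Fbar/baz")) // true: the escaped bytes are compared in raw space, case-insensitively
        fmt.Println(matches("/foo%2fbar/baz", "/foo/bar/baz"))   // false: a literal slash where the pattern requires %2F
        fmt.Println(matches("/foo/bar/baz", "/foo%2fbar/baz"))   // true: a plain pattern matches in normalized (decoded) space
        fmt.Println(matches("/%25@.txt", "/%25%40.txt"))         // true: %25 matches a literal "%" after normalization
    }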
@@ -477,23 +647,10 @@ func (MatchPathRE) CaddyModule() caddy.ModuleInfo {
func (m MatchPathRE) Match(r *http.Request) bool {
repl := r.Context().Value(caddy.ReplacerCtxKey).(*caddy.Replacer)
- // PathUnescape returns an error if the escapes aren't
- // well-formed, meaning the count % matches the RFC.
- // Return early if the escape is improper.
- unescapedPath, err := url.PathUnescape(r.URL.Path)
- if err != nil {
- return false
- }
-
// Clean the path, merges doubled slashes, etc.
// This ensures maliciously crafted requests can't bypass
// the path matcher. See #4407
- cleanedPath := path.Clean(unescapedPath)
-
- // Cleaning may remove the trailing slash, but we want to keep it
- if cleanedPath != "/" && strings.HasSuffix(r.URL.Path, "/") {
- cleanedPath = cleanedPath + "/"
- }
+ cleanedPath := cleanPath(r.URL.Path)
return m.MatchRegexp.Match(cleanedPath, repl)
}
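Because the regexp matcher now receives the decoded (and cleaned) path, patterns are written against the unescaped form. A brief sketch mirroring the new path_regexp test cases (the request setup is the same hypothetical glue as above):

    // continuing the sketch above (same imports)
    func examplePathRegexp() {
        re := caddyhttp.MatchPathRE{MatchRegexp: caddyhttp.MatchRegexp{Pattern: "^/%@.txt"}}
        if err := re.Provision(caddy.Context{}); err != nil { // compiles the pattern
            panic(err)
        }

        u, _ := url.ParseRequestURI("/%25@.txt") // decodes to "/%@.txt"
        req := &http.Request{URL: u}
        ctx := context.WithValue(req.Context(), caddy.ReplacerCtxKey, caddy.NewReplacer())

        fmt.Println(re.Match(req.WithContext(ctx))) // true: the regexp sees the decoded "/%@.txt"
        // a pattern of "^/%25@.txt" would NOT match this request, per the new tests
    }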
@@ -502,7 +659,8 @@ func (m MatchPathRE) Match(r *http.Request) bool {
// expression matchers.
//
// Example:
-// expression path_regexp('^/bar')
+//
+// expression path_regexp('^/bar')
func (MatchPathRE) CELLibrary(ctx caddy.Context) (cel.Library, error) {
unnamedPattern, err := CELMatcherImpl(
"path_regexp",
@@ -575,7 +733,8 @@ func (m MatchMethod) Match(r *http.Request) bool {
// expression matchers.
//
// Example:
-// expression method('PUT', 'POST')
+//
+// expression method('PUT', 'POST')
func (MatchMethod) CELLibrary(_ caddy.Context) (cel.Library, error) {
return CELMatcherImpl(
"method",
@@ -661,7 +820,8 @@ func (m MatchQuery) Match(r *http.Request) bool {
// expression matchers.
//
// Example:
-// expression query({'sort': 'asc'}) || query({'foo': ['*bar*', 'baz']})
+//
+// expression query({'sort': 'asc'}) || query({'foo': ['*bar*', 'baz']})
func (MatchQuery) CELLibrary(_ caddy.Context) (cel.Library, error) {
return CELMatcherImpl(
"query",
@@ -736,8 +896,9 @@ func (m MatchHeader) Match(r *http.Request) bool {
// expression matchers.
//
// Example:
-// expression header({'content-type': 'image/png'})
-// expression header({'foo': ['bar', 'baz']}) // match bar or baz
+//
+// expression header({'content-type': 'image/png'})
+// expression header({'foo': ['bar', 'baz']}) // match bar or baz
func (MatchHeader) CELLibrary(_ caddy.Context) (cel.Library, error) {
return CELMatcherImpl(
"header",
@@ -894,7 +1055,8 @@ func (m MatchHeaderRE) Validate() error {
// expression matchers.
//
// Example:
-// expression header_regexp('foo', 'Field', 'fo+')
+//
+// expression header_regexp('foo', 'Field', 'fo+')
func (MatchHeaderRE) CELLibrary(ctx caddy.Context) (cel.Library, error) {
unnamedPattern, err := CELMatcherImpl(
"header_regexp",
@@ -978,7 +1140,8 @@ func (m *MatchProtocol) UnmarshalCaddyfile(d *caddyfile.Dispenser) error {
// expression matchers.
//
// Example:
-// expression protocol('https')
+//
+// expression protocol('https')
func (MatchProtocol) CELLibrary(_ caddy.Context) (cel.Library, error) {
return CELMatcherImpl(
"protocol",
@@ -1097,7 +1260,8 @@ func (m *MatchRemoteIP) UnmarshalCaddyfile(d *caddyfile.Dispenser) error {
// expression matchers.
//
// Example:
-// expression remote_ip('forwarded', '192.168.0.0/16', '172.16.0.0/12', '10.0.0.0/8')
+//
+// expression remote_ip('forwarded', '192.168.0.0/16', '172.16.0.0/12', '10.0.0.0/8')
func (MatchRemoteIP) CELLibrary(ctx caddy.Context) (cel.Library, error) {
return CELMatcherImpl(
// name of the macro, this is the function name that users see when writing expressions.
diff --git a/modules/caddyhttp/matchers_test.go b/modules/caddyhttp/matchers_test.go
index bd4606b..4d5538c 100644
--- a/modules/caddyhttp/matchers_test.go
+++ b/modules/caddyhttp/matchers_test.go
@@ -158,9 +158,10 @@ func TestHostMatcher(t *testing.T) {
func TestPathMatcher(t *testing.T) {
for i, tc := range []struct {
- match MatchPath
- input string
- expect bool
+ match MatchPath // not URI-encoded because not parsing from a URI
+ input string // should be valid URI encoding (escaped) since it will become part of a request
+ expect bool
+ provisionErr bool
}{
{
match: MatchPath{},
@@ -253,6 +254,11 @@ func TestPathMatcher(t *testing.T) {
expect: true,
},
{
+ match: MatchPath{"*.php"},
+ input: "/foo/index.php. .",
+ expect: true,
+ },
+ {
match: MatchPath{"/foo/bar.txt"},
input: "/foo/BAR.txt",
expect: true,
@@ -263,11 +269,61 @@ func TestPathMatcher(t *testing.T) {
expect: true,
},
{
- match: MatchPath{"/foo*"},
+ match: MatchPath{"/foo"},
input: "//foo",
expect: true,
},
{
+ match: MatchPath{"//foo"},
+ input: "/foo",
+ expect: false,
+ },
+ {
+ match: MatchPath{"//foo"},
+ input: "//foo",
+ expect: true,
+ },
+ {
+ match: MatchPath{"/foo//*"},
+ input: "/foo//bar",
+ expect: true,
+ },
+ {
+ match: MatchPath{"/foo//*"},
+ input: "/foo/%2Fbar",
+ expect: true,
+ },
+ {
+ match: MatchPath{"/foo/%2F*"},
+ input: "/foo/%2Fbar",
+ expect: true,
+ },
+ {
+ match: MatchPath{"/foo/%2F*"},
+ input: "/foo//bar",
+ expect: false,
+ },
+ {
+ match: MatchPath{"/foo//bar"},
+ input: "/foo//bar",
+ expect: true,
+ },
+ {
+ match: MatchPath{"/foo/*//bar"},
+ input: "/foo///bar",
+ expect: true,
+ },
+ {
+ match: MatchPath{"/foo/%*//bar"},
+ input: "/foo///bar",
+ expect: true,
+ },
+ {
+ match: MatchPath{"/foo/%*//bar"},
+ input: "/foo//%2Fbar",
+ expect: true,
+ },
+ {
match: MatchPath{"/foo*"},
input: "/%2F/foo",
expect: true,
@@ -292,8 +348,79 @@ func TestPathMatcher(t *testing.T) {
input: "/foo/bar",
expect: true,
},
+ // notice these next three test cases are the same normalized path but are written differently
+ {
+ match: MatchPath{"/%25@.txt"},
+ input: "/%25@.txt",
+ expect: true,
+ },
+ {
+ match: MatchPath{"/%25@.txt"},
+ input: "/%25%40.txt",
+ expect: true,
+ },
+ {
+ match: MatchPath{"/%25%40.txt"},
+ input: "/%25%40.txt",
+ expect: true,
+ },
+ {
+ match: MatchPath{"/bands/*/*"},
+ input: "/bands/AC%2FDC/T.N.T",
+ expect: false, // because * operates in normalized space
+ },
+ {
+ match: MatchPath{"/bands/%*/%*"},
+ input: "/bands/AC%2FDC/T.N.T",
+ expect: true,
+ },
+ {
+ match: MatchPath{"/bands/%*/%*"},
+ input: "/bands/AC/DC/T.N.T",
+ expect: false,
+ },
+ {
+ match: MatchPath{"/bands/%*"},
+ input: "/bands/AC/DC",
+ expect: false, // not a suffix match
+ },
+ {
+ match: MatchPath{"/bands/%*"},
+ input: "/bands/AC%2FDC",
+ expect: true,
+ },
+ {
+ match: MatchPath{"/foo%2fbar/baz"},
+ input: "/foo%2Fbar/baz",
+ expect: true,
+ },
+ {
+ match: MatchPath{"/foo%2fbar/baz"},
+ input: "/foo/bar/baz",
+ expect: false,
+ },
+ {
+ match: MatchPath{"/foo/bar/baz"},
+ input: "/foo%2fbar/baz",
+ expect: true,
+ },
} {
- req := &http.Request{URL: &url.URL{Path: tc.input}}
+ err := tc.match.Provision(caddy.Context{})
+ if err == nil && tc.provisionErr {
+ t.Errorf("Test %d %v: Expected error provisioning, but there was no error", i, tc.match)
+ }
+ if err != nil && !tc.provisionErr {
+ t.Errorf("Test %d %v: Expected no error provisioning, but there was an error: %v", i, tc.match, err)
+ }
+ if tc.provisionErr {
+ continue // if it's not supposed to provision properly, pointless to test it
+ }
+
+ u, err := url.ParseRequestURI(tc.input)
+ if err != nil {
+ t.Fatalf("Test %d (%v): Invalid request URI (should be rejected by Go's HTTP server): %v", i, tc.input, err)
+ }
+ req := &http.Request{URL: u}
repl := caddy.NewReplacer()
ctx := context.WithValue(req.Context(), caddy.ReplacerCtxKey, repl)
req = req.WithContext(ctx)
@@ -387,6 +514,16 @@ func TestPathREMatcher(t *testing.T) {
expect: true,
expectRepl: map[string]string{"name.myparam": "bar"},
},
+ {
+ match: MatchPathRE{MatchRegexp{Pattern: "^/%@.txt"}},
+ input: "/%25@.txt",
+ expect: true,
+ },
+ {
+ match: MatchPathRE{MatchRegexp{Pattern: "^/%25@.txt"}},
+ input: "/%25@.txt",
+ expect: false,
+ },
} {
// compile the regexp and validate its name
err := tc.match.Provision(caddy.Context{})
@@ -401,7 +538,11 @@ func TestPathREMatcher(t *testing.T) {
}
// set up the fake request and its Replacer
- req := &http.Request{URL: &url.URL{Path: tc.input}}
+ u, err := url.ParseRequestURI(tc.input)
+ if err != nil {
+ t.Fatalf("Test %d: Bad input URI: %v", i, err)
+ }
+ req := &http.Request{URL: u}
repl := caddy.NewReplacer()
ctx := context.WithValue(req.Context(), caddy.ReplacerCtxKey, repl)
req = req.WithContext(ctx)
diff --git a/modules/caddyhttp/rewrite/rewrite.go b/modules/caddyhttp/rewrite/rewrite.go
index 7da2327..4316b5a 100644
--- a/modules/caddyhttp/rewrite/rewrite.go
+++ b/modules/caddyhttp/rewrite/rewrite.go
@@ -31,13 +31,25 @@ func init() {
caddy.RegisterModule(Rewrite{})
}
-// Rewrite is a middleware which can rewrite HTTP requests.
+// Rewrite is a middleware which can rewrite/mutate HTTP requests.
//
-// The Method and URI properties are "setters": the request URI
-// will be set to the given values. Other properties are "modifiers":
-// they modify existing files but do not explicitly specify what the
-// result will be. It is atypical to combine the use of setters and
+// The Method and URI properties are "setters" (the request URI
+// will be overwritten with the given values). Other properties are
+// "modifiers" (they modify existing values in a differentiable
+// way). It is atypical to combine the use of setters and
// modifiers in a single rewrite.
+//
+// To ensure consistent behavior, prefix and suffix stripping is
+// performed in the URL-decoded (unescaped, normalized) space by
+// default except for the specific bytes where an escape sequence
+// is used in the prefix or suffix pattern.
+//
+// For all modifiers, paths are cleaned before being modified so that
+// multiple, consecutive slashes are collapsed into a single slash,
+// and dot elements are resolved and removed. In the special case
+// of a prefix, suffix, or substring containing "//" (repeated slashes),
+// slashes will not be merged while cleaning the path so that
+// the rewrite can be interpreted literally.
type Rewrite struct {
// Changes the request's HTTP verb.
Method string `json:"method,omitempty"`
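The escape-aware prefix stripping described above can be observed by calling Rewrite directly, as the package's own tests do; a minimal sketch (ordinary net/http request construction, not part of the patch), with expectations taken from the new test cases:

    package main

    import (
        "fmt"
        "net/http"

        "github.com/caddyserver/caddy/v2"
        "github.com/caddyserver/caddy/v2/modules/caddyhttp/rewrite"
    )

    func main() {
        rule := rewrite.Rewrite{StripPathPrefix: "/a%2Fb/c"}
        repl := caddy.NewReplacer()

        // the escaped %2F in the prefix must appear literally in the request path
        req, _ := http.NewRequest("GET", "http://example.com/a%2Fb/c/d", nil)
        rule.Rewrite(req, repl)
        fmt.Println(req.URL.Path) // "/d"

        // a plain slash at that position does not satisfy the escaped prefix,
        // so the request is left untouched
        req2, _ := http.NewRequest("GET", "http://example.com/a/b/c/d", nil)
        rule.Rewrite(req2, repl)
        fmt.Println(req2.URL.Path) // "/a/b/c/d"
    }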
@@ -59,9 +71,15 @@ type Rewrite struct {
URI string `json:"uri,omitempty"`
// Strips the given prefix from the beginning of the URI path.
+ // The prefix should be written in normalized (unescaped) form,
+ // but if an escape sequence (`%xx`) is used, the request path must
+ // have that same escape at that position in order to match.
StripPathPrefix string `json:"strip_path_prefix,omitempty"`
// Strips the given suffix from the end of the URI path.
+ // The suffix should be written in normalized (unescaped) form,
+ // but if an escape sequence (`%xx`) is used, the request path must
+ // have that same escape at that position in order to match.
StripPathSuffix string `json:"strip_path_suffix,omitempty"`
// Performs substring replacements on the URI.
@@ -227,17 +245,18 @@ func (rewr Rewrite) Rewrite(r *http.Request, repl *caddy.Replacer) bool {
// strip path prefix or suffix
if rewr.StripPathPrefix != "" {
prefix := repl.ReplaceAll(rewr.StripPathPrefix, "")
- r.URL.RawPath = strings.TrimPrefix(r.URL.RawPath, prefix)
- if p, err := url.PathUnescape(r.URL.RawPath); err == nil && p != "" {
- r.URL.Path = p
- } else {
- r.URL.Path = strings.TrimPrefix(r.URL.Path, prefix)
- }
+ mergeSlashes := !strings.Contains(prefix, "//")
+ changePath(r, func(escapedPath string) string {
+ escapedPath = caddyhttp.CleanPath(escapedPath, mergeSlashes)
+ return trimPathPrefix(escapedPath, prefix)
+ })
}
if rewr.StripPathSuffix != "" {
suffix := repl.ReplaceAll(rewr.StripPathSuffix, "")
- changePath(r, func(pathOrRawPath string) string {
- return strings.TrimSuffix(pathOrRawPath, suffix)
+ mergeSlashes := !strings.Contains(suffix, "//")
+ changePath(r, func(escapedPath string) string {
+ escapedPath = caddyhttp.CleanPath(escapedPath, mergeSlashes)
+ return reverse(trimPathPrefix(reverse(escapedPath), reverse(suffix)))
})
}
@@ -324,6 +343,58 @@ func buildQueryString(qs string, repl *caddy.Replacer) string {
return sb.String()
}
+// trimPathPrefix is like strings.TrimPrefix, but customized for advanced URI
+// path prefix matching. The string prefix will be trimmed from the beginning
+// of escapedPath if escapedPath starts with prefix. Rather than a naive 1:1
+// comparison of each byte to determine if escapedPath starts with prefix,
+// both strings are iterated in lock-step, and if prefix has a '%' encoding
+// at a particular position, escapedPath must also have the same encoding
+// representation for that character. In other words, if the prefix string
+// uses the escaped form for a character, escapedPath must literally use the
+// same escape at that position. Otherwise, all character comparisons are
+// performed in normalized/unescaped space.
+func trimPathPrefix(escapedPath, prefix string) string {
+ var iPath, iPrefix int
+ for {
+ if iPath >= len(escapedPath) || iPrefix >= len(prefix) {
+ break
+ }
+
+ prefixCh := prefix[iPrefix]
+ ch := string(escapedPath[iPath])
+
+ if ch == "%" && prefixCh != '%' && len(escapedPath) >= iPath+3 {
+ var err error
+ ch, err = url.PathUnescape(escapedPath[iPath : iPath+3])
+ if err != nil {
+ // should be impossible unless EscapedPath() is returning invalid values!
+ return escapedPath
+ }
+ iPath += 2
+ }
+
+ // prefix comparisons are case-insensitive for consistency with the
+ // path matcher, which is case-insensitive for good reasons
+ if !strings.EqualFold(ch, string(prefixCh)) {
+ return escapedPath
+ }
+
+ iPath++
+ iPrefix++
+ }
+
+ // found matching prefix, trim it
+ return escapedPath[iPath:]
+}
+
+func reverse(s string) string {
+ r := []rune(s)
+ for i, j := 0, len(r)-1; i < len(r)/2; i, j = i+1, j-1 {
+ r[i], r[j] = r[j], r[i]
+ }
+ return string(r)
+}
+
// substrReplacer describes a simple and fast substring replacement.
type substrReplacer struct {
// A substring to find. Supports placeholders.
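The reverse/trimPathPrefix trick above gives suffix stripping the same escape-aware behavior; continuing the previous sketch, the new suffix test case works out like this:

    // continuing the sketch above (same imports)
    func exampleStripSuffix() {
        rule := rewrite.Rewrite{StripPathSuffix: "%2fsuffix"}
        req, _ := http.NewRequest("GET", "http://example.com/foo%2Fbar%2fsuffix", nil)
        rule.Rewrite(req, caddy.NewReplacer())
        fmt.Println(req.URL.EscapedPath()) // "/foo%2Fbar": the escaped suffix matched case-insensitively in raw space
    }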
@@ -351,8 +422,10 @@ func (rep substrReplacer) do(r *http.Request, repl *caddy.Replacer) {
find := repl.ReplaceAll(rep.Find, "")
replace := repl.ReplaceAll(rep.Replace, "")
+ mergeSlashes := !strings.Contains(rep.Find, "//")
+
changePath(r, func(pathOrRawPath string) string {
- return strings.Replace(pathOrRawPath, find, replace, lim)
+ return strings.Replace(caddyhttp.CleanPath(pathOrRawPath, mergeSlashes), find, replace, lim)
})
r.URL.RawQuery = strings.Replace(r.URL.RawQuery, find, replace, lim)
@@ -380,16 +453,17 @@ func (rep regexReplacer) do(r *http.Request, repl *caddy.Replacer) {
})
}
-// changePath updates the path on the request URL. It first executes newVal on
-// req.URL.RawPath, and if the result is a valid escaping, it will be copied
-// into req.URL.Path; otherwise newVal is evaluated only on req.URL.Path.
func changePath(req *http.Request, newVal func(pathOrRawPath string) string) {
- req.URL.RawPath = newVal(req.URL.RawPath)
+ req.URL.RawPath = newVal(req.URL.EscapedPath())
if p, err := url.PathUnescape(req.URL.RawPath); err == nil && p != "" {
req.URL.Path = p
} else {
req.URL.Path = newVal(req.URL.Path)
}
+ // RawPath is only set if it's different from the normalized Path (std lib)
+ if req.URL.RawPath == req.URL.Path {
+ req.URL.RawPath = ""
+ }
}
// Interface guard
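For reference, the RawPath behavior that changePath now accounts for comes from net/url itself: RawPath is only populated when the supplied escaping differs from the default encoding of Path. A small standard-library illustration (a fragment assuming "fmt" and "net/url" are imported), not part of the patch:

    func exampleRawPath() {
        u, _ := url.ParseRequestURI("/a%2Fb")
        fmt.Println(u.Path, u.RawPath, u.EscapedPath()) // "/a/b" "/a%2Fb" "/a%2Fb"

        u2, _ := url.ParseRequestURI("/a%20b")
        fmt.Println(u2.Path, u2.RawPath, u2.EscapedPath()) // "/a b" "" (unset) "/a%20b"
    }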
diff --git a/modules/caddyhttp/rewrite/rewrite_test.go b/modules/caddyhttp/rewrite/rewrite_test.go
index 84dce95..bc20c85 100644
--- a/modules/caddyhttp/rewrite/rewrite_test.go
+++ b/modules/caddyhttp/rewrite/rewrite_test.go
@@ -235,6 +235,42 @@ func TestRewrite(t *testing.T) {
input: newRequest(t, "GET", "/foo/prefix/bar"),
expect: newRequest(t, "GET", "/foo/prefix/bar"),
},
+ {
+ rule: Rewrite{StripPathPrefix: "//prefix"},
+ // scheme and host needed for URL parser to succeed in setting up test
+ input: newRequest(t, "GET", "http://host//prefix/foo/bar"),
+ expect: newRequest(t, "GET", "http://host/foo/bar"),
+ },
+ {
+ rule: Rewrite{StripPathPrefix: "//prefix"},
+ input: newRequest(t, "GET", "/prefix/foo/bar"),
+ expect: newRequest(t, "GET", "/prefix/foo/bar"),
+ },
+ {
+ rule: Rewrite{StripPathPrefix: "/a%2Fb/c"},
+ input: newRequest(t, "GET", "/a%2Fb/c/d"),
+ expect: newRequest(t, "GET", "/d"),
+ },
+ {
+ rule: Rewrite{StripPathPrefix: "/a%2Fb/c"},
+ input: newRequest(t, "GET", "/a%2fb/c/d"),
+ expect: newRequest(t, "GET", "/d"),
+ },
+ {
+ rule: Rewrite{StripPathPrefix: "/a/b/c"},
+ input: newRequest(t, "GET", "/a%2Fb/c/d"),
+ expect: newRequest(t, "GET", "/d"),
+ },
+ {
+ rule: Rewrite{StripPathPrefix: "/a%2Fb/c"},
+ input: newRequest(t, "GET", "/a/b/c/d"),
+ expect: newRequest(t, "GET", "/a/b/c/d"),
+ },
+ {
+ rule: Rewrite{StripPathPrefix: "//a%2Fb/c"},
+ input: newRequest(t, "GET", "/a/b/c/d"),
+ expect: newRequest(t, "GET", "/a/b/c/d"),
+ },
{
rule: Rewrite{StripPathSuffix: "/suffix"},
@@ -252,6 +288,11 @@ func TestRewrite(t *testing.T) {
expect: newRequest(t, "GET", "/foo%2Fbar/"),
},
{
+ rule: Rewrite{StripPathSuffix: "%2fsuffix"},
+ input: newRequest(t, "GET", "/foo%2Fbar%2fsuffix"),
+ expect: newRequest(t, "GET", "/foo%2Fbar"),
+ },
+ {
rule: Rewrite{StripPathSuffix: "/suffix"},
input: newRequest(t, "GET", "/foo/suffix/bar"),
expect: newRequest(t, "GET", "/foo/suffix/bar"),