From 960150bb034dc9a549ee7289b1a4eb4abafeb30a Mon Sep 17 00:00:00 2001
From: Francis Lavoie <lavofr@gmail.com>
Date: Sat, 25 Feb 2023 19:34:27 -0500
Subject: caddyfile: Implement heredoc support (#5385)

---
 caddyconfig/caddyfile/lexer.go | 226 +++++++++++++++++++++++++++++++++--------
 1 file changed, 181 insertions(+), 45 deletions(-)

(limited to 'caddyconfig/caddyfile/lexer.go')

diff --git a/caddyconfig/caddyfile/lexer.go b/caddyconfig/caddyfile/lexer.go
index 09b04c1..ba8b879 100644
--- a/caddyconfig/caddyfile/lexer.go
+++ b/caddyconfig/caddyfile/lexer.go
@@ -17,7 +17,10 @@ package caddyfile
 import (
 	"bufio"
 	"bytes"
+	"fmt"
 	"io"
+	"regexp"
+	"strings"
 	"unicode"
 )
 
@@ -35,30 +38,39 @@ type (
 
 	// Token represents a single parsable unit.
 	Token struct {
-		File        string
-		origFile    string
-		Line        int
-		Text        string
-		wasQuoted   rune // enclosing quote character, if any
-		snippetName string
+		File          string
+		origFile      string
+		Line          int
+		Text          string
+		wasQuoted     rune // enclosing quote character, if any
+		heredocMarker string
+		snippetName   string
 	}
 )
 
-// originalFile gets original filename before import modification.
-func (t Token) originalFile() string {
-	if t.origFile != "" {
-		return t.origFile
+// Tokenize lexes input into a list of tokens that can be parsed as a
+// Caddyfile. Each token's File is set to filename, the source of the
+// tokens, which is important to determine relative paths for `import`
+// directives. An error from loading the input or from the lexer aborts
+// tokenization: no tokens are returned, only the error.
+func Tokenize(input []byte, filename string) ([]Token, error) {
+	l := lexer{}
+	if err := l.load(bytes.NewReader(input)); err != nil {
+		return nil, err
 	}
-	return t.File
-}
-
-// updateFile updates the token's source filename for error display
-// and remembers the original filename. Used during "import" processing.
-func (t *Token) updateFile(file string) {
-	if t.origFile == "" {
-		t.origFile = t.File
+	var tokens []Token
+	for {
+		found, err := l.next()
+		if err != nil {
+			return nil, err
+		}
+		if !found {
+			break
+		}
+		l.token.File = filename
+		tokens = append(tokens, l.token)
 	}
-	t.File = file
+	return tokens, nil
 }
 
 // load prepares the lexer to scan an input for tokens.
@@ -92,28 +104,93 @@ func (l *lexer) load(input io.Reader) error {
 // may be escaped. The rest of the line is skipped
 // if a "#" character is read in. Returns true if
 // a token was loaded; false otherwise.
-func (l *lexer) next() bool {
+func (l *lexer) next() (bool, error) {
 	var val []rune
-	var comment, quoted, btQuoted, escaped bool
+	var comment, quoted, btQuoted, inHeredoc, heredocEscaped, escaped bool
+	var heredocMarker string
 
 	makeToken := func(quoted rune) bool {
 		l.token.Text = string(val)
 		l.token.wasQuoted = quoted
+		l.token.heredocMarker = heredocMarker
 		return true
 	}
 
 	for {
+		// Read a character in; if err then if we had
+		// read some characters, make a token. If we
+		// reached EOF, then no more tokens to read.
+		// If no EOF, then we had a problem.
 		ch, _, err := l.reader.ReadRune()
 		if err != nil {
 			if len(val) > 0 {
-				return makeToken(0)
+				if inHeredoc {
+					return false, fmt.Errorf("incomplete heredoc <<%s on line #%d, expected ending marker %s", heredocMarker, l.line+l.skippedLines, heredocMarker)
+				}
+
+				return makeToken(0), nil
 			}
 			if err == io.EOF {
-				return false
+				return false, nil
 			}
-			panic(err)
+			return false, err
 		}
 
+		// detect whether we have the start of a heredoc
+		if !inHeredoc && !heredocEscaped && len(val) > 1 && string(val[:2]) == "<<" {
+			if ch == '<' {
+				return false, fmt.Errorf("too many '<' for heredoc on line #%d; only use two, for example <<END", l.line)
+			}
+			if ch == '\r' {
+				continue
+			}
+			// after hitting a newline, we know that the heredoc marker
+			// is the characters between the two << and the newline.
+			// we reset the val because the heredoc opener is syntax we
+			// don't want to keep in the token text.
+			if ch == '\n' {
+				heredocMarker = string(val[2:])
+				if !heredocMarkerRegexp.Match([]byte(heredocMarker)) {
+					return false, fmt.Errorf("heredoc marker on line #%d must contain only alpha-numeric characters, dashes and underscores; got '%s'", l.line, heredocMarker)
+				}
+
+				inHeredoc = true
+				l.skippedLines++
+				val = nil
+				continue
+			}
+			val = append(val, ch)
+			continue
+		}
+
+		// if we're in a heredoc, all characters are read as-is
+		if inHeredoc {
+			val = append(val, ch)
+
+			if ch == '\n' {
+				l.skippedLines++
+			}
+
+			// check if we're done, i.e. that the last few characters are the marker
+			if len(val) > len(heredocMarker) && heredocMarker == string(val[len(val)-len(heredocMarker):]) {
+				// set the final value
+				val, err = l.finalizeHeredoc(val, heredocMarker)
+				if err != nil {
+					return false, err
+				}
+
+				// set the line counter, and make the token
+				l.line += l.skippedLines
+				l.skippedLines = 0
+				return makeToken('<'), nil
+			}
+
+			// stay in the heredoc until we find the ending marker
+			continue
+		}
+
+		// track whether we found an escape '\' for the next
+		// iteration to be contextually aware
 		if !escaped && !btQuoted && ch == '\\' {
 			escaped = true
 			continue
@@ -128,26 +205,29 @@ func (l *lexer) next() bool {
 				}
 				escaped = false
 			} else {
-				if quoted && ch == '"' {
-					return makeToken('"')
-				}
-				if btQuoted && ch == '`' {
-					return makeToken('`')
+				if (quoted && ch == '"') || (btQuoted && ch == '`') {
+					return makeToken(ch), nil
 				}
 			}
+			// allow quoted text to wrap and continue on multiple lines
 			if ch == '\n' {
 				l.line += 1 + l.skippedLines
 				l.skippedLines = 0
 			}
+			// collect this character as part of the quoted token
 			val = append(val, ch)
 			continue
 		}
 
 		if unicode.IsSpace(ch) {
+			// ignore CR altogether, we only actually care about LF (\n)
 			if ch == '\r' {
 				continue
 			}
+			// end of the line
 			if ch == '\n' {
+				// newlines can be escaped to chain arguments
+				// onto multiple lines; else, increment the line count
 				if escaped {
 					l.skippedLines++
 					escaped = false
@@ -155,14 +235,18 @@ func (l *lexer) next() bool {
 					l.line += 1 + l.skippedLines
 					l.skippedLines = 0
 				}
+				// comments (#) are single-line only
 				comment = false
 			}
+			// any kind of space means we're at the end of this token
 			if len(val) > 0 {
-				return makeToken(0)
+				return makeToken(0), nil
 			}
 			continue
 		}
 
+		// comments must be at the start of a token,
+		// in other words, preceded by space or newline
 		if ch == '#' && len(val) == 0 {
 			comment = true
 		}
@@ -183,7 +267,12 @@ func (l *lexer) next() bool {
 		}
 
 		if escaped {
-			val = append(val, '\\')
+			// allow escaping the first < to skip the heredoc syntax
+			if ch == '<' {
+				heredocEscaped = true
+			} else {
+				val = append(val, '\\')
+			}
 			escaped = false
 		}
 
@@ -191,24 +280,71 @@ func (l *lexer) next() bool {
 	}
 }
 
-// Tokenize takes bytes as input and lexes it into
-// a list of tokens that can be parsed as a Caddyfile.
-// Also takes a filename to fill the token's File as
-// the source of the tokens, which is important to
-// determine relative paths for `import` directives.
-func Tokenize(input []byte, filename string) ([]Token, error) {
-	l := lexer{}
-	if err := l.load(bytes.NewReader(input)); err != nil {
-		return nil, err
+// finalizeHeredoc takes the heredoc text as read (content lines, then the line
+// holding the closing marker) and strips the whitespace preceding the marker from
+// the front of every content line; it errors if a line lacks that exact prefix.
+func (l *lexer) finalizeHeredoc(val []rune, marker string) ([]rune, error) {
+	// index the string, not the rune slice: LastIndex returns a byte offset
+	stringVal := string(val)
+	lastNewline := strings.LastIndex(stringVal, "\n") // end of the contents
+
+	// split the content into separate lines
+	lines := strings.Split(stringVal[:lastNewline+1], "\n")
+
+	// the padding to strip is whatever precedes the marker on the last line
+	paddingToStrip := stringVal[lastNewline+1 : len(stringVal)-len(marker)]
+
+	// iterate over each line and strip the whitespace from the front
+	var out strings.Builder
+	for lineNum, lineText := range lines[:len(lines)-1] {
+		// the padding must match exactly at the start of the line,
+		// otherwise we can't safely strip it
+		if !strings.HasPrefix(lineText, paddingToStrip) {
+			return nil, fmt.Errorf("mismatched leading whitespace in heredoc <<%s on line #%d [%s], expected whitespace [%s] to match the closing marker", marker, l.line+lineNum+1, lineText, paddingToStrip)
+		}
+
+		// strip, then append the line, with the newline, to the output.
+		// also removes all "\r" so that Windows (CRLF) line endings
+		// don't leak into the token text.
+		out.WriteString(strings.ReplaceAll(lineText[len(paddingToStrip):], "\r", ""))
+		out.WriteByte('\n')
 	}
-	var tokens []Token
-	for l.next() {
-		l.token.File = filename
-		tokens = append(tokens, l.token)
+
+	// return the final value
+	return []rune(out.String()), nil
+}
+
+// originalFile returns the filename recorded before updateFile changed it, or File if none was recorded.
+func (t Token) originalFile() string {
+	if t.origFile != "" {
+		return t.origFile
 	}
-	return tokens, nil
+	return t.File
+}
+
+// updateFile updates the token's source filename for error display and, while none
+// is recorded yet, remembers the original filename. Used during "import" processing.
+func (t *Token) updateFile(file string) {
+	if t.origFile == "" {
+		t.origFile = t.File
+	}
+	t.File = file
 }
 
 func (t Token) Quoted() bool {
 	return t.wasQuoted > 0
 }
+
+// NumLineBreaks counts how many line breaks are in the token text,
+// plus one for a heredoc token (wasQuoted == '<').
+func (t Token) NumLineBreaks() int {
+	lineBreaks := strings.Count(t.Text, "\n")
+	if t.wasQuoted == '<' {
+		// the opening delimiter is on its own line and
+		// is not included in the token Text itself
+		lineBreaks++
+	}
+	return lineBreaks
+}
+
+var heredocMarkerRegexp = regexp.MustCompile("^[A-Za-z0-9_-]+$") // a valid marker: one or more ASCII letters, digits, underscores or dashes
-- 
cgit v1.2.3