caddyfile: Implement heredoc support (#5385)

author: Francis Lavoie <lavofr@gmail.com> 2023-02-25 19:34:27 -0500
committer: GitHub <noreply@github.com> 2023-02-26 00:34:27 +0000
commit: 960150bb034dc9a549ee7289b1a4eb4abafeb30a (patch)
tree: a3608546db0b154a75afc3dae2d52d44b92ceac9 /caddyconfig
parent: 9e6919550be5689628d0020ec14e90ea6f527716 (diff)
5 files changed, 324 insertions, 89 deletions
diff --git a/caddyconfig/caddyfile/dispenser.go b/caddyconfig/caddyfile/dispenser.go
index 91bd9a5..0daed3c 100644
--- a/caddyconfig/caddyfile/dispenser.go
+++ b/caddyconfig/caddyfile/dispenser.go
@@ -20,7 +20,6 @@ import (
 	"io"
 	"log"
 	"strconv"
-	"strings"
 )
 
 // Dispenser is a type that dispenses tokens, similarly to a lexer,
@@ -101,12 +100,12 @@ func (d *Dispenser) nextOnSameLine() bool {
 		d.cursor++
 		return true
 	}
-	if d.cursor >= len(d.tokens) {
+	if d.cursor >= len(d.tokens)-1 {
 		return false
 	}
-	if d.cursor < len(d.tokens)-1 &&
-		d.tokens[d.cursor].File == d.tokens[d.cursor+1].File &&
-		d.tokens[d.cursor].Line+d.numLineBreaks(d.cursor) == d.tokens[d.cursor+1].Line {
+	curr := d.tokens[d.cursor]
+	next := d.tokens[d.cursor+1]
+	if curr.File == next.File && curr.Line+curr.NumLineBreaks() == next.Line {
 		d.cursor++
 		return true
 	}
@@ -122,12 +121,12 @@ func (d *Dispenser) NextLine() bool {
 		d.cursor++
 		return true
 	}
-	if d.cursor >= len(d.tokens) {
+	if d.cursor >= len(d.tokens)-1 {
 		return false
 	}
-	if d.cursor < len(d.tokens)-1 &&
-		(d.tokens[d.cursor].File != d.tokens[d.cursor+1].File ||
-			d.tokens[d.cursor].Line+d.numLineBreaks(d.cursor) < d.tokens[d.cursor+1].Line) {
+	curr := d.tokens[d.cursor]
+	next := d.tokens[d.cursor+1]
+	if curr.File != next.File || curr.Line+curr.NumLineBreaks() < next.Line {
 		d.cursor++
 		return true
 	}
@@ -203,14 +202,17 @@ func (d *Dispenser) Val() string {
 }
 
 // ValRaw gets the raw text of the current token (including quotes).
+// If the token was a heredoc, then the delimiter is not included,
+// because that is not relevant to any unmarshaling logic at this time.
 // If there is no token loaded, it returns empty string.
 func (d *Dispenser) ValRaw() string {
 	if d.cursor < 0 || d.cursor >= len(d.tokens) {
 		return ""
 	}
 	quote := d.tokens[d.cursor].wasQuoted
-	if quote > 0 {
-		return string(quote) + d.tokens[d.cursor].Text + string(quote) // string literal
+	if quote > 0 && quote != '<' {
+		// string literal
+		return string(quote) + d.tokens[d.cursor].Text + string(quote)
 	}
 	return d.tokens[d.cursor].Text
 }
@@ -438,14 +440,14 @@ func (d *Dispenser) Delete() []Token {
 	return d.tokens
 }
 
-// numLineBreaks counts how many line breaks are in the token
-// value given by the token index tknIdx. It returns 0 if the
-// token does not exist or there are no line breaks.
-func (d *Dispenser) numLineBreaks(tknIdx int) int {
-	if tknIdx < 0 || tknIdx >= len(d.tokens) {
-		return 0
+// DeleteN is the same as Delete, but can delete many tokens at once.
+// If there aren't N tokens available to delete, none are deleted.
+func (d *Dispenser) DeleteN(amount int) []Token {
+	if amount > 0 && d.cursor >= (amount-1) && d.cursor <= len(d.tokens)-1 {
+		d.tokens = append(d.tokens[:d.cursor-(amount-1)], d.tokens[d.cursor+1:]...)
+		d.cursor -= amount
 	}
-	return strings.Count(d.tokens[tknIdx].Text, "\n")
+	return d.tokens
 }
 
 // isNewLine determines whether the current token is on a different
@@ -468,18 +470,10 @@ func (d *Dispenser) isNewLine() bool {
 		return true
 	}
 
-	// The previous token may contain line breaks if
-	// it was quoted and spanned multiple lines. e.g:
-	//
-	// dir "foo
-	//   bar
-	//   baz"
-	prevLineBreaks := d.numLineBreaks(d.cursor - 1)
-
 	// If the previous token (incl line breaks) ends
 	// on a line earlier than the current token,
 	// then the current token is on a new line
-	return prev.Line+prevLineBreaks < curr.Line
+	return prev.Line+prev.NumLineBreaks() < curr.Line
 }
 
 // isNextOnNewLine determines whether the current token is on a different
@@ -502,16 +496,8 @@ func (d *Dispenser) isNextOnNewLine() bool {
 		return true
 	}
 
-	// The current token may contain line breaks if
-	// it was quoted and spanned multiple lines. e.g:
-	//
-	// dir "foo
-	//   bar
-	//   baz"
-	currLineBreaks := d.numLineBreaks(d.cursor)
-
 	// If the current token (incl line breaks) ends
 	// on a line earlier than the next token,
 	// then the next token is on a new line
-	return curr.Line+currLineBreaks < next.Line
+	return curr.Line+curr.NumLineBreaks() < next.Line
 }
diff --git a/caddyconfig/caddyfile/lexer.go b/caddyconfig/caddyfile/lexer.go
index 09b04c1..ba8b879 100644
--- a/caddyconfig/caddyfile/lexer.go
+++ b/caddyconfig/caddyfile/lexer.go
@@ -17,7 +17,10 @@ package caddyfile
 import (
 	"bufio"
 	"bytes"
+	"fmt"
 	"io"
+	"regexp"
+	"strings"
 	"unicode"
 )
 
@@ -35,30 +38,39 @@ type (
 
 	// Token represents a single parsable unit.
 	Token struct {
-		File        string
-		origFile    string
-		Line        int
-		Text        string
-		wasQuoted   rune // enclosing quote character, if any
-		snippetName string
+		File          string
+		origFile      string
+		Line          int
+		Text          string
+		wasQuoted     rune // enclosing quote character, if any
+		heredocMarker string
+		snippetName   string
 	}
 )
 
-// originalFile gets original filename before import modification.
-func (t Token) originalFile() string {
-	if t.origFile != "" {
-		return t.origFile
+// Tokenize takes bytes as input and lexes it into
+// a list of tokens that can be parsed as a Caddyfile.
+// Also takes a filename to fill the token's File as
+// the source of the tokens, which is important to
+// determine relative paths for `import` directives.
+func Tokenize(input []byte, filename string) ([]Token, error) {
+	l := lexer{}
+	if err := l.load(bytes.NewReader(input)); err != nil {
+		return nil, err
 	}
-	return t.File
-}
-
-// updateFile updates the token's source filename for error display
-// and remembers the original filename. Used during "import" processing.
-func (t *Token) updateFile(file string) {
-	if t.origFile == "" {
-		t.origFile = t.File
+	var tokens []Token
+	for {
+		found, err := l.next()
+		if err != nil {
+			return nil, err
+		}
+		if !found {
+			break
+		}
+		l.token.File = filename
+		tokens = append(tokens, l.token)
 	}
-	t.File = file
+	return tokens, nil
 }
 
 // load prepares the lexer to scan an input for tokens.
@@ -92,28 +104,93 @@ func (l *lexer) load(input io.Reader) error {
 // may be escaped. The rest of the line is skipped
 // if a "#" character is read in. Returns true if
 // a token was loaded; false otherwise.
-func (l *lexer) next() bool {
+func (l *lexer) next() (bool, error) {
 	var val []rune
-	var comment, quoted, btQuoted, escaped bool
+	var comment, quoted, btQuoted, inHeredoc, heredocEscaped, escaped bool
+	var heredocMarker string
 
 	makeToken := func(quoted rune) bool {
 		l.token.Text = string(val)
 		l.token.wasQuoted = quoted
+		l.token.heredocMarker = heredocMarker
 		return true
 	}
 
 	for {
+		// Read a character in; if err then if we had
+		// read some characters, make a token. If we
+		// reached EOF, then no more tokens to read.
+		// If no EOF, then we had a problem.
 		ch, _, err := l.reader.ReadRune()
 		if err != nil {
 			if len(val) > 0 {
-				return makeToken(0)
+				if inHeredoc {
+					return false, fmt.Errorf("incomplete heredoc <<%s on line #%d, expected ending marker %s", heredocMarker, l.line+l.skippedLines, heredocMarker)
+				}
+
+				return makeToken(0), nil
 			}
 			if err == io.EOF {
-				return false
+				return false, nil
 			}
-			panic(err)
+			return false, err
 		}
 
+		// detect whether we have the start of a heredoc
+		if !inHeredoc && !heredocEscaped && len(val) > 1 && string(val[:2]) == "<<" {
+			if ch == '<' {
+				return false, fmt.Errorf("too many '<' for heredoc on line #%d; only use two, for example <<END", l.line)
+			}
+			if ch == '\r' {
+				continue
+			}
+			// after hitting a newline, we know that the heredoc marker
+			// is the characters between the two << and the newline.
+			// we reset the val because the heredoc marker is syntax we
+			// don't want to keep in the token text.
+			if ch == '\n' {
+				heredocMarker = string(val[2:])
+				if !heredocMarkerRegexp.Match([]byte(heredocMarker)) {
+					return false, fmt.Errorf("heredoc marker on line #%d must contain only alpha-numeric characters, dashes and underscores; got '%s'", l.line, heredocMarker)
+				}
+
+				inHeredoc = true
+				l.skippedLines++
+				val = nil
+				continue
+			}
+			val = append(val, ch)
+			continue
+		}
+
+		// if we're in a heredoc, all characters are read as-is
+		if inHeredoc {
+			val = append(val, ch)
+
+			if ch == '\n' {
+				l.skippedLines++
+			}
+
+			// check if we're done, i.e. that the last few characters are the marker
+			if len(val) > len(heredocMarker) && heredocMarker == string(val[len(val)-len(heredocMarker):]) {
+				// set the final value
+				val, err = l.finalizeHeredoc(val, heredocMarker)
+				if err != nil {
+					return false, err
+				}
+
+				// set the line counter, and make the token
+				l.line += l.skippedLines
+				l.skippedLines = 0
+				return makeToken('<'), nil
+			}
+
+			// stay in the heredoc until we find the ending marker
+			continue
+		}
+
+		// track whether we found an escape '\' for the next
+		// iteration to be contextually aware
 		if !escaped && !btQuoted && ch == '\\' {
 			escaped = true
 			continue
@@ -128,26 +205,29 @@ func (l *lexer) next() bool {
 				}
 				escaped = false
 			} else {
-				if quoted && ch == '"' {
-					return makeToken('"')
-				}
-				if btQuoted && ch == '`' {
-					return makeToken('`')
+				if (quoted && ch == '"') || (btQuoted && ch == '`') {
+					return makeToken(ch), nil
 				}
 			}
+			// allow quoted text to wrap and continue on multiple lines
 			if ch == '\n' {
 				l.line += 1 + l.skippedLines
 				l.skippedLines = 0
 			}
+			// collect this character as part of the quoted token
 			val = append(val, ch)
 			continue
 		}
 
 		if unicode.IsSpace(ch) {
+			// ignore CR altogether, we only actually care about LF (\n)
 			if ch == '\r' {
 				continue
 			}
+			// end of the line
 			if ch == '\n' {
+				// newlines can be escaped to chain arguments
+				// onto multiple lines; else, increment the line count
 				if escaped {
 					l.skippedLines++
 					escaped = false
@@ -155,14 +235,18 @@ func (l *lexer) next() bool {
 					l.line += 1 + l.skippedLines
 					l.skippedLines = 0
 				}
+				// comments (#) are single-line only
 				comment = false
 			}
+			// any kind of space means we're at the end of this token
 			if len(val) > 0 {
-				return makeToken(0)
+				return makeToken(0), nil
 			}
 			continue
 		}
 
+		// comments must be at the start of a token,
+		// in other words, preceded by space or newline
 		if ch == '#' && len(val) == 0 {
 			comment = true
 		}
@@ -183,7 +267,12 @@ func (l *lexer) next() bool {
 		}
 
 		if escaped {
-			val = append(val, '\\')
+			// allow escaping the first < to skip the heredoc syntax
+			if ch == '<' {
+				heredocEscaped = true
+			} else {
+				val = append(val, '\\')
+			}
 			escaped = false
 		}
 
@@ -191,24 +280,71 @@ func (l *lexer) next() bool {
 	}
 }
 
-// Tokenize takes bytes as input and lexes it into
-// a list of tokens that can be parsed as a Caddyfile.
-// Also takes a filename to fill the token's File as
-// the source of the tokens, which is important to
-// determine relative paths for `import` directives.
-func Tokenize(input []byte, filename string) ([]Token, error) {
-	l := lexer{}
-	if err := l.load(bytes.NewReader(input)); err != nil {
-		return nil, err
+// finalizeHeredoc takes the runes read as the heredoc text and the marker,
+// and processes the text to strip leading whitespace, returning the final
+// value without the leading whitespace.
+func (l *lexer) finalizeHeredoc(val []rune, marker string) ([]rune, error) {
+	// find the last newline of the heredoc, which is where the contents end
+	lastNewline := strings.LastIndex(string(val), "\n")
+
+	// collapse the content, then split into separate lines
+	lines := strings.Split(string(val[:lastNewline+1]), "\n")
+
+	// figure out how much whitespace we need to strip from the front of every line
+	// by getting the string that precedes the marker, on the last line
+	paddingToStrip := string(val[lastNewline+1 : len(val)-len(marker)])
+
+	// iterate over each line and strip the whitespace from the front
+	var out string
+	for lineNum, lineText := range lines[:len(lines)-1] {
+		// find an exact match for the padding
+		index := strings.Index(lineText, paddingToStrip)
+
+		// if the padding doesn't match exactly at the start then we can't safely strip
+		if index != 0 {
+			return nil, fmt.Errorf("mismatched leading whitespace in heredoc <<%s on line #%d [%s], expected whitespace [%s] to match the closing marker", marker, l.line+lineNum+1, lineText, paddingToStrip)
+		}
+
+		// strip, then append the line, with the newline, to the output.
+		// also removes all "\r" because Windows.
+		out += strings.ReplaceAll(lineText[len(paddingToStrip):]+"\n", "\r", "")
 	}
-	var tokens []Token
-	for l.next() {
-		l.token.File = filename
-		tokens = append(tokens, l.token)
+
+	// return the final value
+	return []rune(out), nil
+}
+
+// originalFile gets original filename before import modification.
+func (t Token) originalFile() string {
+	if t.origFile != "" {
+		return t.origFile
 	}
-	return tokens, nil
+	return t.File
+}
+
+// updateFile updates the token's source filename for error display
+// and remembers the original filename. Used during "import" processing.
+func (t *Token) updateFile(file string) {
+	if t.origFile == "" {
+		t.origFile = t.File
+	}
+	t.File = file
 }
 
 func (t Token) Quoted() bool {
 	return t.wasQuoted > 0
 }
+
+// NumLineBreaks counts how many line breaks are in the token text.
+func (t Token) NumLineBreaks() int {
+	lineBreaks := strings.Count(t.Text, "\n")
+	if t.wasQuoted == '<' {
+		// heredocs have an extra linebreak because the opening
+		// delimiter is on its own line and is not included in
+		// the token Text itself
+		lineBreaks++
+	}
+	return lineBreaks
+}
+
+var heredocMarkerRegexp = regexp.MustCompile("^[A-Za-z0-9_-]+$")
diff --git a/caddyconfig/caddyfile/lexer_test.go b/caddyconfig/caddyfile/lexer_test.go
index 30ee0f6..3c7e157 100644
--- a/caddyconfig/caddyfile/lexer_test.go
+++ b/caddyconfig/caddyfile/lexer_test.go
@@ -18,13 +18,13 @@ import (
 	"testing"
 )
 
-type lexerTestCase struct {
-	input    []byte
-	expected []Token
-}
-
 func TestLexer(t *testing.T) {
-	testCases := []lexerTestCase{
+	testCases := []struct {
+		input        []byte
+		expected     []Token
+		expectErr    bool
+		errorMessage string
+	}{
 		{
 			input: []byte(`host:123`),
 			expected: []Token{
@@ -249,10 +249,123 @@ func TestLexer(t *testing.T) {
 				{Line: 1, Text: `quotes`},
 			},
 		},
+		{
+			input: []byte(`heredoc <<EOF
+content
+EOF same-line-arg
+	`),
+			expected: []Token{
+				{Line: 1, Text: `heredoc`},
+				{Line: 1, Text: "content\n"},
+				{Line: 3, Text: `same-line-arg`},
+			},
+		},
+		{
+			input: []byte(`heredoc <<VERY-LONG-MARKER
+content
+VERY-LONG-MARKER same-line-arg
+	`),
+			expected: []Token{
+				{Line: 1, Text: `heredoc`},
+				{Line: 1, Text: "content\n"},
+				{Line: 3, Text: `same-line-arg`},
+			},
+		},
+		{
+			input: []byte(`heredoc <<EOF
+	content
+	EOF same-line-arg
+	`),
+			expected: []Token{
+				{Line: 1, Text: `heredoc`},
+				{Line: 1, Text: "content\n"},
+				{Line: 3, Text: `same-line-arg`},
+			},
+		},
+		{
+			input: []byte(`prev-line
+	heredoc <<EOF
+		multi
+		line
+		content
+	EOF same-line-arg
+	next-line
+	`),
+			expected: []Token{
+				{Line: 1, Text: `prev-line`},
+				{Line: 2, Text: `heredoc`},
+				{Line: 2, Text: "\tmulti\n\tline\n\tcontent\n"},
+				{Line: 6, Text: `same-line-arg`},
+				{Line: 7, Text: `next-line`},
+			},
+		},
+		{
+			input: []byte(`heredoc <EOF
+	content
+	EOF same-line-arg
+	`),
+			expected: []Token{
+				{Line: 1, Text: `heredoc`},
+				{Line: 1, Text: `<EOF`},
+				{Line: 2, Text: `content`},
+				{Line: 3, Text: `EOF`},
+				{Line: 3, Text: `same-line-arg`},
+			},
+		},
+		{
+			input: []byte(`heredoc <<HERE SAME LINE
+	content
+	HERE same-line-arg
+	`),
+			expectErr:    true,
+			errorMessage: "heredoc marker on line #1 must contain only alpha-numeric characters, dashes and underscores; got 'HERE SAME LINE'",
+		},
+		{
+			input: []byte(`heredoc <<<EOF
+	content
+	EOF same-line-arg
+	`),
+			expectErr:    true,
+			errorMessage: "too many '<' for heredoc on line #1; only use two, for example <<END",
+		},
+		{
+			input: []byte(`heredoc <<EOF
+	content
+	`),
+			expectErr:    true,
+			errorMessage: "incomplete heredoc <<EOF on line #3, expected ending marker EOF",
+		},
+		{
+			input: []byte(`heredoc <<EOF
+	content
+		EOF
+	`),
+			expectErr:    true,
+			errorMessage: "mismatched leading whitespace in heredoc <<EOF on line #2 [\tcontent], expected whitespace [\t\t] to match the closing marker",
+		},
+		{
+			input: []byte(`heredoc <<EOF
+        content
+		EOF
+	`),
+			expectErr:    true,
+			errorMessage: "mismatched leading whitespace in heredoc <<EOF on line #2 [        content], expected whitespace [\t\t] to match the closing marker",
+		},
 	}
 
 	for i, testCase := range testCases {
 		actual, err := Tokenize(testCase.input, "")
+		if testCase.expectErr {
+			if err == nil {
+				t.Errorf("expected error, got actual: %v", actual)
+				continue
+			}
+			if err.Error() != testCase.errorMessage {
+				t.Errorf("expected error '%v', got: %v", testCase.errorMessage, err)
+			}
+			continue
+		}
+
 		if err != nil {
 			t.Errorf("%v", err)
 		}
diff --git a/caddyconfig/caddyfile/testdata/import_args0.txt b/caddyconfig/caddyfile/testdata/import_args0.txt
index af946fe..add211e 100644
--- a/caddyconfig/caddyfile/testdata/import_args0.txt
+++ b/caddyconfig/caddyfile/testdata/import_args0.txt
@@ -1 +1 @@
-{args.0}
-\ No newline at end of file
+{args[0]}
+\ No newline at end of file
diff --git a/caddyconfig/caddyfile/testdata/import_args1.txt b/caddyconfig/caddyfile/testdata/import_args1.txt
index 519a92d..422692a 100644
--- a/caddyconfig/caddyfile/testdata/import_args1.txt
+++ b/caddyconfig/caddyfile/testdata/import_args1.txt
@@ -1 +1 @@
-{args.0} {args.1}
-\ No newline at end of file
+{args[0]} {args[1]}
+\ No newline at end of file
author	Francis Lavoie <lavofr@gmail.com>	2023-02-25 19:34:27 -0500
committer	GitHub <noreply@github.com>	2023-02-26 00:34:27 +0000
commit	960150bb034dc9a549ee7289b1a4eb4abafeb30a (patch)
tree	a3608546db0b154a75afc3dae2d52d44b92ceac9 /caddyconfig
parent	9e6919550be5689628d0020ec14e90ea6f527716 (diff)