vbump to v2.7.5

author: Tom Barrett <tom@tombarrett.xyz> 2023-11-01 17:57:48 +0100
committer: Tom Barrett <tom@tombarrett.xyz> 2023-11-01 18:11:33 +0100
commit: 240c3d1338415e5d82ef7ca0e52c4284be6441bd (patch)
tree: 4b0ee5d208c2cdffa78d65f1b0abe0ec85f15652 /caddyconfig/caddyfile/lexer.go
parent: 73e78ab226f21e6c6c68961af88c4ab9c746f4f4 (diff)
parent: 0e204b730aa2b1fa0835336b1117eff8c420f713 (diff)
1 files changed, 214 insertions, 32 deletions
diff --git a/caddyconfig/caddyfile/lexer.go b/caddyconfig/caddyfile/lexer.go
index 5605a6a..bfd6c0f 100644
--- a/caddyconfig/caddyfile/lexer.go
+++ b/caddyconfig/caddyfile/lexer.go
@@ -17,7 +17,10 @@ package caddyfile
 import (
 	"bufio"
 	"bytes"
+	"fmt"
 	"io"
+	"regexp"
+	"strings"
 	"unicode"
 )
 
@@ -35,15 +38,41 @@ type (
 
 	// Token represents a single parsable unit.
 	Token struct {
-		File        string
-		Line        int
-		Text        string
-		wasQuoted   rune // enclosing quote character, if any
-		inSnippet   bool
-		snippetName string
+		File          string
+		imports       []string
+		Line          int
+		Text          string
+		wasQuoted     rune // enclosing quote character, if any
+		heredocMarker string
+		snippetName   string
 	}
 )
 
+// Tokenize lexes input into the list of tokens that can be
+// parsed as a Caddyfile. Each token's File is set to filename,
+// the source of the tokens, which is important to determine
+// relative paths for `import` directives. If loading the input
+// or reading a token fails, it returns nil and that error.
+func Tokenize(input []byte, filename string) ([]Token, error) {
+	l := lexer{}
+	if err := l.load(bytes.NewReader(input)); err != nil {
+		return nil, err
+	}
+	var tokens []Token
+	for {
+		found, err := l.next()
+		if err != nil {
+			return nil, err
+		}
+		if !found {
+			break // no token was loaded, so the input is exhausted
+		}
+		l.token.File = filename
+		tokens = append(tokens, l.token)
+	}
+	return tokens, nil
+}
+
 // load prepares the lexer to scan an input for tokens.
 // It discards any leading byte order mark.
 func (l *lexer) load(input io.Reader) error {
@@ -75,28 +104,107 @@ func (l *lexer) load(input io.Reader) error {
 // may be escaped. The rest of the line is skipped
 // if a "#" character is read in. Returns true if
 // a token was loaded; false otherwise.
-func (l *lexer) next() bool {
+func (l *lexer) next() (bool, error) {
 	var val []rune
-	var comment, quoted, btQuoted, escaped bool
+	var comment, quoted, btQuoted, inHeredoc, heredocEscaped, escaped bool
+	var heredocMarker string
 
 	makeToken := func(quoted rune) bool {
 		l.token.Text = string(val)
 		l.token.wasQuoted = quoted
+		l.token.heredocMarker = heredocMarker
 		return true
 	}
 
 	for {
+		// Read a character in; if err then if we had
+		// read some characters, make a token. If we
+		// reached EOF, then no more tokens to read.
+		// If no EOF, then we had a problem.
 		ch, _, err := l.reader.ReadRune()
 		if err != nil {
 			if len(val) > 0 {
-				return makeToken(0)
+				if inHeredoc {
+					return false, fmt.Errorf("incomplete heredoc <<%s on line #%d, expected ending marker %s", heredocMarker, l.line+l.skippedLines, heredocMarker)
+				}
+
+				return makeToken(0), nil
 			}
 			if err == io.EOF {
-				return false
+				return false, nil
+			}
+			return false, err
+		}
+
+		// detect whether we have the start of a heredoc
+		if !(quoted || btQuoted) && !(inHeredoc || heredocEscaped) &&
+			len(val) > 1 && string(val[:2]) == "<<" {
+			// a space means it's just a regular token and not a heredoc
+			if ch == ' ' {
+				return makeToken(0), nil
+			}
+
+			// skip CR, we only care about LF
+			if ch == '\r' {
+				continue
+			}
+
+			// after hitting a newline, we know that the heredoc marker
+			// is the characters after the two << and the newline.
+			// we reset the val because the heredoc is syntax we don't
+			// want to keep.
+			if ch == '\n' {
+				if len(val) == 2 {
+					return false, fmt.Errorf("missing opening heredoc marker on line #%d; must contain only alpha-numeric characters, dashes and underscores; got empty string", l.line)
+				}
+
+				// check if there's too many <
+				if string(val[:3]) == "<<<" {
+					return false, fmt.Errorf("too many '<' for heredoc on line #%d; only use two, for example <<END", l.line)
+				}
+
+				heredocMarker = string(val[2:])
+				if !heredocMarkerRegexp.Match([]byte(heredocMarker)) {
+					return false, fmt.Errorf("heredoc marker on line #%d must contain only alpha-numeric characters, dashes and underscores; got '%s'", l.line, heredocMarker)
+				}
+
+				inHeredoc = true
+				l.skippedLines++
+				val = nil
+				continue
+			}
+			val = append(val, ch)
+			continue
+		}
+
+		// if we're in a heredoc, all characters are read as-is
+		if inHeredoc {
+			val = append(val, ch)
+
+			if ch == '\n' {
+				l.skippedLines++
+			}
+
+			// check if we're done, i.e. that the last few characters are the marker
+			if len(val) > len(heredocMarker) && heredocMarker == string(val[len(val)-len(heredocMarker):]) {
+				// set the final value
+				val, err = l.finalizeHeredoc(val, heredocMarker)
+				if err != nil {
+					return false, err
+				}
+
+				// set the line counter, and make the token
+				l.line += l.skippedLines
+				l.skippedLines = 0
+				return makeToken('<'), nil
 			}
-			panic(err)
+
+			// stay in the heredoc until we find the ending marker
+			continue
 		}
 
+		// track whether we found an escape '\' for the next
+		// iteration to be contextually aware
 		if !escaped && !btQuoted && ch == '\\' {
 			escaped = true
 			continue
@@ -111,26 +219,29 @@ func (l *lexer) next() bool {
 				}
 				escaped = false
 			} else {
-				if quoted && ch == '"' {
-					return makeToken('"')
-				}
-				if btQuoted && ch == '`' {
-					return makeToken('`')
+				if (quoted && ch == '"') || (btQuoted && ch == '`') {
+					return makeToken(ch), nil
 				}
 			}
+			// allow quoted text to continue across multiple lines
 			if ch == '\n' {
 				l.line += 1 + l.skippedLines
 				l.skippedLines = 0
 			}
+			// collect this character as part of the quoted token
 			val = append(val, ch)
 			continue
 		}
 
 		if unicode.IsSpace(ch) {
+			// ignore CR altogether, we only actually care about LF (\n)
 			if ch == '\r' {
 				continue
 			}
+			// end of the line
 			if ch == '\n' {
+				// newlines can be escaped to chain arguments
+				// onto multiple lines; else, increment the line count
 				if escaped {
 					l.skippedLines++
 					escaped = false
@@ -138,14 +249,18 @@ func (l *lexer) next() bool {
 					l.line += 1 + l.skippedLines
 					l.skippedLines = 0
 				}
+				// comments (#) are single-line only
 				comment = false
 			}
+			// any kind of space means we're at the end of this token
 			if len(val) > 0 {
-				return makeToken(0)
+				return makeToken(0), nil
 			}
 			continue
 		}
 
+		// comments must be at the start of a token,
+		// in other words, preceded by space or newline
 		if ch == '#' && len(val) == 0 {
 			comment = true
 		}
@@ -166,7 +281,12 @@ func (l *lexer) next() bool {
 		}
 
 		if escaped {
-			val = append(val, '\\')
+			// allow escaping the first < to skip the heredoc syntax
+			if ch == '<' {
+				heredocEscaped = true
+			} else {
+				val = append(val, '\\')
+			}
 			escaped = false
 		}
 
@@ -174,24 +294,86 @@ func (l *lexer) next() bool {
 	}
 }
 
-// Tokenize takes bytes as input and lexes it into
-// a list of tokens that can be parsed as a Caddyfile.
-// Also takes a filename to fill the token's File as
-// the source of the tokens, which is important to
-// determine relative paths for `import` directives.
-func Tokenize(input []byte, filename string) ([]Token, error) {
-	l := lexer{}
-	if err := l.load(bytes.NewReader(input)); err != nil {
-		return nil, err
+// finalizeHeredoc takes the runes read as the heredoc text (ending with the
+// closing marker) and strips from each line the padding that precedes the
+// closing marker; it errors if a line does not start with that padding.
+func (l *lexer) finalizeHeredoc(val []rune, marker string) ([]rune, error) {
+	stringVal := string(val)
+
+	// find the last newline of the heredoc, which is where the contents end
+	lastNewline := strings.LastIndex(stringVal, "\n")
+
+	// keep the content up to and including the last newline, then split into lines
+	lines := strings.Split(stringVal[:lastNewline+1], "\n")
+
+	// figure out how much padding we need to strip from the front of every line
+	// by getting the string that precedes the marker, on the last line
+	paddingToStrip := stringVal[lastNewline+1 : len(stringVal)-len(marker)]
+
+	// strip the padding from each line; the last split element is always empty, so skip it
+	var out string
+	for lineNum, lineText := range lines[:len(lines)-1] {
+		// find the first occurrence of the padding in this line
+		index := strings.Index(lineText, paddingToStrip)
+
+		// if the padding doesn't match exactly at the start then we can't safely strip
+		if index != 0 {
+			return nil, fmt.Errorf("mismatched leading whitespace in heredoc <<%s on line #%d [%s], expected whitespace [%s] to match the closing marker", marker, l.line+lineNum+1, lineText, paddingToStrip)
+		}
+
+		// strip, then append the line, with the newline, to the output.
+		// also removes all "\r" so that CRLF input yields LF-only text.
+		out += strings.ReplaceAll(lineText[len(paddingToStrip):]+"\n", "\r", "")
 	}
-	var tokens []Token
-	for l.next() {
-		l.token.File = filename
-		tokens = append(tokens, l.token)
+
+	// remove the trailing newline that the loop appended after the last line
+	if len(out) > 0 && out[len(out)-1] == '\n' {
+		out = out[:len(out)-1]
 	}
-	return tokens, nil
+
+	// return the final value, converted back to runes
+	return []rune(out), nil
 }
 
 func (t Token) Quoted() bool {
 	return t.wasQuoted > 0
 }
+
+// NumLineBreaks counts the line breaks in the token text, plus two if the token is a heredoc.
+func (t Token) NumLineBreaks() int {
+	lineBreaks := strings.Count(t.Text, "\n")
+	if t.wasQuoted == '<' {
+		// heredocs have two extra line breaks: the opening
+		// delimiter is on its own line and is not included in the
+		// token Text itself, and the trailing newline is removed.
+		lineBreaks += 2
+	}
+	return lineBreaks
+}
+
+var heredocMarkerRegexp = regexp.MustCompile("^[A-Za-z0-9_-]+$") // non-empty; ASCII letters, digits, underscores and dashes only
+
+// isNextOnNewLine tests whether t2 is on a different line from where t1 ends.
+func isNextOnNewLine(t1, t2 Token) bool {
+	// If the second token is from a different file,
+	// we can assume it's from a different line
+	if t1.File != t2.File {
+		return true
+	}
+
+	// If the second token is from a different import chain,
+	// we can assume it's from a different line
+	if len(t1.imports) != len(t2.imports) {
+		return true
+	}
+	for i, im := range t1.imports { // lengths are equal here, so t2.imports[i] is in range
+		if im != t2.imports[i] {
+			return true
+		}
+	}
+
+	// If the first token (including the line breaks
+	// it spans) ends on a line earlier than the next
+	// token's line, then the second token is on a new line
+	return t1.Line+t1.NumLineBreaks() < t2.Line
+}
author	Tom Barrett <tom@tombarrett.xyz>	2023-11-01 17:57:48 +0100
committer	Tom Barrett <tom@tombarrett.xyz>	2023-11-01 18:11:33 +0100
commit	240c3d1338415e5d82ef7ca0e52c4284be6441bd (patch)
tree	4b0ee5d208c2cdffa78d65f1b0abe0ec85f15652 /caddyconfig/caddyfile/lexer.go
parent	73e78ab226f21e6c6c68961af88c4ab9c746f4f4 (diff)
parent	0e204b730aa2b1fa0835336b1117eff8c420f713 (diff)