diff options
author | Tom Barrett <tom@tombarrett.xyz> | 2023-11-01 17:57:48 +0100 |
---|---|---|
committer | Tom Barrett <tom@tombarrett.xyz> | 2023-11-01 18:11:33 +0100 |
commit | 240c3d1338415e5d82ef7ca0e52c4284be6441bd (patch) | |
tree | 4b0ee5d208c2cdffa78d65f1b0abe0ec85f15652 /caddyconfig/caddyfile/lexer.go | |
parent | 73e78ab226f21e6c6c68961af88c4ab9c746f4f4 (diff) | |
parent | 0e204b730aa2b1fa0835336b1117eff8c420f713 (diff) |
Diffstat (limited to 'caddyconfig/caddyfile/lexer.go')
-rw-r--r-- | caddyconfig/caddyfile/lexer.go | 246 |
1 files changed, 214 insertions, 32 deletions
diff --git a/caddyconfig/caddyfile/lexer.go b/caddyconfig/caddyfile/lexer.go index 5605a6a..bfd6c0f 100644 --- a/caddyconfig/caddyfile/lexer.go +++ b/caddyconfig/caddyfile/lexer.go @@ -17,7 +17,10 @@ package caddyfile import ( "bufio" "bytes" + "fmt" "io" + "regexp" + "strings" "unicode" ) @@ -35,15 +38,41 @@ type ( // Token represents a single parsable unit. Token struct { - File string - Line int - Text string - wasQuoted rune // enclosing quote character, if any - inSnippet bool - snippetName string + File string + imports []string + Line int + Text string + wasQuoted rune // enclosing quote character, if any + heredocMarker string + snippetName string } ) +// Tokenize takes bytes as input and lexes it into +// a list of tokens that can be parsed as a Caddyfile. +// Also takes a filename to fill the token's File as +// the source of the tokens, which is important to +// determine relative paths for `import` directives. +func Tokenize(input []byte, filename string) ([]Token, error) { + l := lexer{} + if err := l.load(bytes.NewReader(input)); err != nil { + return nil, err + } + var tokens []Token + for { + found, err := l.next() + if err != nil { + return nil, err + } + if !found { + break + } + l.token.File = filename + tokens = append(tokens, l.token) + } + return tokens, nil +} + // load prepares the lexer to scan an input for tokens. // It discards any leading byte order mark. func (l *lexer) load(input io.Reader) error { @@ -75,28 +104,107 @@ func (l *lexer) load(input io.Reader) error { // may be escaped. The rest of the line is skipped // if a "#" character is read in. Returns true if // a token was loaded; false otherwise. -func (l *lexer) next() bool { +func (l *lexer) next() (bool, error) { var val []rune - var comment, quoted, btQuoted, escaped bool + var comment, quoted, btQuoted, inHeredoc, heredocEscaped, escaped bool + var heredocMarker string makeToken := func(quoted rune) bool { l.token.Text = string(val) l.token.wasQuoted = quoted + l.token.heredocMarker = heredocMarker return true } for { + // Read a character in; if err then if we had + // read some characters, make a token. If we + // reached EOF, then no more tokens to read. + // If no EOF, then we had a problem. ch, _, err := l.reader.ReadRune() if err != nil { if len(val) > 0 { - return makeToken(0) + if inHeredoc { + return false, fmt.Errorf("incomplete heredoc <<%s on line #%d, expected ending marker %s", heredocMarker, l.line+l.skippedLines, heredocMarker) + } + + return makeToken(0), nil } if err == io.EOF { - return false + return false, nil + } + return false, err + } + + // detect whether we have the start of a heredoc + if !(quoted || btQuoted) && !(inHeredoc || heredocEscaped) && + len(val) > 1 && string(val[:2]) == "<<" { + // a space means it's just a regular token and not a heredoc + if ch == ' ' { + return makeToken(0), nil + } + + // skip CR, we only care about LF + if ch == '\r' { + continue + } + + // after hitting a newline, we know that the heredoc marker + // is the characters after the two << and the newline. + // we reset the val because the heredoc is syntax we don't + // want to keep. + if ch == '\n' { + if len(val) == 2 { + return false, fmt.Errorf("missing opening heredoc marker on line #%d; must contain only alpha-numeric characters, dashes and underscores; got empty string", l.line) + } + + // check if there's too many < + if string(val[:3]) == "<<<" { + return false, fmt.Errorf("too many '<' for heredoc on line #%d; only use two, for example <<END", l.line) + } + + heredocMarker = string(val[2:]) + if !heredocMarkerRegexp.Match([]byte(heredocMarker)) { + return false, fmt.Errorf("heredoc marker on line #%d must contain only alpha-numeric characters, dashes and underscores; got '%s'", l.line, heredocMarker) + } + + inHeredoc = true + l.skippedLines++ + val = nil + continue + } + val = append(val, ch) + continue + } + + // if we're in a heredoc, all characters are read as-is + if inHeredoc { + val = append(val, ch) + + if ch == '\n' { + l.skippedLines++ + } + + // check if we're done, i.e. that the last few characters are the marker + if len(val) > len(heredocMarker) && heredocMarker == string(val[len(val)-len(heredocMarker):]) { + // set the final value + val, err = l.finalizeHeredoc(val, heredocMarker) + if err != nil { + return false, err + } + + // set the line counter, and make the token + l.line += l.skippedLines + l.skippedLines = 0 + return makeToken('<'), nil } - panic(err) + + // stay in the heredoc until we find the ending marker + continue } + // track whether we found an escape '\' for the next + // iteration to be contextually aware if !escaped && !btQuoted && ch == '\\' { escaped = true continue @@ -111,26 +219,29 @@ func (l *lexer) next() bool { } escaped = false } else { - if quoted && ch == '"' { - return makeToken('"') - } - if btQuoted && ch == '`' { - return makeToken('`') + if (quoted && ch == '"') || (btQuoted && ch == '`') { + return makeToken(ch), nil } } + // allow quoted text to wrap continue on multiple lines if ch == '\n' { l.line += 1 + l.skippedLines l.skippedLines = 0 } + // collect this character as part of the quoted token val = append(val, ch) continue } if unicode.IsSpace(ch) { + // ignore CR altogether, we only actually care about LF (\n) if ch == '\r' { continue } + // end of the line if ch == '\n' { + // newlines can be escaped to chain arguments + // onto multiple lines; else, increment the line count if escaped { l.skippedLines++ escaped = false @@ -138,14 +249,18 @@ func (l *lexer) next() bool { l.line += 1 + l.skippedLines l.skippedLines = 0 } + // comments (#) are single-line only comment = false } + // any kind of space means we're at the end of this token if len(val) > 0 { - return makeToken(0) + return makeToken(0), nil } continue } + // comments must be at the start of a token, + // in other words, preceded by space or newline if ch == '#' && len(val) == 0 { comment = true } @@ -166,7 +281,12 @@ func (l *lexer) next() bool { } if escaped { - val = append(val, '\\') + // allow escaping the first < to skip the heredoc syntax + if ch == '<' { + heredocEscaped = true + } else { + val = append(val, '\\') + } escaped = false } @@ -174,24 +294,86 @@ func (l *lexer) next() bool { } } -// Tokenize takes bytes as input and lexes it into -// a list of tokens that can be parsed as a Caddyfile. -// Also takes a filename to fill the token's File as -// the source of the tokens, which is important to -// determine relative paths for `import` directives. -func Tokenize(input []byte, filename string) ([]Token, error) { - l := lexer{} - if err := l.load(bytes.NewReader(input)); err != nil { - return nil, err +// finalizeHeredoc takes the runes read as the heredoc text and the marker, +// and processes the text to strip leading whitespace, returning the final +// value without the leading whitespace. +func (l *lexer) finalizeHeredoc(val []rune, marker string) ([]rune, error) { + stringVal := string(val) + + // find the last newline of the heredoc, which is where the contents end + lastNewline := strings.LastIndex(stringVal, "\n") + + // collapse the content, then split into separate lines + lines := strings.Split(stringVal[:lastNewline+1], "\n") + + // figure out how much whitespace we need to strip from the front of every line + // by getting the string that precedes the marker, on the last line + paddingToStrip := stringVal[lastNewline+1 : len(stringVal)-len(marker)] + + // iterate over each line and strip the whitespace from the front + var out string + for lineNum, lineText := range lines[:len(lines)-1] { + // find an exact match for the padding + index := strings.Index(lineText, paddingToStrip) + + // if the padding doesn't match exactly at the start then we can't safely strip + if index != 0 { + return nil, fmt.Errorf("mismatched leading whitespace in heredoc <<%s on line #%d [%s], expected whitespace [%s] to match the closing marker", marker, l.line+lineNum+1, lineText, paddingToStrip) + } + + // strip, then append the line, with the newline, to the output. + // also removes all "\r" because Windows. + out += strings.ReplaceAll(lineText[len(paddingToStrip):]+"\n", "\r", "") } - var tokens []Token - for l.next() { - l.token.File = filename - tokens = append(tokens, l.token) + + // Remove the trailing newline from the loop + if len(out) > 0 && out[len(out)-1] == '\n' { + out = out[:len(out)-1] } - return tokens, nil + + // return the final value + return []rune(out), nil } func (t Token) Quoted() bool { return t.wasQuoted > 0 } + +// NumLineBreaks counts how many line breaks are in the token text. +func (t Token) NumLineBreaks() int { + lineBreaks := strings.Count(t.Text, "\n") + if t.wasQuoted == '<' { + // heredocs have an extra linebreak because the opening + // delimiter is on its own line and is not included in the + // token Text itself, and the trailing newline is removed. + lineBreaks += 2 + } + return lineBreaks +} + +var heredocMarkerRegexp = regexp.MustCompile("^[A-Za-z0-9_-]+$") + +// isNextOnNewLine tests whether t2 is on a different line from t1 +func isNextOnNewLine(t1, t2 Token) bool { + // If the second token is from a different file, + // we can assume it's from a different line + if t1.File != t2.File { + return true + } + + // If the second token is from a different import chain, + // we can assume it's from a different line + if len(t1.imports) != len(t2.imports) { + return true + } + for i, im := range t1.imports { + if im != t2.imports[i] { + return true + } + } + + // If the first token (incl line breaks) ends + // on a line earlier than the next token, + // then the second token is on a new line + return t1.Line+t1.NumLineBreaks() < t2.Line +} |