handlebars-go/lexer/lexer.go

640 lines
15 KiB
Go
Raw Normal View History

2023-09-10 10:55:27 +00:00
// Package lexer provides a handlebars tokenizer.
package lexer
import (
"fmt"
"regexp"
"strings"
"unicode"
"unicode/utf8"
)
// References:
// - https://github.com/wycats/handlebars.js/blob/master/src/handlebars.l
// - https://github.com/golang/go/blob/master/src/text/template/parse/lex.go
const (
// Mustaches detection
escapedEscapedOpenMustache = "\\\\{{"
escapedOpenMustache = "\\{{"
openMustache = "{{"
closeMustache = "}}"
closeStripMustache = "~}}"
closeUnescapedStripMustache = "}~}}"
)
const eof = -1
// lexFunc represents a function that returns the next lexer function.
type lexFunc func(*Lexer) lexFunc
// Lexer is a lexical analyzer.
type Lexer struct {
input string // input to scan
name string // lexer name, used for testing purpose
tokens chan Token // channel of scanned tokens
nextFunc lexFunc // the next function to execute
pos int // current byte position in input string
line int // current line position in input string
width int // size of last rune scanned from input string
start int // start position of the token we are scanning
// the shameful contextual properties needed because `nextFunc` is not enough
closeComment *regexp.Regexp // regexp to scan close of current comment
rawBlock bool // are we parsing a raw block content ?
}
var (
lookheadChars = `[\s` + regexp.QuoteMeta("=~}/)|") + `]`
literalLookheadChars = `[\s` + regexp.QuoteMeta("~})") + `]`
// characters not allowed in an identifier
unallowedIDChars = " \n\t!\"#%&'()*+,./;<=>@[\\]^`{|}~"
// regular expressions
rID = regexp.MustCompile(`^[^` + regexp.QuoteMeta(unallowedIDChars) + `]+`)
rDotID = regexp.MustCompile(`^\.` + lookheadChars)
rTrue = regexp.MustCompile(`^true` + literalLookheadChars)
rFalse = regexp.MustCompile(`^false` + literalLookheadChars)
rOpenRaw = regexp.MustCompile(`^\{\{\{\{`)
rCloseRaw = regexp.MustCompile(`^\}\}\}\}`)
rOpenEndRaw = regexp.MustCompile(`^\{\{\{\{/`)
rOpenEndRawLookAhead = regexp.MustCompile(`\{\{\{\{/`)
rOpenUnescaped = regexp.MustCompile(`^\{\{~?\{`)
rCloseUnescaped = regexp.MustCompile(`^\}~?\}\}`)
rOpenBlock = regexp.MustCompile(`^\{\{~?#`)
rOpenEndBlock = regexp.MustCompile(`^\{\{~?/`)
rOpenPartial = regexp.MustCompile(`^\{\{~?>`)
// {{^}} or {{else}}
rInverse = regexp.MustCompile(`^(\{\{~?\^\s*~?\}\}|\{\{~?\s*else\s*~?\}\})`)
rOpenInverse = regexp.MustCompile(`^\{\{~?\^`)
rOpenInverseChain = regexp.MustCompile(`^\{\{~?\s*else`)
// {{ or {{&
rOpen = regexp.MustCompile(`^\{\{~?&?`)
rClose = regexp.MustCompile(`^~?\}\}`)
rOpenBlockParams = regexp.MustCompile(`^as\s+\|`)
// {{!-- ... --}}
rOpenCommentDash = regexp.MustCompile(`^\{\{~?!--\s*`)
rCloseCommentDash = regexp.MustCompile(`^\s*--~?\}\}`)
// {{! ... }}
rOpenComment = regexp.MustCompile(`^\{\{~?!\s*`)
rCloseComment = regexp.MustCompile(`^\s*~?\}\}`)
)
// Scan scans given input.
//
// Tokens can then be fetched sequentially thanks to NextToken() function on returned lexer.
func Scan(input string) *Lexer {
return scanWithName(input, "")
}
// scanWithName scans given input, with a name used for testing
//
// Tokens can then be fetched sequentially thanks to NextToken() function on returned lexer.
func scanWithName(input string, name string) *Lexer {
result := &Lexer{
input: input,
name: name,
tokens: make(chan Token),
line: 1,
}
go result.run()
return result
}
// Collect scans and collect all tokens.
//
// This should be used for debugging purpose only. You should use Scan() and lexer.NextToken() functions instead.
func Collect(input string) []Token {
var result []Token
l := Scan(input)
for {
token := l.NextToken()
result = append(result, token)
if token.Kind == TokenEOF || token.Kind == TokenError {
break
}
}
return result
}
// NextToken returns the next scanned token.
func (l *Lexer) NextToken() Token {
result := <-l.tokens
return result
}
// run starts lexical analysis
func (l *Lexer) run() {
for l.nextFunc = lexContent; l.nextFunc != nil; {
l.nextFunc = l.nextFunc(l)
}
}
// next returns next character from input, or eof of there is nothing left to scan
func (l *Lexer) next() rune {
if l.pos >= len(l.input) {
l.width = 0
return eof
}
r, w := utf8.DecodeRuneInString(l.input[l.pos:])
l.width = w
l.pos += l.width
return r
}
func (l *Lexer) produce(kind TokenKind, val string) {
l.tokens <- Token{kind, val, l.start, l.line}
// scanning a new token
l.start = l.pos
// update line number
l.line += strings.Count(val, "\n")
}
// emit emits a new scanned token
func (l *Lexer) emit(kind TokenKind) {
l.produce(kind, l.input[l.start:l.pos])
}
// emitContent emits scanned content
func (l *Lexer) emitContent() {
if l.pos > l.start {
l.emit(TokenContent)
}
}
// emitString emits a scanned string
func (l *Lexer) emitString(delimiter rune) {
str := l.input[l.start:l.pos]
// replace escaped delimiters
str = strings.Replace(str, "\\"+string(delimiter), string(delimiter), -1)
l.produce(TokenString, str)
}
// peek returns but does not consume the next character in the input
func (l *Lexer) peek() rune {
r := l.next()
l.backup()
return r
}
// backup steps back one character
//
// WARNING: Can only be called once per call of next
func (l *Lexer) backup() {
l.pos -= l.width
}
// ignoreskips all characters that have been scanned up to current position
func (l *Lexer) ignore() {
l.start = l.pos
}
// accept scans the next character if it is included in given string
func (l *Lexer) accept(valid string) bool {
if strings.IndexRune(valid, l.next()) >= 0 {
return true
}
l.backup()
return false
}
// acceptRun scans all following characters that are part of given string
func (l *Lexer) acceptRun(valid string) {
for strings.IndexRune(valid, l.next()) >= 0 {
}
l.backup()
}
// errorf emits an error token
func (l *Lexer) errorf(format string, args ...interface{}) lexFunc {
l.tokens <- Token{TokenError, fmt.Sprintf(format, args...), l.start, l.line}
return nil
}
// isString returns true if content at current scanning position starts with given string
func (l *Lexer) isString(str string) bool {
return strings.HasPrefix(l.input[l.pos:], str)
}
// findRegexp returns the first string from current scanning position that matches given regular expression
func (l *Lexer) findRegexp(r *regexp.Regexp) string {
return r.FindString(l.input[l.pos:])
}
// indexRegexp returns the index of the first string from current scanning position that matches given regular expression
//
// It returns -1 if not found
func (l *Lexer) indexRegexp(r *regexp.Regexp) int {
loc := r.FindStringIndex(l.input[l.pos:])
if loc == nil {
return -1
}
return loc[0]
}
// lexContent scans content (ie: not between mustaches)
func lexContent(l *Lexer) lexFunc {
var next lexFunc
if l.rawBlock {
if i := l.indexRegexp(rOpenEndRawLookAhead); i != -1 {
// {{{{/
l.rawBlock = false
l.pos += i
next = lexOpenMustache
} else {
return l.errorf("Unclosed raw block")
}
} else if l.isString(escapedEscapedOpenMustache) {
// \\{{
// emit content with only one escaped escape
l.next()
l.emitContent()
// ignore second escaped escape
l.next()
l.ignore()
next = lexContent
} else if l.isString(escapedOpenMustache) {
// \{{
next = lexEscapedOpenMustache
} else if str := l.findRegexp(rOpenCommentDash); str != "" {
// {{!--
l.closeComment = rCloseCommentDash
next = lexComment
} else if str := l.findRegexp(rOpenComment); str != "" {
// {{!
l.closeComment = rCloseComment
next = lexComment
} else if l.isString(openMustache) {
// {{
next = lexOpenMustache
}
if next != nil {
// emit scanned content
l.emitContent()
// scan next token
return next
}
// scan next rune
if l.next() == eof {
// emit scanned content
l.emitContent()
// this is over
l.emit(TokenEOF)
return nil
}
// continue content scanning
return lexContent
}
// lexEscapedOpenMustache scans \{{
func lexEscapedOpenMustache(l *Lexer) lexFunc {
// ignore escape character
l.next()
l.ignore()
// scan mustaches
for l.peek() == '{' {
l.next()
}
return lexContent
}
// lexOpenMustache scans {{
func lexOpenMustache(l *Lexer) lexFunc {
var str string
var tok TokenKind
nextFunc := lexExpression
if str = l.findRegexp(rOpenEndRaw); str != "" {
tok = TokenOpenEndRawBlock
} else if str = l.findRegexp(rOpenRaw); str != "" {
tok = TokenOpenRawBlock
l.rawBlock = true
} else if str = l.findRegexp(rOpenUnescaped); str != "" {
tok = TokenOpenUnescaped
} else if str = l.findRegexp(rOpenBlock); str != "" {
tok = TokenOpenBlock
} else if str = l.findRegexp(rOpenEndBlock); str != "" {
tok = TokenOpenEndBlock
} else if str = l.findRegexp(rOpenPartial); str != "" {
tok = TokenOpenPartial
} else if str = l.findRegexp(rInverse); str != "" {
tok = TokenInverse
nextFunc = lexContent
} else if str = l.findRegexp(rOpenInverse); str != "" {
tok = TokenOpenInverse
} else if str = l.findRegexp(rOpenInverseChain); str != "" {
tok = TokenOpenInverseChain
} else if str = l.findRegexp(rOpen); str != "" {
tok = TokenOpen
} else {
// this is rotten
panic("Current pos MUST be an opening mustache")
}
l.pos += len(str)
l.emit(tok)
return nextFunc
}
// lexCloseMustache scans }} or ~}}
func lexCloseMustache(l *Lexer) lexFunc {
var str string
var tok TokenKind
if str = l.findRegexp(rCloseRaw); str != "" {
// }}}}
tok = TokenCloseRawBlock
} else if str = l.findRegexp(rCloseUnescaped); str != "" {
// }}}
tok = TokenCloseUnescaped
} else if str = l.findRegexp(rClose); str != "" {
// }}
tok = TokenClose
} else {
// this is rotten
panic("Current pos MUST be a closing mustache")
}
l.pos += len(str)
l.emit(tok)
return lexContent
}
// lexExpression scans inside mustaches
func lexExpression(l *Lexer) lexFunc {
// search close mustache delimiter
if l.isString(closeMustache) || l.isString(closeStripMustache) || l.isString(closeUnescapedStripMustache) {
return lexCloseMustache
}
// search some patterns before advancing scanning position
// "as |"
if str := l.findRegexp(rOpenBlockParams); str != "" {
l.pos += len(str)
l.emit(TokenOpenBlockParams)
return lexExpression
}
// ..
if l.isString("..") {
l.pos += len("..")
l.emit(TokenID)
return lexExpression
}
// .
if str := l.findRegexp(rDotID); str != "" {
l.pos += len(".")
l.emit(TokenID)
return lexExpression
}
// true
if str := l.findRegexp(rTrue); str != "" {
l.pos += len("true")
l.emit(TokenBoolean)
return lexExpression
}
// false
if str := l.findRegexp(rFalse); str != "" {
l.pos += len("false")
l.emit(TokenBoolean)
return lexExpression
}
// let's scan next character
switch r := l.next(); {
case r == eof:
return l.errorf("Unclosed expression")
case isIgnorable(r):
return lexIgnorable
case r == '(':
l.emit(TokenOpenSexpr)
case r == ')':
l.emit(TokenCloseSexpr)
case r == '=':
l.emit(TokenEquals)
case r == '@':
l.emit(TokenData)
case r == '"' || r == '\'':
l.backup()
return lexString
case r == '/' || r == '.':
l.emit(TokenSep)
case r == '|':
l.emit(TokenCloseBlockParams)
case r == '+' || r == '-' || (r >= '0' && r <= '9'):
l.backup()
return lexNumber
case r == '[':
return lexPathLiteral
case strings.IndexRune(unallowedIDChars, r) < 0:
l.backup()
return lexIdentifier
default:
return l.errorf("Unexpected character in expression: '%c'", r)
}
return lexExpression
}
// lexComment scans {{!-- or {{!
func lexComment(l *Lexer) lexFunc {
if str := l.findRegexp(l.closeComment); str != "" {
l.pos += len(str)
l.emit(TokenComment)
return lexContent
}
if r := l.next(); r == eof {
return l.errorf("Unclosed comment")
}
return lexComment
}
// lexIgnorable scans all following ignorable characters
func lexIgnorable(l *Lexer) lexFunc {
for isIgnorable(l.peek()) {
l.next()
}
l.ignore()
return lexExpression
}
// lexString scans a string
func lexString(l *Lexer) lexFunc {
// get string delimiter
delim := l.next()
var prev rune
// ignore delimiter
l.ignore()
for {
r := l.next()
if r == eof || r == '\n' {
return l.errorf("Unterminated string")
}
if (r == delim) && (prev != '\\') {
break
}
prev = r
}
// remove end delimiter
l.backup()
// emit string
l.emitString(delim)
// skip end delimiter
l.next()
l.ignore()
return lexExpression
}
// lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
// isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
// and "089" - but when it's wrong the input is invalid and the parser (via
// strconv) will notice.
//
// NOTE: borrowed from https://github.com/golang/go/tree/master/src/text/template/parse/lex.go
func lexNumber(l *Lexer) lexFunc {
if !l.scanNumber() {
return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
}
if sign := l.peek(); sign == '+' || sign == '-' {
// Complex: 1+2i. No spaces, must end in 'i'.
if !l.scanNumber() || l.input[l.pos-1] != 'i' {
return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
}
l.emit(TokenNumber)
} else {
l.emit(TokenNumber)
}
return lexExpression
}
// scanNumber scans a number
//
// NOTE: borrowed from https://github.com/golang/go/tree/master/src/text/template/parse/lex.go
func (l *Lexer) scanNumber() bool {
// Optional leading sign.
l.accept("+-")
// Is it hex?
digits := "0123456789"
if l.accept("0") && l.accept("xX") {
digits = "0123456789abcdefABCDEF"
}
l.acceptRun(digits)
if l.accept(".") {
l.acceptRun(digits)
}
if l.accept("eE") {
l.accept("+-")
l.acceptRun("0123456789")
}
// Is it imaginary?
l.accept("i")
// Next thing mustn't be alphanumeric.
if isAlphaNumeric(l.peek()) {
l.next()
return false
}
return true
}
// lexIdentifier scans an ID
func lexIdentifier(l *Lexer) lexFunc {
str := l.findRegexp(rID)
if len(str) == 0 {
// this is rotten
panic("Identifier expected")
}
l.pos += len(str)
l.emit(TokenID)
return lexExpression
}
// lexPathLiteral scans an [ID]
func lexPathLiteral(l *Lexer) lexFunc {
for {
r := l.next()
if r == eof || r == '\n' {
return l.errorf("Unterminated path literal")
}
if r == ']' {
break
}
}
l.emit(TokenID)
return lexExpression
}
// isIgnorable returns true if given character is ignorable (ie. whitespace of line feed)
func isIgnorable(r rune) bool {
return r == ' ' || r == '\t' || r == '\n'
}
// isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
//
// NOTE borrowed from https://github.com/golang/go/tree/master/src/text/template/parse/lex.go
func isAlphaNumeric(r rune) bool {
return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
}