640 lines
15 KiB
Go
640 lines
15 KiB
Go
// Package lexer provides a handlebars tokenizer.
|
|
package lexer
|
|
|
|
import (
|
|
"fmt"
|
|
"regexp"
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// References:
|
|
// - https://github.com/wycats/handlebars.js/blob/master/src/handlebars.l
|
|
// - https://github.com/golang/go/blob/master/src/text/template/parse/lex.go
|
|
|
|
// Literal mustache delimiters, compared against the input with plain prefix checks.
const (
	// a backslash, itself escaped by a backslash, in front of an opening mustache: \\{{
	escapedEscapedOpenMustache = "\\\\{{"

	// an opening mustache escaped by a single backslash: \{{
	escapedOpenMustache = "\\{{"

	// a regular opening mustache
	openMustache = "{{"

	// a regular closing mustache
	closeMustache = "}}"

	// a closing mustache preceded by the strip marker
	closeStripMustache = "~}}"

	// the closing of an unescaped expression that carries a strip marker
	closeUnescapedStripMustache = "}~}}"
)
|
|
|
|
// eof is the value returned by Lexer.next once the whole input has been consumed.
const eof = -1
|
|
|
|
// lexFunc represents a function that returns the next lexer function.
|
|
type lexFunc func(*Lexer) lexFunc
|
|
|
|
// Lexer is a lexical analyzer.
|
|
type Lexer struct {
|
|
input string // input to scan
|
|
name string // lexer name, used for testing purpose
|
|
tokens chan Token // channel of scanned tokens
|
|
nextFunc lexFunc // the next function to execute
|
|
|
|
pos int // current byte position in input string
|
|
line int // current line position in input string
|
|
width int // size of last rune scanned from input string
|
|
start int // start position of the token we are scanning
|
|
|
|
// the shameful contextual properties needed because `nextFunc` is not enough
|
|
closeComment *regexp.Regexp // regexp to scan close of current comment
|
|
rawBlock bool // are we parsing a raw block content ?
|
|
}
|
|
|
|
// Character classes used to build the regular expressions below.
var (
	// characters that may follow a lone "." identifier
	lookheadChars = `[\s` + regexp.QuoteMeta("=~}/)|") + `]`

	// characters that may follow a boolean literal
	literalLookheadChars = `[\s` + regexp.QuoteMeta("~})") + `]`

	// characters not allowed in an identifier
	unallowedIDChars = " \n\t!\"#%&'()*+,./;<=>@[\\]^`{|}~"
)

// Patterns recognized inside an expression.
var (
	// the longest run of characters allowed in an identifier
	rID = regexp.MustCompile(`^[^` + regexp.QuoteMeta(unallowedIDChars) + `]+`)

	// a single dot followed by a look-ahead character
	rDotID = regexp.MustCompile(`^\.` + lookheadChars)

	// boolean literals followed by a look-ahead character
	rTrue  = regexp.MustCompile(`^true` + literalLookheadChars)
	rFalse = regexp.MustCompile(`^false` + literalLookheadChars)

	// "as |", the opening of block parameters
	rOpenBlockParams = regexp.MustCompile(`^as\s+\|`)
)

// Patterns recognizing mustache delimiters; except for rOpenEndRawLookAhead
// they are all anchored to the current scanning position.
var (
	// raw blocks: {{{{ , }}}} and {{{{/
	rOpenRaw    = regexp.MustCompile(`^\{\{\{\{`)
	rCloseRaw   = regexp.MustCompile(`^\}\}\}\}`)
	rOpenEndRaw = regexp.MustCompile(`^\{\{\{\{/`)

	// not anchored: searches {{{{/ anywhere in the remaining input
	rOpenEndRawLookAhead = regexp.MustCompile(`\{\{\{\{/`)

	// unescaped expressions: {{{ and }}}
	rOpenUnescaped  = regexp.MustCompile(`^\{\{~?\{`)
	rCloseUnescaped = regexp.MustCompile(`^\}~?\}\}`)

	// {{# , {{/ and {{>
	rOpenBlock    = regexp.MustCompile(`^\{\{~?#`)
	rOpenEndBlock = regexp.MustCompile(`^\{\{~?/`)
	rOpenPartial  = regexp.MustCompile(`^\{\{~?>`)

	// a complete {{^}} or {{else}}
	rInverse = regexp.MustCompile(`^(\{\{~?\^\s*~?\}\}|\{\{~?\s*else\s*~?\}\})`)

	// {{^ and {{else when more follows before the closing mustache
	rOpenInverse      = regexp.MustCompile(`^\{\{~?\^`)
	rOpenInverseChain = regexp.MustCompile(`^\{\{~?\s*else`)

	// {{ or {{& , and }}
	rOpen  = regexp.MustCompile(`^\{\{~?&?`)
	rClose = regexp.MustCompile(`^~?\}\}`)
)

// Patterns delimiting comments.
var (
	// {{!-- ... --}}
	rOpenCommentDash  = regexp.MustCompile(`^\{\{~?!--\s*`)
	rCloseCommentDash = regexp.MustCompile(`^\s*--~?\}\}`)

	// {{! ... }}
	rOpenComment  = regexp.MustCompile(`^\{\{~?!\s*`)
	rCloseComment = regexp.MustCompile(`^\s*~?\}\}`)
)
|
|
|
|
// Scan scans given input.
|
|
//
|
|
// Tokens can then be fetched sequentially thanks to NextToken() function on returned lexer.
|
|
func Scan(input string) *Lexer {
|
|
return scanWithName(input, "")
|
|
}
|
|
|
|
// scanWithName scans given input, with a name used for testing
|
|
//
|
|
// Tokens can then be fetched sequentially thanks to NextToken() function on returned lexer.
|
|
func scanWithName(input string, name string) *Lexer {
|
|
result := &Lexer{
|
|
input: input,
|
|
name: name,
|
|
tokens: make(chan Token),
|
|
line: 1,
|
|
}
|
|
|
|
go result.run()
|
|
|
|
return result
|
|
}
|
|
|
|
// Collect scans and collect all tokens.
|
|
//
|
|
// This should be used for debugging purpose only. You should use Scan() and lexer.NextToken() functions instead.
|
|
func Collect(input string) []Token {
|
|
var result []Token
|
|
|
|
l := Scan(input)
|
|
for {
|
|
token := l.NextToken()
|
|
result = append(result, token)
|
|
|
|
if token.Kind == TokenEOF || token.Kind == TokenError {
|
|
break
|
|
}
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// NextToken returns the next scanned token.
|
|
func (l *Lexer) NextToken() Token {
|
|
result := <-l.tokens
|
|
|
|
return result
|
|
}
|
|
|
|
// run starts lexical analysis
|
|
func (l *Lexer) run() {
|
|
for l.nextFunc = lexContent; l.nextFunc != nil; {
|
|
l.nextFunc = l.nextFunc(l)
|
|
}
|
|
}
|
|
|
|
// next returns next character from input, or eof of there is nothing left to scan
|
|
func (l *Lexer) next() rune {
|
|
if l.pos >= len(l.input) {
|
|
l.width = 0
|
|
return eof
|
|
}
|
|
|
|
r, w := utf8.DecodeRuneInString(l.input[l.pos:])
|
|
l.width = w
|
|
l.pos += l.width
|
|
|
|
return r
|
|
}
|
|
|
|
func (l *Lexer) produce(kind TokenKind, val string) {
|
|
l.tokens <- Token{kind, val, l.start, l.line}
|
|
|
|
// scanning a new token
|
|
l.start = l.pos
|
|
|
|
// update line number
|
|
l.line += strings.Count(val, "\n")
|
|
}
|
|
|
|
// emit emits a new scanned token
|
|
func (l *Lexer) emit(kind TokenKind) {
|
|
l.produce(kind, l.input[l.start:l.pos])
|
|
}
|
|
|
|
// emitContent emits scanned content
|
|
func (l *Lexer) emitContent() {
|
|
if l.pos > l.start {
|
|
l.emit(TokenContent)
|
|
}
|
|
}
|
|
|
|
// emitString emits a scanned string
|
|
func (l *Lexer) emitString(delimiter rune) {
|
|
str := l.input[l.start:l.pos]
|
|
|
|
// replace escaped delimiters
|
|
str = strings.Replace(str, "\\"+string(delimiter), string(delimiter), -1)
|
|
|
|
l.produce(TokenString, str)
|
|
}
|
|
|
|
// peek returns but does not consume the next character in the input
|
|
func (l *Lexer) peek() rune {
|
|
r := l.next()
|
|
l.backup()
|
|
return r
|
|
}
|
|
|
|
// backup steps back one character
|
|
//
|
|
// WARNING: Can only be called once per call of next
|
|
func (l *Lexer) backup() {
|
|
l.pos -= l.width
|
|
}
|
|
|
|
// ignoreskips all characters that have been scanned up to current position
|
|
func (l *Lexer) ignore() {
|
|
l.start = l.pos
|
|
}
|
|
|
|
// accept scans the next character if it is included in given string
|
|
func (l *Lexer) accept(valid string) bool {
|
|
if strings.IndexRune(valid, l.next()) >= 0 {
|
|
return true
|
|
}
|
|
|
|
l.backup()
|
|
|
|
return false
|
|
}
|
|
|
|
// acceptRun scans all following characters that are part of given string
|
|
func (l *Lexer) acceptRun(valid string) {
|
|
for strings.IndexRune(valid, l.next()) >= 0 {
|
|
}
|
|
|
|
l.backup()
|
|
}
|
|
|
|
// errorf emits an error token
|
|
func (l *Lexer) errorf(format string, args ...interface{}) lexFunc {
|
|
l.tokens <- Token{TokenError, fmt.Sprintf(format, args...), l.start, l.line}
|
|
return nil
|
|
}
|
|
|
|
// isString returns true if content at current scanning position starts with given string
|
|
func (l *Lexer) isString(str string) bool {
|
|
return strings.HasPrefix(l.input[l.pos:], str)
|
|
}
|
|
|
|
// findRegexp returns the first string from current scanning position that matches given regular expression
|
|
func (l *Lexer) findRegexp(r *regexp.Regexp) string {
|
|
return r.FindString(l.input[l.pos:])
|
|
}
|
|
|
|
// indexRegexp returns the index of the first string from current scanning position that matches given regular expression
|
|
//
|
|
// It returns -1 if not found
|
|
func (l *Lexer) indexRegexp(r *regexp.Regexp) int {
|
|
loc := r.FindStringIndex(l.input[l.pos:])
|
|
if loc == nil {
|
|
return -1
|
|
}
|
|
return loc[0]
|
|
}
|
|
|
|
// lexContent scans content (ie: not between mustaches)
|
|
func lexContent(l *Lexer) lexFunc {
|
|
var next lexFunc
|
|
|
|
if l.rawBlock {
|
|
if i := l.indexRegexp(rOpenEndRawLookAhead); i != -1 {
|
|
// {{{{/
|
|
l.rawBlock = false
|
|
l.pos += i
|
|
|
|
next = lexOpenMustache
|
|
} else {
|
|
return l.errorf("Unclosed raw block")
|
|
}
|
|
} else if l.isString(escapedEscapedOpenMustache) {
|
|
// \\{{
|
|
|
|
// emit content with only one escaped escape
|
|
l.next()
|
|
l.emitContent()
|
|
|
|
// ignore second escaped escape
|
|
l.next()
|
|
l.ignore()
|
|
|
|
next = lexContent
|
|
} else if l.isString(escapedOpenMustache) {
|
|
// \{{
|
|
next = lexEscapedOpenMustache
|
|
} else if str := l.findRegexp(rOpenCommentDash); str != "" {
|
|
// {{!--
|
|
l.closeComment = rCloseCommentDash
|
|
|
|
next = lexComment
|
|
} else if str := l.findRegexp(rOpenComment); str != "" {
|
|
// {{!
|
|
l.closeComment = rCloseComment
|
|
|
|
next = lexComment
|
|
} else if l.isString(openMustache) {
|
|
// {{
|
|
next = lexOpenMustache
|
|
}
|
|
|
|
if next != nil {
|
|
// emit scanned content
|
|
l.emitContent()
|
|
|
|
// scan next token
|
|
return next
|
|
}
|
|
|
|
// scan next rune
|
|
if l.next() == eof {
|
|
// emit scanned content
|
|
l.emitContent()
|
|
|
|
// this is over
|
|
l.emit(TokenEOF)
|
|
return nil
|
|
}
|
|
|
|
// continue content scanning
|
|
return lexContent
|
|
}
|
|
|
|
// lexEscapedOpenMustache scans \{{
|
|
func lexEscapedOpenMustache(l *Lexer) lexFunc {
|
|
// ignore escape character
|
|
l.next()
|
|
l.ignore()
|
|
|
|
// scan mustaches
|
|
for l.peek() == '{' {
|
|
l.next()
|
|
}
|
|
|
|
return lexContent
|
|
}
|
|
|
|
// lexOpenMustache scans {{
|
|
func lexOpenMustache(l *Lexer) lexFunc {
|
|
var str string
|
|
var tok TokenKind
|
|
|
|
nextFunc := lexExpression
|
|
|
|
if str = l.findRegexp(rOpenEndRaw); str != "" {
|
|
tok = TokenOpenEndRawBlock
|
|
} else if str = l.findRegexp(rOpenRaw); str != "" {
|
|
tok = TokenOpenRawBlock
|
|
l.rawBlock = true
|
|
} else if str = l.findRegexp(rOpenUnescaped); str != "" {
|
|
tok = TokenOpenUnescaped
|
|
} else if str = l.findRegexp(rOpenBlock); str != "" {
|
|
tok = TokenOpenBlock
|
|
} else if str = l.findRegexp(rOpenEndBlock); str != "" {
|
|
tok = TokenOpenEndBlock
|
|
} else if str = l.findRegexp(rOpenPartial); str != "" {
|
|
tok = TokenOpenPartial
|
|
} else if str = l.findRegexp(rInverse); str != "" {
|
|
tok = TokenInverse
|
|
nextFunc = lexContent
|
|
} else if str = l.findRegexp(rOpenInverse); str != "" {
|
|
tok = TokenOpenInverse
|
|
} else if str = l.findRegexp(rOpenInverseChain); str != "" {
|
|
tok = TokenOpenInverseChain
|
|
} else if str = l.findRegexp(rOpen); str != "" {
|
|
tok = TokenOpen
|
|
} else {
|
|
// this is rotten
|
|
panic("Current pos MUST be an opening mustache")
|
|
}
|
|
|
|
l.pos += len(str)
|
|
l.emit(tok)
|
|
|
|
return nextFunc
|
|
}
|
|
|
|
// lexCloseMustache scans }} or ~}}
|
|
func lexCloseMustache(l *Lexer) lexFunc {
|
|
var str string
|
|
var tok TokenKind
|
|
|
|
if str = l.findRegexp(rCloseRaw); str != "" {
|
|
// }}}}
|
|
tok = TokenCloseRawBlock
|
|
} else if str = l.findRegexp(rCloseUnescaped); str != "" {
|
|
// }}}
|
|
tok = TokenCloseUnescaped
|
|
} else if str = l.findRegexp(rClose); str != "" {
|
|
// }}
|
|
tok = TokenClose
|
|
} else {
|
|
// this is rotten
|
|
panic("Current pos MUST be a closing mustache")
|
|
}
|
|
|
|
l.pos += len(str)
|
|
l.emit(tok)
|
|
|
|
return lexContent
|
|
}
|
|
|
|
// lexExpression scans inside mustaches
|
|
func lexExpression(l *Lexer) lexFunc {
|
|
// search close mustache delimiter
|
|
if l.isString(closeMustache) || l.isString(closeStripMustache) || l.isString(closeUnescapedStripMustache) {
|
|
return lexCloseMustache
|
|
}
|
|
|
|
// search some patterns before advancing scanning position
|
|
|
|
// "as |"
|
|
if str := l.findRegexp(rOpenBlockParams); str != "" {
|
|
l.pos += len(str)
|
|
l.emit(TokenOpenBlockParams)
|
|
return lexExpression
|
|
}
|
|
|
|
// ..
|
|
if l.isString("..") {
|
|
l.pos += len("..")
|
|
l.emit(TokenID)
|
|
return lexExpression
|
|
}
|
|
|
|
// .
|
|
if str := l.findRegexp(rDotID); str != "" {
|
|
l.pos += len(".")
|
|
l.emit(TokenID)
|
|
return lexExpression
|
|
}
|
|
|
|
// true
|
|
if str := l.findRegexp(rTrue); str != "" {
|
|
l.pos += len("true")
|
|
l.emit(TokenBoolean)
|
|
return lexExpression
|
|
}
|
|
|
|
// false
|
|
if str := l.findRegexp(rFalse); str != "" {
|
|
l.pos += len("false")
|
|
l.emit(TokenBoolean)
|
|
return lexExpression
|
|
}
|
|
|
|
// let's scan next character
|
|
switch r := l.next(); {
|
|
case r == eof:
|
|
return l.errorf("Unclosed expression")
|
|
case isIgnorable(r):
|
|
return lexIgnorable
|
|
case r == '(':
|
|
l.emit(TokenOpenSexpr)
|
|
case r == ')':
|
|
l.emit(TokenCloseSexpr)
|
|
case r == '=':
|
|
l.emit(TokenEquals)
|
|
case r == '@':
|
|
l.emit(TokenData)
|
|
case r == '"' || r == '\'':
|
|
l.backup()
|
|
return lexString
|
|
case r == '/' || r == '.':
|
|
l.emit(TokenSep)
|
|
case r == '|':
|
|
l.emit(TokenCloseBlockParams)
|
|
case r == '+' || r == '-' || (r >= '0' && r <= '9'):
|
|
l.backup()
|
|
return lexNumber
|
|
case r == '[':
|
|
return lexPathLiteral
|
|
case strings.IndexRune(unallowedIDChars, r) < 0:
|
|
l.backup()
|
|
return lexIdentifier
|
|
default:
|
|
return l.errorf("Unexpected character in expression: '%c'", r)
|
|
}
|
|
|
|
return lexExpression
|
|
}
|
|
|
|
// lexComment scans {{!-- or {{!
|
|
func lexComment(l *Lexer) lexFunc {
|
|
if str := l.findRegexp(l.closeComment); str != "" {
|
|
l.pos += len(str)
|
|
l.emit(TokenComment)
|
|
|
|
return lexContent
|
|
}
|
|
|
|
if r := l.next(); r == eof {
|
|
return l.errorf("Unclosed comment")
|
|
}
|
|
|
|
return lexComment
|
|
}
|
|
|
|
// lexIgnorable scans all following ignorable characters
|
|
func lexIgnorable(l *Lexer) lexFunc {
|
|
for isIgnorable(l.peek()) {
|
|
l.next()
|
|
}
|
|
l.ignore()
|
|
|
|
return lexExpression
|
|
}
|
|
|
|
// lexString scans a string
|
|
func lexString(l *Lexer) lexFunc {
|
|
// get string delimiter
|
|
delim := l.next()
|
|
var prev rune
|
|
|
|
// ignore delimiter
|
|
l.ignore()
|
|
|
|
for {
|
|
r := l.next()
|
|
if r == eof || r == '\n' {
|
|
return l.errorf("Unterminated string")
|
|
}
|
|
|
|
if (r == delim) && (prev != '\\') {
|
|
break
|
|
}
|
|
|
|
prev = r
|
|
}
|
|
|
|
// remove end delimiter
|
|
l.backup()
|
|
|
|
// emit string
|
|
l.emitString(delim)
|
|
|
|
// skip end delimiter
|
|
l.next()
|
|
l.ignore()
|
|
|
|
return lexExpression
|
|
}
|
|
|
|
// lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
|
|
// isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
|
|
// and "089" - but when it's wrong the input is invalid and the parser (via
|
|
// strconv) will notice.
|
|
//
|
|
// NOTE: borrowed from https://github.com/golang/go/tree/master/src/text/template/parse/lex.go
|
|
func lexNumber(l *Lexer) lexFunc {
|
|
if !l.scanNumber() {
|
|
return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
|
|
}
|
|
if sign := l.peek(); sign == '+' || sign == '-' {
|
|
// Complex: 1+2i. No spaces, must end in 'i'.
|
|
if !l.scanNumber() || l.input[l.pos-1] != 'i' {
|
|
return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
|
|
}
|
|
l.emit(TokenNumber)
|
|
} else {
|
|
l.emit(TokenNumber)
|
|
}
|
|
return lexExpression
|
|
}
|
|
|
|
// scanNumber scans a number
|
|
//
|
|
// NOTE: borrowed from https://github.com/golang/go/tree/master/src/text/template/parse/lex.go
|
|
func (l *Lexer) scanNumber() bool {
|
|
// Optional leading sign.
|
|
l.accept("+-")
|
|
|
|
// Is it hex?
|
|
digits := "0123456789"
|
|
|
|
if l.accept("0") && l.accept("xX") {
|
|
digits = "0123456789abcdefABCDEF"
|
|
}
|
|
|
|
l.acceptRun(digits)
|
|
|
|
if l.accept(".") {
|
|
l.acceptRun(digits)
|
|
}
|
|
|
|
if l.accept("eE") {
|
|
l.accept("+-")
|
|
l.acceptRun("0123456789")
|
|
}
|
|
|
|
// Is it imaginary?
|
|
l.accept("i")
|
|
|
|
// Next thing mustn't be alphanumeric.
|
|
if isAlphaNumeric(l.peek()) {
|
|
l.next()
|
|
return false
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// lexIdentifier scans an ID
|
|
func lexIdentifier(l *Lexer) lexFunc {
|
|
str := l.findRegexp(rID)
|
|
if len(str) == 0 {
|
|
// this is rotten
|
|
panic("Identifier expected")
|
|
}
|
|
|
|
l.pos += len(str)
|
|
l.emit(TokenID)
|
|
|
|
return lexExpression
|
|
}
|
|
|
|
// lexPathLiteral scans an [ID]
|
|
func lexPathLiteral(l *Lexer) lexFunc {
|
|
for {
|
|
r := l.next()
|
|
if r == eof || r == '\n' {
|
|
return l.errorf("Unterminated path literal")
|
|
}
|
|
|
|
if r == ']' {
|
|
break
|
|
}
|
|
}
|
|
|
|
l.emit(TokenID)
|
|
|
|
return lexExpression
|
|
}
|
|
|
|
// isIgnorable returns true if given character is ignorable, ie. is one of the
// whitespace characters matched by the `\s` class of the regexp package:
// space, tab, line feed, carriage return or form feed.
//
// Carriage return matters for input with Windows line endings: when it is not
// ignorable, a "\r\n" inside an expression gets scanned as an identifier.
func isIgnorable(r rune) bool {
	switch r {
	case ' ', '\t', '\n', '\r', '\f':
		return true
	}

	return false
}
|
|
|
|
// isAlphaNumeric reports whether r is an underscore, a letter or a digit.
//
// NOTE borrowed from https://github.com/golang/go/tree/master/src/text/template/parse/lex.go
func isAlphaNumeric(r rune) bool {
	switch {
	case r == '_':
		return true
	case unicode.IsLetter(r):
		return true
	default:
		return unicode.IsDigit(r)
	}
}
|