otto/lexer.go

package otto

import (
	"fmt"
	"strings"
	"bytes"
	"unicode"
	"unicode/utf8"
	"unicode/utf16"
	"strconv"
)

// keywordTable is the set of words that scanIdentifierKeyword emits as a
// token whose kind is the word itself, rather than as an "identifier".
// The word list is handed to boolFields, which is defined elsewhere in the
// package.
// NOTE(review): presumably boolFields splits its argument on whitespace and
// maps every word to true - confirm against its definition.
var keywordTable map[string]bool = boolFields(`
	break
	case
	catch
	continue
	default
	delete
	do
	else
	finally
	for
	function
	if
	in
	instanceof
	new
	null
	return
	switch
	this
	throw
	try
	typeof
	var
	while
	with
	void
`)

// punctuatorTable holds the punctuator spellings that scanPunctuator looks
// up, from one to four characters long. It is filled in by init.
var punctuatorTable map[string]bool

func init() {
	// Seed the table with the operators that are three or four characters
	// long.
	punctuatorTable = boolFields(`
		>>>= === !== >>> <<= >>=
	`)

	// Two-character operators made by appending "=":
	// <= >= == != += -= *= %= &= |= ^= /=
	for _, chr := range "<>=!+-*%&|^/" {
		operator := string(chr)
		punctuatorTable[operator+"="] = true
	}

	// Two-character operators made by doubling a character:
	// ++ -- << >> && ||
	for _, chr := range "+-<>&|" {
		operator := string(chr)
		punctuatorTable[operator+operator] = true
	}

	// Every single-character punctuator.
	for _, chr := range "[]<>+-*%&|^!~?:=/;{},()" {
		punctuatorTable[string(chr)] = true
	}
}

// _token is a single lexical token as produced by EmitWith.
type _token struct {
	// Line is the 1-based line number of the token.
	Line int
	// Column is the 1-based byte offset of the token from the lexer's
	// LineHead.
	Column int
	// Character is the 1-based byte offset of the token in the source.
	Character int

	// Kind is the token kind. Punctuators use their own text as the kind;
	// the empty string marks an invalid token (see IsValid).
	Kind string
	// File is not assigned by any of the lexer code in this file.
	File string
	// Text is the text carried by the token.
	Text string

	// Error is true for tokens emitted with the kind "illegal".
	Error bool
}

// IsValid reports whether the token has a kind. A token with an empty Kind
// stands for "nothing was scanned".
func (self _token) IsValid() bool {
	return len(self.Kind) > 0
}

// _lexer holds the scanning state over a source string. All offsets are
// byte offsets into Source.
type _lexer struct {
	// Source is the text being scanned.
	Source string
	// Tail is the offset of the next rune to be read.
	Tail int
	// Head is the offset at which the pending token starts.
	Head int
	// Width is the byte width of the rune most recently returned by Next;
	// it is 0 when Next hit the end of input.
	Width int

	// Line counts the line terminators scanned so far.
	Line int
	// LineHead is the offset that EmitWith measures token columns from.
	LineHead int
}

// Copy returns a pointer to an independent snapshot of the lexer state.
// The receiver is taken by value, so later changes to the snapshot do not
// touch the lexer it was taken from.
func (self _lexer) Copy() *_lexer {
	snapshot := new(_lexer)
	*snapshot = self
	return snapshot
}

// scanEndOfLine handles a line terminator and advances the line counter.
//
// chr is the rune under consideration. When consume is true, chr has only
// been peeked and is consumed here; when it is false the caller has already
// consumed chr with Next. A "\r\n" pair is treated as one line terminator.
//
// It returns false, without touching the lexer, when chr is not a line
// terminator, and true otherwise.
func (self *_lexer) scanEndOfLine(chr rune, consume bool) bool {
	if !isLineTerminator(chr) {
		return false
	}
	if consume {
		self.Next()
	}
	// Peek instead of Next followed by Back: at the end of input Next
	// reports a width of zero and Back panics on a zero width, so a source
	// ending in a bare "\r" would crash the lexer.
	if chr == '\r' && self.Peek() == '\n' {
		self.Next()
	}
	self.Line++
	return true
}

// ScanLineComment consumes the rest of the current line. It stops once
// scanEndOfLine has accepted a line terminator, or when the end of input is
// reached.
func (self *_lexer) ScanLineComment() {
	for {
		switch chr := self.Next(); {
		case chr == endOfFile:
			return
		case self.scanEndOfLine(chr, false):
			return
		}
	}
}

// ScanBlockComment consumes input up to and including the next "*/" and
// returns the number of line terminators it passed on the way.
//
// It panics with a *_syntaxError carrying "Unexpected token ILLEGAL" when
// the input ends before "*/" is found.
func (self *_lexer) ScanBlockComment() int {
	lines := 0
	for {
		chr := self.Next()
		if chr == '*' && self.Peek() == '/' {
			self.Next() // the closing '/'
			return lines
		}
		if chr == endOfFile {
			panic(&_syntaxError{
				Message: "Unexpected token ILLEGAL",
			})
		}
		if self.scanEndOfLine(chr, false) {
			lines++
		}
	}
}

// ScanSkip discards everything that may precede a token: spaces and tabs
// (whatever isWhiteSpace accepts), line terminators, "//" line comments and
// "/* */" block comments. The skipped text is dropped from the pending token
// by moving Head up to Tail.
//
// It returns the number of lines skipped: one per line terminator, including
// those inside block comments, and one per line comment (also when that
// comment is ended by the end of input rather than by a line terminator).
//
// An unterminated block comment makes ScanBlockComment panic.
func (self *_lexer) ScanSkip() int {
	lineCount := 0
	for {
		chr := self.Peek()
		switch {
		case chr == '/':
			// One rune of lookahead past the '/' tells a comment apart from
			// an operator or a regular expression.
			read, _, _ := self.Read(2)
			switch read[1] {
			case '/':
				self.ScanLineComment()
				lineCount++
				self.LineHead = self.Tail
			case '*':
				start := self.Tail
				// Step over the opening "/*" first, so that its '*' cannot
				// pair with a directly following '/' and end the comment
				// early, which made "/*/" count as a complete comment.
				self.Next()
				self.Next()
				lineCount += self.ScanBlockComment()
				// Columns are measured from LineHead, so it only moves when
				// the comment contained a line terminator, and then to just
				// after the last one. A comment within one line must not
				// reset the column origin.
				for offset, inside := range self.Source[start:self.Tail] {
					if isLineTerminator(inside) {
						self.LineHead = start + offset + utf8.RuneLen(inside)
					}
				}
			default:
				return lineCount
			}
			self.Ignore()
		case isWhiteSpace(chr):
			self.Next()
			self.Ignore()
		case self.scanEndOfLine(chr, true):
			lineCount++
			self.Ignore()
			self.LineHead = self.Tail
		default:
			return lineCount
		}
	}
}

// ScanLineSkip runs ScanSkip and reports whether it skipped at least one
// line.
func (self *_lexer) ScanLineSkip() bool {
	skipped := self.ScanSkip()
	return skipped >= 1
}

// ScanRegularExpression skips leading whitespace and comments and then scans
// a regular expression literal with scanQuoteLiteral.
//
// It returns the token when its kind is "//" and otherwise panics with the
// value of token.newSyntaxError("Invalid regular expression").
func (self *_lexer) ScanRegularExpression() _token {
	self.ScanSkip()

	literal := self.scanQuoteLiteral()
	if literal.Kind == "//" {
		return literal
	}
	panic(literal.newSyntaxError("Invalid regular expression"))
}

// Scan skips whitespace and comments and returns the next token.
//
// The scanners are tried in a fixed order: end of input ("EOF"), punctuator,
// quoted string (when the next rune is ' or "), numeric literal (when the
// next rune is '.' or a decimal digit), identifier or keyword. If none of
// them yields a valid token, the result of scanIllegal is returned.
func (self *_lexer) Scan() (token _token) {
	self.ScanSkip()

	if self.Peek() == endOfFile {
		token = self.Emit("EOF")
		return token
	}

	token = self.scanPunctuator()
	if token.IsValid() {
		return token
	}

	// A quote and a digit or dot are mutually exclusive, so at most one of
	// the literal scanners runs.
	switch chr := self.Peek(); {
	case chr == '\'' || chr == '"':
		token = self.scanQuoteLiteral()
	case chr == '.' || isDecimalDigit(chr):
		token = self.scanNumericLiteral()
	}
	if token.IsValid() {
		return token
	}

	token = self.scanIdentifierKeyword()
	if token.IsValid() {
		return token
	}

	return self.scanIllegal()
}

// scanQuoteLiteral scans a literal delimited by the rune at the current
// position: a string when that rune is ' or ", a regular expression when it
// is '/'.
//
// On success the token has the kind "string" or "//" (regular expression).
// For a string the text is the decoded value, with escape sequences
// replaced; for a regular expression every escape is kept verbatim,
// backslash included.
//
// An "illegal" token is returned when the literal is cut short by the end of
// input or by an unescaped line terminator, or when a regular expression
// contains a backslash followed by a line terminator. Inside a string a
// backslash followed by a line terminator is a line continuation and adds
// nothing to the text.
func (self *_lexer) scanQuoteLiteral() _token {
	quote := self.Next()
	kind := "string"
	if quote == '/' {
		kind = "//"
	}

	// illegal puts back the rune that ended the scan (the end of input has
	// no width and cannot be put back) and emits what was consumed so far as
	// an "illegal" token.
	illegal := func() _token {
		if self.Width != 0 {
			self.Back()
		}
		return self.Emit("illegal")
	}

	var text bytes.Buffer

	for {
		value := self.Next()
		switch value {
		case endOfFile:
			return illegal()
		case quote:
			return self.EmitWith(kind, text.String())
		case '\\':
			value = self.Next()
			if isLineTerminator(value) {
				if quote == '/' {
					return illegal()
				}
				self.scanEndOfLine(value, false)
				continue
			}
			if quote == '/' { // RegularExpression
				// TODO Handle the case of [\]?
				text.WriteRune('\\')
				text.WriteRune(value)
				continue
			}
			// Each single-character escape maps to its own control
			// character; previously \r, \b, \f and \v all produced a tab.
			switch value {
			case 'n':
				text.WriteRune('\n')
			case 'r':
				text.WriteRune('\r')
			case 't':
				text.WriteRune('\t')
			case 'b':
				text.WriteRune('\b')
			case 'f':
				text.WriteRune('\f')
			case 'v':
				text.WriteRune('\v')
			case 'u', 'x':
				// \uXXXX takes four hexadecimal digits, \xXX takes two.
				size := 4
				if value == 'x' {
					size = 2
				}
				if decoded := self.scanHexadecimalRune(size); decoded != utf8.RuneError {
					text.WriteRune(decoded)
				} else {
					// Not a valid escape: keep the letter itself.
					text.WriteRune(value)
				}
			default:
				text.WriteRune(value)
			}
			// TODO Octal escaping
		default:
			if isLineTerminator(value) {
				return illegal()
			}
			text.WriteRune(value)
		}
	}
}

// scanHexadecimalRune decodes the hexadecimal escape payload found at the
// current position: up to size runes, parsed as a value of size*4 bits.
//
// When the text parses, it is consumed and the value is returned as a rune,
// decoded as a single UTF-16 code unit. When it does not parse, nothing is
// consumed and utf8.RuneError is returned.
// NOTE(review): utf16.Decode yields U+FFFD for a lone surrogate, which is the
// same value as utf8.RuneError, so such an escape (and \uFFFD itself) looks
// like a failure to the caller although its digits were consumed.
func (self *_lexer) scanHexadecimalRune(size int) rune {
	_, digits, width := self.Read(size)

	bitSize := 4 * size
	code, err := strconv.ParseUint(digits, 16, bitSize)
	if err == nil {
		self.Tail += width
		units := []uint16{uint16(code)}
		return utf16.Decode(units)[0]
	}

	// Not a valid hexadecimal sequence
	return utf8.RuneError
}

// scanPunctuator scans a punctuator at the current position, preferring the
// longest spelling found in punctuatorTable (at most four runes).
//
// A '.' is a punctuator only when it is not followed by a decimal digit.
// When nothing matches, nothing is consumed and the returned token is
// invalid.
func (self *_lexer) scanPunctuator() (token _token) {
	if self.Accept(";{},()") {
		return self.Emit("punctuator")
	}

	lookahead, candidate, _ := self.Read(4)

	if lookahead[0] == '.' && !isDecimalDigit(lookahead[1]) {
		self.Next()
		return self.Emit("punctuator")
	}

	// Try the longest candidate first and drop one byte from its end after
	// every miss.
	for ; len(candidate) > 0; candidate = candidate[:len(candidate)-1] {
		if !punctuatorTable[candidate] {
			continue
		}
		for remaining := len(candidate); remaining > 0; remaining-- {
			self.Next()
		}
		return self.Emit("punctuator")
	}

	// Nothing was consumed here. EmitWith copies a punctuator's text into
	// its kind, so with no pending text the result has an empty kind, which
	// IsValid reports as invalid.
	return self.Emit("punctuator")
}

// scanNumericLiteral scans a numeric literal at the current position and
// emits it as a "number" token, or as an "illegal" token when it is
// malformed.
//
// Accepted forms are decimal numbers with an optional fraction and exponent,
// hexadecimal numbers introduced by 0x or 0X, and octal numbers introduced
// by a 0 that is followed by an octal digit. A literal is illegal when a
// leading 0 is followed by 8 or 9, when 0x has no digits, when an exponent
// has no digits, or when a letter, digit or underscore follows directly.
func (self *_lexer) scanNumericLiteral() _token {
	// FIXME Make sure this is according to the specification

	const decimalDigits = "0123456789"

	digits := decimalDigits
	hex, octal := false, false

	self.Accept(".")
	if self.Accept("0") {
		switch {
		case self.Accept("xX"):
			digits, hex = "0123456789abcdefABCDEF", true
		case self.Accept("01234567"):
			digits, octal = "01234567", true
		case self.Accept("89"):
			return self.Emit("illegal")
		}
	}
	// Fractions and exponents exist only for plain decimal literals.
	decimal := !hex && !octal

	self.AcceptRun(digits)
	if decimal && self.Accept(".") {
		self.AcceptRun(digits)
	}

	if hex && self.Length() == 2 { // 0x$ or 0X$
		return self.Emit("illegal")
	}

	if decimal && self.Accept("eE") {
		self.Accept("+-")
		before := self.Length()
		self.AcceptRun(decimalDigits)
		if self.Length() == before { // <number>e$
			return self.Emit("illegal")
		}
	}

	if isAlphaNumeric(self.Peek()) {
		// Bad number: take the offending rune into the illegal token.
		self.Next()
		return self.Emit("illegal")
	}

	return self.Emit("number")
}

// scanIdentifierKeyword scans an identifier-shaped word at the current
// position.
//
// A word listed in keywordTable is emitted with the word itself as its kind,
// "true" and "false" are emitted as "boolean", and anything else as
// "identifier". When the next rune cannot start an identifier, nothing is
// consumed and the zero (invalid) token is returned.
func (self *_lexer) scanIdentifierKeyword() (token _token) {
	if !isIdentifierStart(self.Peek()) {
		return token
	}

	for isAlphaNumericDollar(self.Peek()) {
		self.Next()
	}

	word := self.Word()
	if keywordTable[word] {
		return self.Emit(word)
	}
	if word == "true" || word == "false" {
		return self.Emit("boolean")
	}
	return self.Emit("identifier")
}

// scanIllegal emits the pending text as an "illegal" token. It does not
// consume any input itself.
func (self *_lexer) scanIllegal() _token {
	illegal := self.Emit("illegal")
	return illegal
}

// EmitWith builds a token of the given kind and text, positioned at the
// start of the pending input (Head), and then marks the pending input as
// used by moving Head up to Tail.
//
// Two kinds are special: a "punctuator" token uses its text as its kind, and
// an "illegal" token has its Error flag set. The positions are 1-based;
// Column is measured in bytes from LineHead.
func (self *_lexer) EmitWith(kind string, text string) _token {
	start := self.Head
	token := _token{
		Line:      1 + self.Line,
		Column:    1 + start - self.LineHead,
		Character: 1 + start,

		Kind: kind,
		Text: text,
	}

	switch kind {
	case "punctuator":
		token.Kind = token.Text
	case "illegal":
		token.Error = true
	}

	self.Head = self.Tail
	if ottoDebug {
		fmt.Printf("emit: %s %s\n", token.Kind, token.Text)
	}
	return token
}

// Emit emits a token of the given kind whose text is the pending input, that
// is the source between Head and Tail.
func (self *_lexer) Emit(kind string) _token {
	text := self.Word()
	return self.EmitWith(kind, text)
}

// Read looks ahead at up to count runes from the current position without
// consuming anything.
//
// It returns three values:
//   - a slice of exactly count runes, padded with endOfFile where the input
//     ran out;
//   - the runes that were actually read, as a string (empty at the end of
//     input);
//   - the number of bytes those runes occupy in Source.
func (self *_lexer) Read(count int) ([]rune, string, int) {
	read := make([]rune, count)
	tail := self.Tail
	// found is the number of runes decoded. Counting, rather than keeping
	// the last index, keeps the word empty at the end of input and makes
	// Read(0) safe.
	found := 0
	for i := range read {
		if tail >= len(self.Source) {
			read[i] = endOfFile
			continue
		}
		chr, width := utf8.DecodeRuneInString(self.Source[tail:])
		read[i] = chr
		tail += width
		found = i + 1
	}
	return read, string(read[:found]), tail - self.Tail
}

// Next consumes and returns the rune at the current position. It records the
// rune's byte width in Width, which is what Back undoes. At the end of input
// it returns endOfFile and records a width of 0.
func (self *_lexer) Next() (chr rune) {
	chr, width := self._Peek()
	self.Width = width
	self.Tail += width
	return chr
}

// _Peek decodes the rune at the current position without consuming it and
// returns it together with its width in bytes. At the end of input it
// returns endOfFile and a width of 0.
func (self *_lexer) _Peek() (rune, int) {
	if self.Tail < len(self.Source) {
		return utf8.DecodeRuneInString(self.Source[self.Tail:])
	}
	return endOfFile, 0
}

// Peek returns the rune at the current position without consuming it, or
// endOfFile at the end of input.
func (self *_lexer) Peek() rune {
	upcoming, _ := self._Peek()
	return upcoming
}

// Back un-reads the rune most recently returned by Next by moving Tail back
// by Width bytes. It panics when Width is 0, which is the case after Next
// has hit the end of input.
func (self *_lexer) Back() {
	width := self.Width
	if width == 0 {
		panic(hereBeDragons("Can't backup when self.Width == 0"))
	}
	self.Tail -= width
}

// Ignore drops the pending input: the next token starts at the current
// position.
func (self *_lexer) Ignore() {
	position := self.Tail
	self.Head = position
}

// Accept consumes the next rune if it is one of the runes in valid and
// reports whether it did so.
func (self *_lexer) Accept(valid string) bool {
	if !strings.ContainsRune(valid, self.Peek()) {
		return false
	}
	self.Next()
	return true
}

// AcceptRun consumes runes for as long as they are among the runes in valid
// and reports whether it consumed at least one.
func (self *_lexer) AcceptRun(valid string) bool {
	matched := false
	for {
		if !strings.ContainsRune(valid, self.Peek()) {
			return matched
		}
		self.Next()
		matched = true
	}
}

// Word returns the pending input: the source text between Head and Tail.
func (self *_lexer) Word() string {
	start, end := self.Head, self.Tail
	return self.Source[start:end]
}

// Length returns the size of the pending input in bytes.
func (self *_lexer) Length() int {
	pending := self.Tail - self.Head
	return pending
}

// isDecimalDigit reports whether chr is one of the ASCII digits 0 to 9.
//
// Only these ten characters are decimal digits in ECMAScript source text;
// digits of other scripts (Unicode category Nd) do not start or continue a
// numeric literal.
func isDecimalDigit(chr rune) bool {
	return '0' <= chr && chr <= '9'
}

// isAlphaNumeric reports whether chr is an underscore, a Unicode letter or a
// Unicode digit.
func isAlphaNumeric(chr rune) bool {
	switch {
	case chr == '_', unicode.IsLetter(chr), unicode.IsDigit(chr):
		return true
	}
	return false
}

// isAlphaNumericDollar reports whether chr is a dollar sign, an underscore,
// a Unicode letter or a Unicode digit.
func isAlphaNumericDollar(chr rune) bool {
	switch chr {
	case '$', '_':
		return true
	}
	return unicode.IsLetter(chr) || unicode.IsDigit(chr)
}

// isIdentifierStart reports whether chr is a dollar sign, an underscore or a
// Unicode letter.
func isIdentifierStart(chr rune) bool {
	if chr == '$' || chr == '_' {
		return true
	}
	return unicode.IsLetter(chr)
}

// isWhiteSpace reports whether chr is ECMAScript white space: TAB, VT, FF,
// SPACE, NO-BREAK SPACE, the byte order mark, or any other character of the
// Unicode "space separator" category (Zs).
//
// Line terminators are not white space; isLineTerminator covers those.
func isWhiteSpace(chr rune) bool {
	switch chr {
	case '\t', '\v', '\f', ' ', '\u00a0', '\ufeff':
		return true
	}
	return unicode.Is(unicode.Zs, chr)
}

// isLineTerminator reports whether chr is a line feed, a carriage return, a
// LINE SEPARATOR (U+2028) or a PARAGRAPH SEPARATOR (U+2029).
func isLineTerminator(chr rune) bool {
	return chr == '\n' || chr == '\r' || chr == '\u2028' || chr == '\u2029'
}