package otto

import (
	"bytes"
	"fmt"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf16"
	"unicode/utf8"
)

var keywordTable map[string]bool = boolFields(`
	break case catch continue default delete
	do else finally for function if in
	instanceof new null return switch this
	throw try typeof var while with
	void debugger const
`)

var punctuatorTable map[string]bool

func init() {
	// 4-character and 3-character punctuators
	punctuatorTable = boolFields(`
		>>>= === !== >>> <<= >>=
	`)

	// 2-character
	// <= >= == != ++ -- << >> && ||
	// += -= *= %= &= |= ^= /=
	for _, value := range "<>=!+-*%&|^/" {
		punctuatorTable[string(value)+"="] = true
	}
	for _, value := range "+-<>&|" {
		punctuatorTable[string(value)+string(value)] = true
	}

	// 1-character
	for _, value := range "[]<>+-*%&|^!~?:=/;{},()" {
		punctuatorTable[string(value)] = true
	}
}

type _token struct {
	Line, Column, Character int
	Kind, File, Text        string
	Error                   bool
}

func (self _token) IsValid() bool {
	return self.Kind != ""
}

type _lexer struct {
	Source string
	//Tail int
	//Head int
	//Width int
	lineCount        int
	zeroColumnOffset int

	readIn       []rune
	readInOffset int
	atEndOfFile  bool

	head       int
	tail       int
	headOffset int
	tailOffset int
}

// Only called for testing (for now)
func newLexer(source string) _lexer {
	self := _lexer{
		Source: source,
		readIn: make([]rune, 0, len(source)), // Guesstimate
	}
	return self
}

func (self _lexer) Copy() *_lexer {
	newSelf := self
	return &newSelf
}

func (self *_lexer) scanEndOfLine(chr rune, consume bool) bool {
	if !isLineTerminator(chr) {
		return false
	}
	if consume {
		self.next()
	}
	if chr == '\r' && self.peek() == '\n' {
		self.next() // Consume \n
	}
	self.lineCount += 1
	return true
}

func (self *_lexer) ScanLineComment() {
	for {
		chr := self.next()
		if chr == endOfFile || self.scanEndOfLine(chr, false) {
			return
		}
	}
}

func (self *_lexer) ScanBlockComment() int {
	lineCount := 0
	for {
		chr := self.next()
		switch {
		case chr == '*' && self.peek() == '/':
			self.next() // /
			return lineCount
		case chr == endOfFile:
			panic(&_syntaxError{
				Message: "Unexpected token ILLEGAL",
			})
		case self.scanEndOfLine(chr, false):
			lineCount += 1
		}
	}
	panic(hereBeDragons())
}

func (self *_lexer) ScanSkip() int {
	lineCount := 0
	for {
		chr := self.peek()
		switch {
		case chr == '/':
			read, _, found, width := self.read(2)
			switch read[1] {
			case '/':
				self.tail += found
				self.tailOffset += width
				self.ScanLineComment()
				lineCount += 1
			case '*':
				self.tail += found
				self.tailOffset += width
				lineCount += self.ScanBlockComment()
			default:
				goto RETURN
			}
			self.ignore()
			self.zeroColumnOffset = self.tailOffset
		case isWhiteSpace(chr):
			self.next()
			self.ignore()
		case self.scanEndOfLine(chr, true):
			lineCount += 1
			self.ignore()
			self.zeroColumnOffset = self.tailOffset
		default:
			goto RETURN
		}
	}

RETURN:
	return lineCount
}

func (self *_lexer) ScanLineSkip() bool {
	return self.ScanSkip() > 0
}

func (self *_lexer) ScanRegularExpression() _token {
	self.ScanSkip()
	token := self.scanQuoteLiteral()
	if token.Kind != "//" {
		panic(token.newSyntaxError("Invalid regular expression"))
	}
	return token
}

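// Scan skips any leading whitespace, line terminators, and comments, then
// returns the next token in the source. An informal usage sketch (added for
// illustration, not part of the original file): callers typically loop until
// the "EOF" kind is returned.
//
//	lexer := newLexer("var x = 1 + 2;")
//	for {
//		token := lexer.Scan()
//		if token.Kind == "EOF" {
//			break
//		}
//		fmt.Printf("%s %q\n", token.Kind, token.Text)
//	}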
func (self *_lexer) Scan() (token _token) {

	self.ScanSkip()

	if self.peek() == endOfFile {
		return self.emit("EOF")
	}

	if token = self.scanPunctuator(); token.IsValid() {
		return
	}

	chr := self.peek()

	if chr == '\'' || chr == '"' {
		if token = self.scanQuoteLiteral(); token.IsValid() {
			return
		}
	}

	if chr == '.' || isDecimalDigit(chr) {
		if token = self.scanNumericLiteral(); token.IsValid() {
			return
		}
	}

	if token = self.scanIdentifierKeyword(); token.IsValid() {
		return
	}

	return self.scanIllegal()
}

func (self *_lexer) scanQuoteLiteral() _token {
	value := self.next()
	quote := value
	kind := "string"
	if value == '/' {
		kind = "//"
	}

	errorIllegal := func() _token {
		self.back()
		return self.emit("illegal")
	}

	var text bytes.Buffer
	for {
		value = self.next()
		switch value {
		case endOfFile:
			return errorIllegal()
		case quote:
			return self.emitWith(kind, text.String())
		case '\\':
			value = self.next()
			if isLineTerminator(value) {
				if quote == '/' {
					return errorIllegal()
				}
				self.scanEndOfLine(value, false)
				continue
			}
			if quote == '/' { // RegularExpression
				// TODO Handle the case of [\]?
				text.WriteRune('\\')
				text.WriteRune(value)
				continue
			}
			switch value {
			case 'n':
				text.WriteRune('\n')
			case 'r':
				text.WriteRune('\r')
			case 't':
				text.WriteRune('\t')
			case 'b':
				text.WriteRune('\b')
			case 'f':
				text.WriteRune('\f')
			case 'v':
				text.WriteRune('\v')
			case '0':
				text.WriteRune(0)
			case 'u':
				result := self.scanHexadecimalRune(4)
				if result != utf8.RuneError {
					text.WriteRune(result)
				} else {
					text.WriteRune(value)
				}
			case 'x':
				result := self.scanHexadecimalRune(2)
				if result != utf8.RuneError {
					text.WriteRune(result)
				} else {
					text.WriteRune(value)
				}
			default:
				text.WriteRune(value)
			}
			// TODO Octal escaping
		default:
			if isLineTerminator(value) {
				return errorIllegal()
			}
			text.WriteRune(value)
		}
	}

	return self.emit("illegal")
}

func convertHexadecimalRune(word string) rune {
	value, err := strconv.ParseUint(word, 16, len(word)*4)
	if err != nil {
		// Not a valid hexadecimal sequence
		return utf8.RuneError
	}
	return utf16.Decode([]uint16{uint16(value)})[0]
}

func (self *_lexer) scanHexadecimalRune(size int) rune {
	_, word, found, width := self.read(size)
	chr := convertHexadecimalRune(word)
	if chr == utf8.RuneError {
		return chr
	}
	self.tail += found
	self.tailOffset += width
	return chr
}

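// scanPunctuator accepts the single-character punctuators that never begin a
// longer one first, then reads up to four characters ahead and tests
// progressively shorter prefixes against punctuatorTable. A worked example
// (added for illustration): given ">>>= x", read(4) yields ">>>=", which is
// in the table, so all four characters are consumed as one punctuator; given
// ">>> x", the four-character word ">>> " misses, then ">>>" matches.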
func (self *_lexer) scanPunctuator() (token _token) {
	if self.accept(";{},()") {
		return self.emit("punctuator")
	}
	accept := func(count int) {
		for count > 0 {
			count--
			self.next()
		}
	}
	read, word, _, _ := self.read(4)
	if read[0] == '.' && !isDecimalDigit(read[1]) {
		accept(1)
		return self.emit("punctuator")
	}
	for len(word) > 0 {
		if punctuatorTable[word] {
			accept(len(word))
			return self.emit("punctuator")
		}
		word = word[:len(word)-1]
	}
	return
	// I think this doesn't make any sense
	//return self.emit("punctuator")
}

func (self *_lexer) scanNumericLiteral() _token {
	// FIXME Make sure this is according to the specification
	isHex, isOctal := false, false
	{
		self.accept(".")
		acceptable := "0123456789"
		if self.accept("0") {
			if self.accept("xX") {
				acceptable = "0123456789abcdefABCDEF"
				isHex = true
			} else if self.accept("01234567") {
				acceptable = "01234567"
				isOctal = true
			} else if self.accept("89") {
				return self.emit("illegal")
			}
		}
		self.acceptRun(acceptable)
		if !isHex && !isOctal && self.accept(".") {
			self.acceptRun(acceptable)
		}
		if self.length() == 2 && isHex { // 0x$ or 0X$
			return self.emit("illegal")
		}
	}
	if !isHex && !isOctal && self.accept("eE") {
		self.accept("+-")
		length := self.length()
		self.acceptRun("0123456789")
		if length == self.length() { // e$
			return self.emit("illegal")
		}
	}
	if isAlphaNumeric(self.peek()) {
		self.next() // Bad number
		return self.emit("illegal")
	}
	return self.emit("number")
}

func (self *_lexer) scanIdentifierKeyword() (token _token) {
	word := []rune{}
	// The first character should be of the class isIdentifierStart
	identifierCheck := isIdentifierStart
	for {
		switch chr := self.peek(); {
		case identifierCheck(chr):
			if chr == '\\' {
				read, _, _, _ := self.read(6)
				if read[1] == 'u' {
					chr := convertHexadecimalRune(string(read[2:]))
					if chr == utf8.RuneError {
						word = append(word, 'u')
						self.skip(2) // Skip \u
					} else {
						if chr == '\\' || !identifierCheck(chr) {
							return
						}
						word = append(word, chr)
						self.skip(6) // Skip \u????
					}
				} else {
					return
				}
			} else {
				// Basically a skip of 1
				word = append(word, self.next())
			}
		default:
			if len(word) == 0 {
				// Did not scan at least one identifier character, so return with failure
				return
			}
			word := string(word)
			switch {
			case keywordTable[word] == true:
				return self.emitWith(word, word)
			case word == "true", word == "false":
				return self.emitWith("boolean", word)
			default:
				return self.emitWith("identifier", word)
			}
			return
		}
		// Now we're looking at the body of the identifier
		identifierCheck = isIdentifierPart
	}
	return
}

func (self *_lexer) scanIllegal() _token {
	return self.emit("illegal")
}

func (self *_lexer) emitWith(kind string, text string) _token {
	token := _token{
		Character: 1 + self.tailOffset,
		Line:      1 + self.lineCount,
		Column:    1 + self.tailOffset - self.zeroColumnOffset,
		Kind:      kind,
		Text:      text,
		Error:     false,
	}
	if kind == "punctuator" {
		token.Kind = token.Text
	}
	self.headOffset = self.tailOffset
	self.head = self.tail
	if ottoDebug {
		fmt.Printf("emit: %s %s\n", token.Kind, token.Text)
	}
	if kind == "illegal" {
		token.Error = true
	}
	return token
}

func (self *_lexer) emit(kind string) _token {
	return self.emitWith(kind, self.word())
}

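// read looks ahead count runes from the current tail position without
// advancing it, returning the runes (padded with endOfFile past the end of
// input), the word actually found, how many runes were found, and their total
// width in bytes. For example (illustrative, not in the original source):
// with Source "héllo" and tail at 0, read(2) returns the runes 'h' and 'é',
// the word "hé", found == 2, and width == 3, since 'é' occupies two bytes.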
func (self *_lexer) read(count int) ([]rune, string, int, int) {
	head := self.tail
	tail := head + count
	unread := tail - len(self.readIn)
	for unread > 0 {
		unread--
		self.read1()
	}
	var read []rune
	found := 0
	length := len(self.readIn)
	if tail >= length {
		read = make([]rune, count)
		index, head := 0, head
		for index < count {
			if head >= length {
				read[index] = endOfFile
			} else {
				found++
				read[index] = self.readIn[head]
			}
			index++
			head++
		}
	} else {
		found = count
		read = self.readIn[head:tail]
	}
	width := 0
	word := ""
	if found > 0 {
		width = len(string(read[:found]))
		word = string(read[:found])
	}
	return read, word, found, width
}

func (self *_lexer) next() rune {
	chr, width := self.peek1()
	if width != 0 {
		self.tail += 1
		self.tailOffset += width
	}
	return chr
}

func (self *_lexer) skip(count int) {
	read := self.readIn[self.tail : self.tail+count]
	for _, chr := range read {
		self.tail += 1
		self.tailOffset += utf8.RuneLen(chr)
	}
}

func (self *_lexer) peek1() (chr rune, width int) {
	if self.tail < len(self.readIn) {
		chr = self.readIn[self.tail]
		width = utf8.RuneLen(chr)
	} else {
		chr, width = self.read1()
	}
	return
}

func (self *_lexer) read1() (rune, int) {
	if self.readInOffset >= len(self.Source) {
		self.atEndOfFile = true
		return endOfFile, 0
	}
	chr, width := utf8.DecodeRuneInString(self.Source[self.readInOffset:])
	self.readIn = append(self.readIn, chr)
	self.readInOffset += width
	return chr, width
}

func (self *_lexer) peek() rune {
	chr, _ := self.peek1()
	return chr
}

func (self *_lexer) back() {
	if self.tail > self.head && self.tail > 0 {
		self.tailOffset -= utf8.RuneLen(self.readIn[self.tail-1])
		self.tail -= 1
	}
}

func (self *_lexer) ignore() {
	self.head = self.tail
	self.headOffset = self.tailOffset
}

func (self *_lexer) accept(valid string) bool {
	if strings.IndexRune(valid, self.peek()) >= 0 {
		self.next()
		return true
	}
	return false
}

func (self *_lexer) acceptRun(valid string) bool {
	found := false
	for strings.IndexRune(valid, self.peek()) >= 0 {
		self.next()
		found = true
	}
	return found
}

func (self *_lexer) word() string {
	return self.Source[self.headOffset:self.tailOffset]
}

func (self *_lexer) length() int {
	return self.tailOffset - self.headOffset
}

func isDecimalDigit(chr rune) bool {
	return unicode.IsDigit(chr)
}

func isAlphaNumeric(chr rune) bool {
	return chr == '_' || unicode.IsLetter(chr) || unicode.IsDigit(chr)
}

func isIdentifierStart(chr rune) bool {
	return chr == '$' || chr == '_' || chr == '\\' || unicode.IsLetter(chr)
}

func isIdentifierPart(chr rune) bool {
	return chr == '$' || chr == '_' || chr == '\\' || unicode.IsLetter(chr) || unicode.IsDigit(chr)
}

func isWhiteSpace(chr rune) bool {
	switch chr {
	case ' ', '\t', '\u00a0', '\u000b', '\u000c':
		return true
	}
	return false
}

func isLineTerminator(chr rune) bool {
	switch chr {
	case '\n', '\r', '\u2028', '\u2029':
		return true
	}
	return false
}

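// exampleScanAll is an illustrative helper that is not part of the original
// lexer: it sketches how a caller might drain the lexer, collecting every
// token up to (but not including) the terminating "EOF" token. It relies only
// on newLexer and Scan as defined above.
func exampleScanAll(source string) []_token {
	lexer := newLexer(source)
	var tokens []_token
	for {
		token := lexer.Scan()
		if token.Kind == "EOF" {
			return tokens
		}
		tokens = append(tokens, token)
	}
}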