123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210 |
# Converts a source string into Token objects, one token per call.
#
# The lexer keeps a character offset into the source and advances it as
# tokens are consumed. Whitespace and `//` line comments are skipped between
# tokens. Token kinds come from TokenKinds; both Token and TokenKinds are
# looked up only when a token is built, never when this class is loaded.
class Lexer
  # Reserved words mapped to the name of their TokenKinds constant.
  KEYWORDS = {
    'null' => :NULL,
    'if' => :IF,
    'elseif' => :ELSEIF,
    'else' => :ELSE,
    'let' => :LET,
    'function' => :FUNCTION,
    'class' => :CLASS,
    'public' => :PUBLIC,
    'private' => :PRIVATE,
    'for' => :FOR,
    'in' => :IN
  }.freeze

  # Boolean literals mapped to the value carried by the BOOLEAN token.
  BOOLEANS = { 'true' => true, 'false' => false }.freeze

  # Operators spelled as words; emitted as OPERATOR tokens with a symbol value.
  WORD_OPERATORS = %w[and or not].freeze

  # Punctuation lexemes mapped to the name of their TokenKinds constant.
  PUNCTUATION = {
    '=>' => :ROCKET,
    '{' => :LBRACE,
    '}' => :RBRACE,
    '(' => :LPAREN,
    ')' => :RPAREN,
    '[' => :LBRACKET,
    ']' => :RBRACKET,
    ';' => :SEMICOLON,
    ',' => :COMMA,
    '.' => :DOT,
    '#' => :HASH,
    '=' => :EQUALS
  }.freeze

  # Character following a backslash inside a string literal => its replacement.
  # A backslash followed by anything else is kept literally.
  ESCAPES = { '"' => '"', 'n' => "\n", 't' => "\t", '\\' => '\\' }.freeze

  # All patterns are anchored with \G so they only match exactly at the offset
  # handed to Regexp#match, which avoids copying the rest of the source.
  NUMBER_PATTERN = /\G\d+(?:\.\d+)?/.freeze
  ATOM_PATTERN = /\G:([a-z][a-zA-Z0-9_]*)/.freeze
  # Two-character operators are listed first so `<=` is not split into `<`, `=`.
  OPERATOR_PATTERN = %r{\G(?:==|<=|>=|[-+*/!<>])}.freeze
  # `=>` is listed before the single characters so it wins over a lone `=`.
  PUNCTUATION_PATTERN = /\G(?:=>|[{}()\[\];,.=\#])/.freeze
  WORD_PATTERN = /\G[a-z][a-zA-Z0-9_]*/.freeze
  CLASS_NAME_PATTERN = /\G[A-Z][a-zA-Z0-9_]*/.freeze

  private_constant :KEYWORDS, :BOOLEANS, :WORD_OPERATORS, :PUNCTUATION, :ESCAPES,
                   :NUMBER_PATTERN, :ATOM_PATTERN, :OPERATOR_PATTERN,
                   :PUNCTUATION_PATTERN, :WORD_PATTERN, :CLASS_NAME_PATTERN

  # @param source [String] the text to tokenize
  def initialize(source)
    @source = source
    @position = 0
  end

  # Consumes and returns the next token.
  #
  # Whole words are read before being classified, so an identifier that merely
  # starts with a reserved word (`iffy`, `order`, `input`) stays one IDENTIFIER.
  #
  # @return [Token] an EOF token once the source is exhausted
  # @raise [RuntimeError] on an unterminated string or an unrecognized character
  def get_token
    skip_trivia
    return Token.new(TokenKinds::EOF) if at_end

    if (match = scan(NUMBER_PATTERN))
      # Every numeric literal, integer or decimal, is carried as a Float.
      Token.new(TokenKinds::NUMBER, match[0].to_f)
    elsif @source[@position] == '"'
      read_string
    elsif (match = scan(ATOM_PATTERN))
      Token.new(TokenKinds::ATOM, match[1].to_sym)
    elsif (match = scan(OPERATOR_PATTERN))
      # Must run before punctuation so `==` is not read as two EQUALS tokens.
      Token.new(TokenKinds::OPERATOR, match[0].to_sym)
    elsif (match = scan(PUNCTUATION_PATTERN))
      Token.new(TokenKinds.const_get(PUNCTUATION.fetch(match[0])))
    elsif (match = scan(WORD_PATTERN))
      word_token(match[0])
    elsif (match = scan(CLASS_NAME_PATTERN))
      Token.new(TokenKinds::CLASS_NAME, match[0])
    else
      raise "Unrecognized character #{@source[@position]}"
    end
  end

  # Returns the next token without consuming it.
  #
  # @return [Token]
  def peek
    saved_position = @position
    get_token
  ensure
    # Restore even when get_token raises, so a failed peek consumes nothing.
    @position = saved_position
  end

  # Tokenizes everything from the current position to the end of the source.
  #
  # @return [Array<Token>] the tokens followed by exactly one EOF token
  def scan_all
    tokens = []

    loop do
      # Skip trailing whitespace/comments first so they cannot yield a second EOF.
      skip_trivia
      break if at_end

      tokens << get_token
    end

    tokens << Token.new(TokenKinds::EOF)
  end

  private

  # Skips any mix of whitespace and line comments, repeating until neither
  # skipper makes progress.
  def skip_trivia
    loop do
      start = @position
      skip_whitespace
      skip_comment
      break if @position == start
    end
  end

  def skip_whitespace
    @position += 1 while !at_end && @source[@position].match?(/\s/)
  end

  # Skips one `//` comment through the end of its line (newline included).
  def skip_comment
    return unless @source[@position, 2] == '//'

    @position += 2
    @position += 1 until at_end || @source[@position] == "\n"
    @position += 1 unless at_end
  end

  def at_end
    @position >= @source.size
  end

  # Matches pattern exactly at the current position; on success advances past
  # the match and returns the MatchData, otherwise returns nil.
  def scan(pattern)
    match = pattern.match(@source, @position)
    return nil unless match

    @position += match[0].size
    match
  end

  # Reads a double-quoted string literal starting at the opening quote.
  def read_string
    @position += 1 # opening quote
    buffer = +''

    until at_end || @source[@position] == '"'
      char = @source[@position]
      escaped = @source[@position + 1]

      if char == '\\' && ESCAPES.key?(escaped)
        buffer << ESCAPES[escaped]
        @position += 2
      else
        buffer << char
        @position += 1
      end
    end

    raise 'Unterminated string' if at_end

    @position += 1 # closing quote
    Token.new(TokenKinds::STRING, buffer)
  end

  # Classifies a complete lowercase-initial word as a keyword, boolean
  # literal, word operator, or plain identifier.
  def word_token(word)
    if KEYWORDS.key?(word)
      Token.new(TokenKinds.const_get(KEYWORDS[word]))
    elsif BOOLEANS.key?(word)
      Token.new(TokenKinds::BOOLEAN, BOOLEANS[word])
    elsif WORD_OPERATORS.include?(word)
      Token.new(TokenKinds::OPERATOR, word.to_sym)
    else
      Token.new(TokenKinds::IDENTIFIER, word)
    end
  end
end
|