# Scans +expression+ (a JMESPath expression string) and returns a flat
# Array of Token instances, always terminated by a T_EOF token.
#
# Each character is classified through TRANSLATION_TABLE into a lexer
# state. Characters outside the known alphabet become T_UNKNOWN tokens
# (carrying their position) rather than raising, so the parser can
# produce a position-accurate error later.
#
# @param expression [String] the expression to tokenize
# @return [Array<Token>] tokens, ending with a T_EOF token
def tokenize(expression)
  tokens = []
  chars = CharacterStream.new(expression.chars.to_a)

  while chars.current
    case TRANSLATION_TABLE[chars.current]
    when nil
      # Unrecognized character: emit T_UNKNOWN and keep scanning.
      tokens << Token.new(
        T_UNKNOWN,
        chars.current,
        chars.position
      )
      chars.next
    when STATE_SINGLE_CHAR
      # One-character tokens map directly through SIMPLE_TOKENS.
      tokens << Token.new(
        SIMPLE_TOKENS[chars.current],
        chars.current,
        chars.position
      )
      chars.next
    when STATE_IDENTIFIER
      # Unquoted identifier: the first character is consumed
      # unconditionally, then the run continues while characters remain
      # valid identifier characters.
      start = chars.position
      buffer = []
      loop do
        buffer << chars.current
        chars.next
        break unless VALID_IDENTIFIERS.include?(chars.current)
      end
      tokens << Token.new(
        T_IDENTIFIER,
        buffer.join,
        start
      )
    when STATE_WHITESPACE
      # Whitespace between tokens is insignificant.
      chars.next
    when STATE_LBRACKET
      # '[' may begin '[]' (flatten), '[?' (filter projection), or be a
      # plain left bracket.
      position = chars.position
      actual = chars.next
      if actual == ']'
        chars.next
        tokens << Token.new(T_FLATTEN, '[]', position)
      elsif actual == '?'
        chars.next
        tokens << Token.new(T_FILTER, '[?', position)
      else
        tokens << Token.new(T_LBRACKET, '[', position)
      end
    when STATE_STRING_LITERAL
      # Raw single-quoted literal: only \' is unescaped.
      t = inside(chars, "'", T_LITERAL)
      t.value = t.value.gsub("\\'", "'")
      tokens << t
    when STATE_PIPE
      # '||' is logical OR; a lone '|' is a pipe.
      tokens << match_or(chars, '|', '|', T_OR, T_PIPE)
    when STATE_JSON_LITERAL
      # Backtick-delimited JSON literal: unescape \` then parse the body
      # as JSON.
      token = inside(chars, '`', T_LITERAL)
      if token.type == T_LITERAL
        token.value = token.value.gsub('\\`', '`')
        token = parse_json(token)
      end
      tokens << token
    when STATE_NUMBER
      # Number: consume the whole run of number characters, then convert
      # with to_i (handles a leading '-' if NUMBERS includes it).
      start = chars.position
      buffer = []
      loop do
        buffer << chars.current
        chars.next
        break unless NUMBERS.include?(chars.current)
      end
      tokens << Token.new(
        T_NUMBER,
        buffer.join.to_i,
        start
      )
    when STATE_QUOTED_STRING
      # Double-quoted identifier: re-wrap in quotes and decode via JSON
      # so escape sequences are interpreted.
      token = inside(chars, '"', T_QUOTED_IDENTIFIER)
      if token.type == T_QUOTED_IDENTIFIER
        token.value = "\"#{token.value}\""
        token = parse_json(token, true)
      end
      tokens << token
    when STATE_EQ
      # '==' is a comparator; a lone '=' is invalid (T_UNKNOWN).
      tokens << match_or(chars, '=', '=', T_COMPARATOR, T_UNKNOWN)
    when STATE_AND
      # '&&' is logical AND; a single '&' is an expression reference.
      tokens << match_or(chars, '&', '&', T_AND, T_EXPREF)
    when STATE_NOT
      # '!=' is a comparator; a single '!' is logical NOT.
      tokens << match_or(chars, '!', '=', T_COMPARATOR, T_NOT)
    else
      # Remaining comparator states ('<', '>'): with or without a
      # trailing '=' the result is a comparator token.
      tokens << match_or(chars, chars.current, '=', T_COMPARATOR, T_COMPARATOR)
    end
  end

  tokens << Token.new(T_EOF, nil, chars.position)
  tokens
end