123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177 |
- use lazy_static::lazy_static;
- use regex::Regex;
-
- use crate::error::KappeError;
- use crate::token::{Token, TokenType};
-
- #[derive(Debug)]
- struct Matcher {
- regex: Regex,
- token_type: TokenType,
- }
-
- impl Matcher {
- pub fn new(regex: &str, token_type: TokenType) -> Self {
- Self {
- regex: Regex::new(regex).unwrap(),
- token_type,
- }
- }
- }
-
- lazy_static! {
- static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s").unwrap();
- static ref MATCHERS: Vec<Matcher> = vec![
- Matcher::new(r#"^"(.*)""#, TokenType::String),
- Matcher::new(r#"(?i)^SELECT"#, TokenType::Select),
- Matcher::new(r#"(?i)^FROM"#, TokenType::From),
- Matcher::new(r#"(?i)^WHERE"#, TokenType::Where),
- Matcher::new(r#"^[a-z][a-zA-Z_\-.*]*"#, TokenType::Identfiier),
- Matcher::new(r#"^[0-9]+"#, TokenType::Number),
- Matcher::new(r#"^\*"#, TokenType::Star),
- Matcher::new(r#"^,"#, TokenType::Comma),
- Matcher::new(r#"^="#, TokenType::Equals),
- ];
- }
-
- pub fn scan(input: &str) -> Result<Vec<Token>, KappeError> {
- let mut tokens: Vec<Token> = vec![];
- let mut position = 0;
-
- while position < input.len() {
- while WHITESPACE_REGEX.is_match(&input[position..position + 1]) {
- position += 1;
- }
-
- let mut matched = false;
-
- for matcher in MATCHERS.iter() {
- if matcher.regex.is_match(&input[position..]) {
- if let Some(m) = matcher.regex.captures_iter(&input[position..]).next() {
- let value = if m.len() > 1 { &m[1] } else { &m[0] };
- position += value.len();
-
- if matcher.token_type == TokenType::String {
- position += 2
- };
-
- tokens.push(Token::new(matcher.token_type, value));
- }
-
- matched = true;
- break;
- }
- }
-
- if !matched {
- return Err(KappeError::new("Unrecognized sequence"));
- }
- }
-
- Ok(tokens)
- }
-
#[cfg(test)]
mod tests {
    use super::*;

    /// Scans `input` and asserts that the result is exactly `expected`.
    fn assert_scans_to(input: &str, expected: Vec<Token>) {
        assert_eq!(scan(input).unwrap(), expected);
    }

    #[test]
    fn it_scans_a_number() {
        assert_scans_to("123", vec![Token::new(TokenType::Number, "123")]);
    }

    #[test]
    fn it_scans_a_string() {
        // The surrounding quotes are not part of the token value.
        assert_scans_to(
            r#""hello world""#,
            vec![Token::new(TokenType::String, "hello world")],
        );
    }

    #[test]
    fn it_scans_uppercase_keywords() {
        let expected = vec![
            Token::new(TokenType::Select, "SELECT"),
            Token::new(TokenType::From, "FROM"),
            Token::new(TokenType::Where, "WHERE"),
        ];

        assert_scans_to("SELECT FROM WHERE", expected);
    }

    #[test]
    fn it_scans_lowercase_keywords() {
        // Keywords are case-insensitive; the token keeps the original spelling.
        let expected = vec![
            Token::new(TokenType::Select, "select"),
            Token::new(TokenType::From, "from"),
            Token::new(TokenType::Where, "where"),
        ];

        assert_scans_to("select from where", expected);
    }

    #[test]
    fn it_scans_an_identifier() {
        assert_scans_to("abc", vec![Token::new(TokenType::Identfiier, "abc")]);
    }

    #[test]
    fn it_allows_hyphens_in_identifiers() {
        assert_scans_to(
            "abc-def",
            vec![Token::new(TokenType::Identfiier, "abc-def")],
        );
    }

    #[test]
    fn it_scans_a_star() {
        assert_scans_to("*", vec![Token::new(TokenType::Star, "*")]);
    }

    #[test]
    fn it_scans_a_comma() {
        assert_scans_to(",", vec![Token::new(TokenType::Comma, ",")]);
    }

    #[test]
    fn it_scans_an_equals_sign() {
        assert_scans_to("=", vec![Token::new(TokenType::Equals, "=")]);
    }

    #[test]
    fn it_allows_dots_in_identifiers() {
        assert_scans_to(
            "foo.bar",
            vec![Token::new(TokenType::Identfiier, "foo.bar")],
        );
    }

    #[test]
    fn it_allows_stars_in_identifiers() {
        assert_scans_to("foo.*", vec![Token::new(TokenType::Identfiier, "foo.*")]);
    }

    #[test]
    fn it_scans_a_whole_expression() {
        let expected = vec![
            Token::new(TokenType::Select, "SELECT"),
            Token::new(TokenType::Star, "*"),
            Token::new(TokenType::From, "FROM"),
            Token::new(TokenType::Identfiier, "index"),
        ];

        assert_scans_to("SELECT * FROM index", expected);
    }
}
|