use lazy_static::lazy_static; use regex::Regex; use crate::error::KappeError; use crate::token::{Token, TokenType}; #[derive(Debug)] struct Matcher { regex: Regex, token_type: TokenType, } impl Matcher { pub fn new(regex: &str, token_type: TokenType) -> Self { Self { regex: Regex::new(regex).unwrap(), token_type, } } } lazy_static! { static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s").unwrap(); static ref MATCHERS: Vec = vec![ Matcher::new(r#"^"(.*)""#, TokenType::String), Matcher::new(r#"^SELECT"#, TokenType::Select), Matcher::new(r#"^FROM"#, TokenType::From), Matcher::new(r#"^[a-z][a-zA-Z_\-.*]*"#, TokenType::Identfiier), Matcher::new(r#"^[0-9]+"#, TokenType::Number), Matcher::new(r#"^\*"#, TokenType::Star), Matcher::new(r#"^,"#, TokenType::Comma), ]; } pub fn scan(input: &str) -> Result, KappeError> { let mut tokens: Vec = vec![]; let mut position = 0; while position < input.len() { while WHITESPACE_REGEX.is_match(&input[position..position + 1]) { position += 1; } let mut matched = false; for matcher in MATCHERS.iter() { if matcher.regex.is_match(&input[position..]) { if let Some(m) = matcher.regex.captures_iter(&input[position..]).next() { let value = if m.len() > 1 { &m[1] } else { &m[0] }; position += value.len(); if matcher.token_type == TokenType::String { position += 2 }; tokens.push(Token::new(matcher.token_type, value)); } matched = true; break; } } if !matched { return Err(KappeError::new("Unrecognized sequence")); } } Ok(tokens) } #[cfg(test)] mod tests { use super::*; #[test] fn it_scans_a_number() { assert_eq!( scan("123").unwrap(), vec![Token::new(TokenType::Number, "123")] ) } #[test] fn it_scans_a_string() { assert_eq!( scan("\"hello world\"").unwrap(), vec![Token::new(TokenType::String, "hello world")] ) } #[test] fn it_scans_a_keyword() { assert_eq!( scan("SELECT").unwrap(), vec![Token::new(TokenType::Select, "SELECT")] ) } #[test] fn it_scans_two_keywords() { assert_eq!( scan("SELECT FROM").unwrap(), vec![ Token::new(TokenType::Select, "SELECT"), Token::new(TokenType::From, "FROM") ] ) } #[test] fn it_scans_an_identifier() { assert_eq!( scan("abc").unwrap(), vec![Token::new(TokenType::Identfiier, "abc")] ) } #[test] fn it_allows_hyphens_in_identifiers() { assert_eq!( scan("abc-def").unwrap(), vec![Token::new(TokenType::Identfiier, "abc-def")] ) } #[test] fn it_scans_a_star() { assert_eq!(scan("*").unwrap(), vec![Token::new(TokenType::Star, "*")]) } #[test] fn it_scans_a_comma() { assert_eq!(scan(",").unwrap(), vec![Token::new(TokenType::Comma, ",")]) } #[test] fn it_allows_dots_in_identifiers() { assert_eq!( scan("foo.bar").unwrap(), vec![Token::new(TokenType::Identfiier, "foo.bar")] ) } #[test] fn it_allows_stars_in_identifiers() { assert_eq!( scan("foo.*").unwrap(), vec![Token::new(TokenType::Identfiier, "foo.*")] ) } #[test] fn it_scans_a_whole_expression() { assert_eq!( scan("SELECT * FROM index").unwrap(), vec![ Token::new(TokenType::Select, "SELECT"), Token::new(TokenType::Star, "*"), Token::new(TokenType::From, "FROM"), Token::new(TokenType::Identfiier, "index"), ] ) } }