A tool to compile SQL to Elasticsearch queries
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

lexer.rs 4.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. use lazy_static::lazy_static;
  2. use regex::Regex;
  3. use crate::error::KappeError;
  4. use crate::token::{Token, TokenType};
/// Pairs a compiled regular expression with the token type it produces.
///
/// Every pattern stored here is anchored with `^` (see `MATCHERS`) so it
/// only matches at the start of the remaining input slice.
#[derive(Debug)]
struct Matcher {
    // Anchored pattern tried against the unconsumed tail of the input.
    regex: Regex,
    // Kind of token emitted when `regex` matches.
    token_type: TokenType,
}
  10. impl Matcher {
  11. pub fn new(regex: &str, token_type: TokenType) -> Self {
  12. Self {
  13. regex: Regex::new(regex).unwrap(),
  14. token_type,
  15. }
  16. }
  17. }
lazy_static! {
    /// Matches one whitespace character; used by `scan` to skip separators.
    static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s").unwrap();
    /// Token patterns, tried in declaration order at each scan position.
    /// Order matters: keywords precede the identifier pattern so that e.g.
    /// "select" lexes as Select rather than as an identifier.
    static ref MATCHERS: Vec<Matcher> = vec![
        // Quoted string; capture group 1 is the contents without the quotes.
        Matcher::new(r#"^"(.*)""#, TokenType::String),
        // Keywords, matched case-insensitively via the (?i) flag.
        Matcher::new(r#"(?i)^SELECT"#, TokenType::Select),
        Matcher::new(r#"(?i)^FROM"#, TokenType::From),
        Matcher::new(r#"(?i)^WHERE"#, TokenType::Where),
        // Identifier: lowercase first letter, then letters/underscore/'-'/'.'/'*'
        // (dots and stars allow field paths like "foo.*").
        // "Identfiier" [sic] is the variant name as declared in token.rs.
        Matcher::new(r#"^[a-z][a-zA-Z_\-.*]*"#, TokenType::Identfiier),
        // Unsigned integer literal.
        Matcher::new(r#"^[0-9]+"#, TokenType::Number),
        // Single-character punctuation.
        Matcher::new(r#"^\*"#, TokenType::Star),
        Matcher::new(r#"^,"#, TokenType::Comma),
        Matcher::new(r#"^="#, TokenType::Equals),
    ];
}
  32. pub fn scan(input: &str) -> Result<Vec<Token>, KappeError> {
  33. let mut tokens: Vec<Token> = vec![];
  34. let mut position = 0;
  35. while position < input.len() {
  36. while WHITESPACE_REGEX.is_match(&input[position..position + 1]) {
  37. position += 1;
  38. }
  39. let mut matched = false;
  40. for matcher in MATCHERS.iter() {
  41. if matcher.regex.is_match(&input[position..]) {
  42. if let Some(m) = matcher.regex.captures_iter(&input[position..]).next() {
  43. let value = if m.len() > 1 { &m[1] } else { &m[0] };
  44. position += value.len();
  45. if matcher.token_type == TokenType::String {
  46. position += 2
  47. };
  48. tokens.push(Token::new(matcher.token_type, value));
  49. }
  50. matched = true;
  51. break;
  52. }
  53. }
  54. if !matched {
  55. return Err(KappeError::new("Unrecognized sequence"));
  56. }
  57. }
  58. Ok(tokens)
  59. }
  60. #[cfg(test)]
  61. mod tests {
  62. use super::*;
  63. #[test]
  64. fn it_scans_a_number() {
  65. assert_eq!(
  66. scan("123").unwrap(),
  67. vec![Token::new(TokenType::Number, "123")]
  68. )
  69. }
  70. #[test]
  71. fn it_scans_a_string() {
  72. assert_eq!(
  73. scan("\"hello world\"").unwrap(),
  74. vec![Token::new(TokenType::String, "hello world")]
  75. )
  76. }
  77. #[test]
  78. fn it_scans_uppercase_keywords() {
  79. assert_eq!(
  80. scan("SELECT FROM WHERE").unwrap(),
  81. vec![
  82. Token::new(TokenType::Select, "SELECT"),
  83. Token::new(TokenType::From, "FROM"),
  84. Token::new(TokenType::Where, "WHERE"),
  85. ]
  86. )
  87. }
  88. #[test]
  89. fn it_scans_lowercase_keywords() {
  90. assert_eq!(
  91. scan("select from where").unwrap(),
  92. vec![
  93. Token::new(TokenType::Select, "select"),
  94. Token::new(TokenType::From, "from"),
  95. Token::new(TokenType::Where, "where"),
  96. ]
  97. )
  98. }
  99. #[test]
  100. fn it_scans_an_identifier() {
  101. assert_eq!(
  102. scan("abc").unwrap(),
  103. vec![Token::new(TokenType::Identfiier, "abc")]
  104. )
  105. }
  106. #[test]
  107. fn it_allows_hyphens_in_identifiers() {
  108. assert_eq!(
  109. scan("abc-def").unwrap(),
  110. vec![Token::new(TokenType::Identfiier, "abc-def")]
  111. )
  112. }
  113. #[test]
  114. fn it_scans_a_star() {
  115. assert_eq!(scan("*").unwrap(), vec![Token::new(TokenType::Star, "*")])
  116. }
  117. #[test]
  118. fn it_scans_a_comma() {
  119. assert_eq!(scan(",").unwrap(), vec![Token::new(TokenType::Comma, ",")])
  120. }
  121. #[test]
  122. fn it_scans_an_equals_sign() {
  123. assert_eq!(scan("=").unwrap(), vec![Token::new(TokenType::Equals, "=")])
  124. }
  125. #[test]
  126. fn it_allows_dots_in_identifiers() {
  127. assert_eq!(
  128. scan("foo.bar").unwrap(),
  129. vec![Token::new(TokenType::Identfiier, "foo.bar")]
  130. )
  131. }
  132. #[test]
  133. fn it_allows_stars_in_identifiers() {
  134. assert_eq!(
  135. scan("foo.*").unwrap(),
  136. vec![Token::new(TokenType::Identfiier, "foo.*")]
  137. )
  138. }
  139. #[test]
  140. fn it_scans_a_whole_expression() {
  141. assert_eq!(
  142. scan("SELECT * FROM index").unwrap(),
  143. vec![
  144. Token::new(TokenType::Select, "SELECT"),
  145. Token::new(TokenType::Star, "*"),
  146. Token::new(TokenType::From, "FROM"),
  147. Token::new(TokenType::Identfiier, "index"),
  148. ]
  149. )
  150. }
  151. }