A tool to compile SQL to Elasticsearch queries
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

lexer.rs 4.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. use lazy_static::lazy_static;
  2. use regex::Regex;
  3. use crate::error::KappeError;
  4. use crate::token::{Token, TokenType};
  5. #[derive(Debug)]
  6. struct Matcher {
  7. regex: Regex,
  8. token_type: TokenType,
  9. }
  10. impl Matcher {
  11. pub fn new(regex: &str, token_type: TokenType) -> Self {
  12. Self {
  13. regex: Regex::new(regex).unwrap(),
  14. token_type,
  15. }
  16. }
  17. }
  18. lazy_static! {
  19. static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s").unwrap();
  20. static ref MATCHERS: Vec<Matcher> = vec![
  21. Matcher::new(r#"^"([^"]*)""#, TokenType::String),
  22. Matcher::new(r#"(?i)^SELECT"#, TokenType::Select),
  23. Matcher::new(r#"(?i)^FROM"#, TokenType::From),
  24. Matcher::new(r#"(?i)^WHERE"#, TokenType::Where),
  25. Matcher::new(r#"^[a-z][a-zA-Z_\-.*]*"#, TokenType::Identfiier),
  26. Matcher::new(r#"^[0-9]+"#, TokenType::Number),
  27. Matcher::new(r#"^\*"#, TokenType::Star),
  28. Matcher::new(r#"^,"#, TokenType::Comma),
  29. Matcher::new(r#"^="#, TokenType::Equals),
  30. ];
  31. }
  32. pub fn scan(input: &str) -> Result<Vec<Token>, KappeError> {
  33. let mut tokens: Vec<Token> = vec![];
  34. let mut position = 0;
  35. while position < input.len() {
  36. while WHITESPACE_REGEX.is_match(&input[position..position + 1]) {
  37. position += 1;
  38. }
  39. let mut matched = false;
  40. for matcher in MATCHERS.iter() {
  41. if matcher.regex.is_match(&input[position..]) {
  42. if let Some(m) = matcher.regex.captures_iter(&input[position..]).next() {
  43. let value = if m.len() > 1 { &m[1] } else { &m[0] };
  44. position += value.len();
  45. if matcher.token_type == TokenType::String {
  46. position += 2
  47. };
  48. tokens.push(Token::new(matcher.token_type, value));
  49. }
  50. matched = true;
  51. break;
  52. }
  53. }
  54. if !matched {
  55. return Err(KappeError::new("Unrecognized sequence"));
  56. }
  57. }
  58. Ok(tokens)
  59. }
  #[cfg(test)]
  mod tests {
      use super::*;

      /// Scans `input` and asserts that the result is exactly the tokens
      /// described by the `(type, value)` pairs in `expected`.
      fn assert_scans_to(input: &str, expected: &[(TokenType, &str)]) {
          let want: Vec<_> = expected
              .iter()
              .map(|&(kind, text)| Token::new(kind, text))
              .collect();
          assert_eq!(scan(input).unwrap(), want);
      }

      #[test]
      fn it_scans_a_number() {
          assert_scans_to("123", &[(TokenType::Number, "123")]);
      }

      #[test]
      fn it_scans_a_string() {
          assert_scans_to("\"hello world\"", &[(TokenType::String, "hello world")]);
      }

      #[test]
      fn it_scans_two_strings() {
          assert_scans_to(
              "\"hello\" \"world\"",
              &[(TokenType::String, "hello"), (TokenType::String, "world")],
          );
      }

      #[test]
      fn it_scans_uppercase_keywords() {
          assert_scans_to(
              "SELECT FROM WHERE",
              &[
                  (TokenType::Select, "SELECT"),
                  (TokenType::From, "FROM"),
                  (TokenType::Where, "WHERE"),
              ],
          );
      }

      #[test]
      fn it_scans_lowercase_keywords() {
          assert_scans_to(
              "select from where",
              &[
                  (TokenType::Select, "select"),
                  (TokenType::From, "from"),
                  (TokenType::Where, "where"),
              ],
          );
      }

      #[test]
      fn it_scans_an_identifier() {
          assert_scans_to("abc", &[(TokenType::Identfiier, "abc")]);
      }

      #[test]
      fn it_allows_hyphens_in_identifiers() {
          assert_scans_to("abc-def", &[(TokenType::Identfiier, "abc-def")]);
      }

      #[test]
      fn it_scans_a_star() {
          assert_scans_to("*", &[(TokenType::Star, "*")]);
      }

      #[test]
      fn it_scans_a_comma() {
          assert_scans_to(",", &[(TokenType::Comma, ",")]);
      }

      #[test]
      fn it_scans_an_equals_sign() {
          assert_scans_to("=", &[(TokenType::Equals, "=")]);
      }

      #[test]
      fn it_allows_dots_in_identifiers() {
          assert_scans_to("foo.bar", &[(TokenType::Identfiier, "foo.bar")]);
      }

      #[test]
      fn it_allows_stars_in_identifiers() {
          assert_scans_to("foo.*", &[(TokenType::Identfiier, "foo.*")]);
      }

      #[test]
      fn it_scans_a_whole_expression() {
          assert_scans_to(
              "SELECT * FROM index",
              &[
                  (TokenType::Select, "SELECT"),
                  (TokenType::Star, "*"),
                  (TokenType::From, "FROM"),
                  (TokenType::Identfiier, "index"),
              ],
          );
      }
  }