A tool to compile SQL to Elasticsearch queries
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

lexer.rs 4.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. use lazy_static::lazy_static;
  2. use regex::Regex;
  3. use crate::error::KappeError;
  4. use crate::token::{Token, TokenType};
  5. #[derive(Debug)]
  6. struct Matcher {
  7. regex: Regex,
  8. token_type: TokenType,
  9. }
  10. impl Matcher {
  11. pub fn new(regex: &str, token_type: TokenType) -> Self {
  12. Self {
  13. regex: Regex::new(regex).unwrap(),
  14. token_type,
  15. }
  16. }
  17. }
  18. lazy_static! {
  19. static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s").unwrap();
  20. static ref MATCHERS: Vec<Matcher> = vec![
  21. Matcher::new(r#"^"(.*)""#, TokenType::String),
  22. Matcher::new(r#"^SELECT"#, TokenType::Select),
  23. Matcher::new(r#"^FROM"#, TokenType::From),
  24. Matcher::new(r#"^[a-z][a-zA-Z_\-.*]*"#, TokenType::Identfiier),
  25. Matcher::new(r#"^[0-9]+"#, TokenType::Number),
  26. Matcher::new(r#"^\*"#, TokenType::Star),
  27. Matcher::new(r#"^,"#, TokenType::Comma),
  28. ];
  29. }
  30. pub fn scan(input: &str) -> Result<Vec<Token>, KappeError> {
  31. let mut tokens: Vec<Token> = vec![];
  32. let mut position = 0;
  33. while position < input.len() {
  34. while WHITESPACE_REGEX.is_match(&input[position..position + 1]) {
  35. position += 1;
  36. }
  37. let mut matched = false;
  38. for matcher in MATCHERS.iter() {
  39. if matcher.regex.is_match(&input[position..]) {
  40. if let Some(m) = matcher.regex.captures_iter(&input[position..]).next() {
  41. let value = if m.len() > 1 { &m[1] } else { &m[0] };
  42. position += value.len();
  43. if matcher.token_type == TokenType::String {
  44. position += 2
  45. };
  46. tokens.push(Token::new(matcher.token_type, value));
  47. }
  48. matched = true;
  49. break;
  50. }
  51. }
  52. if !matched {
  53. return Err(KappeError::new("Unrecognized sequence"));
  54. }
  55. }
  56. Ok(tokens)
  57. }
  #[cfg(test)]
  mod tests {
      use super::*;

      /// Runs the lexer on `input`, panicking if it reports an error.
      fn lex(input: &str) -> Vec<Token> {
          scan(input).unwrap()
      }

      /// Shorthand for building an expected token.
      fn tok(token_type: TokenType, value: &str) -> Token {
          Token::new(token_type, value)
      }

      #[test]
      fn it_scans_a_number() {
          let expected = vec![tok(TokenType::Number, "123")];
          assert_eq!(lex("123"), expected);
      }

      #[test]
      fn it_scans_a_string() {
          let expected = vec![tok(TokenType::String, "hello world")];
          assert_eq!(lex("\"hello world\""), expected);
      }

      #[test]
      fn it_scans_a_keyword() {
          let expected = vec![tok(TokenType::Select, "SELECT")];
          assert_eq!(lex("SELECT"), expected);
      }

      #[test]
      fn it_scans_two_keywords() {
          let expected = vec![
              tok(TokenType::Select, "SELECT"),
              tok(TokenType::From, "FROM"),
          ];
          assert_eq!(lex("SELECT FROM"), expected);
      }

      #[test]
      fn it_scans_an_identifier() {
          let expected = vec![tok(TokenType::Identfiier, "abc")];
          assert_eq!(lex("abc"), expected);
      }

      #[test]
      fn it_allows_hyphens_in_identifiers() {
          let expected = vec![tok(TokenType::Identfiier, "abc-def")];
          assert_eq!(lex("abc-def"), expected);
      }

      #[test]
      fn it_scans_a_star() {
          let expected = vec![tok(TokenType::Star, "*")];
          assert_eq!(lex("*"), expected);
      }

      #[test]
      fn it_scans_a_comma() {
          let expected = vec![tok(TokenType::Comma, ",")];
          assert_eq!(lex(","), expected);
      }

      #[test]
      fn it_allows_dots_in_identifiers() {
          let expected = vec![tok(TokenType::Identfiier, "foo.bar")];
          assert_eq!(lex("foo.bar"), expected);
      }

      #[test]
      fn it_allows_stars_in_identifiers() {
          let expected = vec![tok(TokenType::Identfiier, "foo.*")];
          assert_eq!(lex("foo.*"), expected);
      }

      #[test]
      fn it_scans_a_whole_expression() {
          let expected = vec![
              tok(TokenType::Select, "SELECT"),
              tok(TokenType::Star, "*"),
              tok(TokenType::From, "FROM"),
              tok(TokenType::Identfiier, "index"),
          ];
          assert_eq!(lex("SELECT * FROM index"), expected);
      }
  }