diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index ba6f1e4..8f07ee3 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -1,3 +1,6 @@
+use std::iter::Peekable;
+use std::str::Chars;
+
 #[derive(Debug, PartialEq, Eq)]
 pub enum Token {
     Namespace,
@@ -7,39 +10,127 @@ pub enum Token {
     CurlyOpen,
     CurlyClose,
     Interface,
+    Colon,
+    Function,
+    ParenOpen,
+    ParenClose,
+    Enum,
+    LessThan,
+    GreaterThan,
+    Intersection,
+    Union,
+    And,
+    Or,
+    Equals,
+    BigArrow,
+    LittleArrow,
+    Plus,
+    Minus,
+    Dot,
+    Ellipsis,
+    Abstract
 }
 
 pub fn tokenize(input: &String) -> Vec<Token> {
     let mut tokens: Vec<Token> = Vec::new();
-    let mut buffer = String::new();
     let mut peekable = input.chars().peekable();
     while let Some(c) = peekable.next() {
-        // if not whitespace, append to buffer
-        // else, we should match a token
-        if !c.is_whitespace() {
-            buffer.push(c);
-        } else {
-            tokens.push(match_buffer(&mut buffer));
-            buffer.clear();
-        }
-
-        // check if eof
-        if peekable.peek().is_none() {
-            tokens.push(match_buffer(&mut buffer));
+        match c {
+            ' ' | '\n' | '\r' | '\t' => { /* ignore */ }
+            '{' => tokens.push(Token::CurlyOpen),
+            '}' => tokens.push(Token::CurlyClose),
+            ':' => tokens.push(Token::Colon),
+            '(' => tokens.push(Token::ParenOpen),
+            ')' => tokens.push(Token::ParenClose),
+            '<' => tokens.push(Token::LessThan),
+            '>' => tokens.push(Token::GreaterThan),
+            '&' => {
+                match peekable.peek() {
+                    Some('&') => {
+                        let _ = peekable.next();
+                        tokens.push(Token::And);
+                    },
+                    Some(_) | None => tokens.push(Token::Intersection),
+                }
+            }
+            '|' => {
+                match peekable.next_if_eq(&'|') {
+                    Some(_) => tokens.push(Token::Or),
+                    None => tokens.push(Token::Union),
+                }
+            },
+            '=' => {
+                match peekable.next_if_eq(&'>') {
+                    Some(_) => tokens.push(Token::BigArrow),
+                    None => tokens.push(Token::Equals),
+                }
+            },
+            '+' => tokens.push(Token::Plus),
+            '-' => {
+                match peekable.next_if_eq(&'>') {
+                    Some(_) => tokens.push(Token::LittleArrow),
+                    None => tokens.push(Token::Minus),
+                }
+            },
+            '.' => {
+                let mut count = 1;
+                while let Some(_) = peekable.next_if_eq(&'.') {
+                    count += 1;
+                }
+                match count {
+                    1 => tokens.push(Token::Dot),
+                    3 => tokens.push(Token::Ellipsis),
+                    _ => panic!("Too many dots.")
+                }
+            }
+            _ => {
+                if let Some(token) = match_identifier_or_keyword(c, &mut peekable) {
+                    tokens.push(token);
+                }
+            }
         }
     }
 
     tokens
 }
 
-fn match_buffer(buffer: &mut String) -> Token {
+fn match_identifier_or_keyword(start_char: char, peekable: &mut Peekable<Chars>) -> Option<Token> {
+    if !is_valid_identifier_start_char(start_char) {
+        return None
+    }
+
+    // append start char
+    let mut buffer = String::new();
+    buffer.push(start_char);
+
+    // munch while we have valid identifier chars
+    while let Some(c) = peekable.next_if(|next_char| is_valid_identifier_char(*next_char)) {
+        buffer.push(c);
+    }
+
+    // match to a keyword if possible, else identifier
     match buffer.as_str() {
-        "int" => Token::Interface,
-        "mod" => Token::Module,
-        "ns" => Token::Namespace,
-        "pub" => Token::Public,
-        "{" => Token::CurlyOpen,
-        "}" => Token::CurlyClose,
-        identifier => Token::Identifier(identifier.to_string()),
+        "abs" => Some(Token::Abstract),
+        "enum" => Some(Token::Enum),
+        "fn" => Some(Token::Function),
+        "int" => Some(Token::Interface),
+        "mod" => Some(Token::Module),
+        "ns" => Some(Token::Namespace),
+        "pub" => Some(Token::Public),
+        _ => Some(Token::Identifier(buffer)),
+    }
+}
+
+fn is_valid_identifier_start_char(c: char) -> bool {
+    match c {
+        'a'..='z' | 'A'..='Z' | '_' => true,
+        _ => false,
+    }
+}
+
+fn is_valid_identifier_char(c: char) -> bool {
+    match c {
+        'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => true,
+        _ => false,
     }
 }
@@ -100,4 +191,39 @@ mod tests {
         assert_eq!(Token::CurlyOpen, result[5]);
         assert_eq!(Token::CurlyClose, result[6]);
     }
+
+    #[test]
+    fn curly_open_and_close_no_space() {
+        let result = tokenize(&String::from("{}"));
+        assert_eq!(Token::CurlyOpen, result[0]);
+        assert_eq!(Token::CurlyClose, result[1]);
+    }
+
+    #[test]
+    fn interface_function() {
+        let result = tokenize(&String::from("fn test(): Test"));
+        assert_eq!(Token::Function, result[0]);
+        assert_eq!(Token::Identifier(String::from("test")), result[1]);
+        assert_eq!(Token::ParenOpen, result[2]);
+        assert_eq!(Token::ParenClose, result[3]);
+        assert_eq!(Token::Colon, result[4]);
+        assert_eq!(Token::Identifier(String::from("Test")), result[5]);
+    }
+
+    #[test]
+    fn interface_prop() {
+        let result = tokenize(&String::from("test: Test"));
+        assert_eq!(Token::Identifier(String::from("test")), result[0]);
+        assert_eq!(Token::Colon, result[1]);
+        assert_eq!(Token::Identifier(String::from("Test")), result[2]);
+    }
+
+    #[test]
+    fn enum_decl() {
+        let result = tokenize(&String::from("enum Test {}"));
+        assert_eq!(Token::Enum, result[0]);
+        assert_eq!(Token::Identifier(String::from("Test")), result[1]);
+        assert_eq!(Token::CurlyOpen, result[2]);
+        assert_eq!(Token::CurlyClose, result[3]);
+    }
 }