use std::iter::Peekable;
use std::str::Chars;

#[derive(Debug, PartialEq, Eq)]
pub enum Token {
    Namespace,
    Identifier(String),
    Public,
    Module,
    CurlyOpen,
    CurlyClose,
    Interface,
    Colon,
    Function,
    ParenOpen,
    ParenClose,
    Enum,
    LessThan,
    GreaterThan,
    Intersection,
    Union,
    And,
    Or,
    Equals,
    BigArrow,
    LittleArrow,
    Plus,
    Minus,
    Dot,
    Ellipsis,
    Abstract,
}

/// Converts raw source text into a flat list of tokens, or returns an error
/// message describing the first character that could not be tokenized.
pub fn tokenize(input: &String) -> Result<Vec<Token>, String> {
    let mut tokens: Vec<Token> = Vec::new();
    let mut peekable = input.chars().peekable();

    while let Some(c) = peekable.next() {
        match c {
            ' ' | '\n' | '\r' | '\t' => { /* ignore whitespace */ }
            '{' => tokens.push(Token::CurlyOpen),
            '}' => tokens.push(Token::CurlyClose),
            ':' => tokens.push(Token::Colon),
            '(' => tokens.push(Token::ParenOpen),
            ')' => tokens.push(Token::ParenClose),
            '<' => tokens.push(Token::LessThan),
            '>' => tokens.push(Token::GreaterThan),
            // two-character operators: peek ahead to disambiguate
            '&' => match peekable.peek() {
                Some('&') => {
                    let _ = peekable.next();
                    tokens.push(Token::And);
                }
                Some(_) | None => tokens.push(Token::Intersection),
            },
            '|' => match peekable.next_if_eq(&'|') {
                Some(_) => tokens.push(Token::Or),
                None => tokens.push(Token::Union),
            },
            '=' => match peekable.next_if_eq(&'>') {
                Some(_) => tokens.push(Token::BigArrow),
                None => tokens.push(Token::Equals),
            },
            '+' => tokens.push(Token::Plus),
            '-' => match peekable.next_if_eq(&'>') {
                Some(_) => tokens.push(Token::LittleArrow),
                None => tokens.push(Token::Minus),
            },
            '.' => {
                // consume the run of dots: one is a Dot, three is an Ellipsis
                let mut count = 1;
                while let Some(_) = peekable.next_if_eq(&'.') {
                    count += 1;
                }
                match count {
                    1 => tokens.push(Token::Dot),
                    3 => tokens.push(Token::Ellipsis),
                    _ => return Err(String::from("Unexpected number of tokens after '.'")),
                }
            }
            _ => {
                if let Some(token) = match_identifier_or_keyword(c, &mut peekable) {
                    tokens.push(token);
                } else {
                    return Err(format!("Unexpected token: {}", c));
                }
            }
        }
    }

    Ok(tokens)
}

fn match_identifier_or_keyword(
    start_char: char,
    peekable: &mut Peekable<Chars<'_>>,
) -> Option<Token> {
    if !is_valid_identifier_start_char(start_char) {
        return None;
    }

    // append start char
    let mut buffer = String::new();
    buffer.push(start_char);

    // munch while we have valid identifier chars
    while let Some(c) = peekable.next_if(|next_char| is_valid_identifier_char(*next_char)) {
        buffer.push(c);
    }

    // match to a keyword if possible, else identifier
    match buffer.as_str() {
        "abs" => Some(Token::Abstract),
        "enum" => Some(Token::Enum),
        "fn" => Some(Token::Function),
        "int" => Some(Token::Interface),
        "mod" => Some(Token::Module),
        "ns" => Some(Token::Namespace),
        "pub" => Some(Token::Public),
        _ => Some(Token::Identifier(buffer)),
    }
}

fn is_valid_identifier_start_char(c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '_' => true,
        _ => false,
    }
}

fn is_valid_identifier_char(c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => true,
        _ => false,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::Read;
    use std::path::Path;

    #[test]
    fn simple_ns() {
        let result = tokenize(&String::from("ns simple")).unwrap();
        assert_eq!(Token::Namespace, result[0]);
        assert_eq!(Token::Identifier(String::from("simple")), result[1]);
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn simple_ns_file() {
        let mut src_file = File::open(Path::new("test-data/lexer/simple_ns.dm")).unwrap();
        let mut src = String::new();
        let _ = src_file.read_to_string(&mut src);
        let result = tokenize(&src).unwrap();
        assert_eq!(Token::Namespace, result[0]);
        assert_eq!(Token::Identifier(String::from("simple")), result[1]);
    }

    #[test]
    fn pub_mod_simple() {
        let result = tokenize(&String::from("pub mod simple")).unwrap();
        assert_eq!(Token::Public, result[0]);
        assert_eq!(Token::Module, result[1]);
        assert_eq!(Token::Identifier(String::from("simple")), result[2]);
    }

    #[test]
    fn curly_open_and_close() {
        let result = tokenize(&String::from("{ }")).unwrap();
        assert_eq!(Token::CurlyOpen, result[0]);
        assert_eq!(Token::CurlyClose, result[1]);
    }

    #[test]
    fn simple_int() {
        let result = tokenize(&String::from("int simple")).unwrap();
        assert_eq!(Token::Interface, result[0]);
        assert_eq!(Token::Identifier(String::from("simple")), result[1]);
    }

    #[test]
    fn ns_pub_mod_simple() {
        let result = tokenize(&String::from("ns simple_ns\npub mod simple { }")).unwrap();
        assert_eq!(Token::Namespace, result[0]);
        assert_eq!(Token::Identifier(String::from("simple_ns")), result[1]);
        assert_eq!(Token::Public, result[2]);
        assert_eq!(Token::Module, result[3]);
        assert_eq!(Token::Identifier(String::from("simple")), result[4]);
        assert_eq!(Token::CurlyOpen, result[5]);
        assert_eq!(Token::CurlyClose, result[6]);
    }

    #[test]
    fn curly_open_and_close_no_space() {
        let result = tokenize(&String::from("{}")).unwrap();
        assert_eq!(Token::CurlyOpen, result[0]);
        assert_eq!(Token::CurlyClose, result[1]);
    }

    #[test]
    fn interface_function() {
        let result = tokenize(&String::from("fn test(): Test")).unwrap();
        assert_eq!(Token::Function, result[0]);
        assert_eq!(Token::Identifier(String::from("test")), result[1]);
        assert_eq!(Token::ParenOpen, result[2]);
        assert_eq!(Token::ParenClose, result[3]);
        assert_eq!(Token::Colon, result[4]);
        assert_eq!(Token::Identifier(String::from("Test")), result[5]);
    }

    #[test]
    fn interface_prop() {
        let result = tokenize(&String::from("test: Test")).unwrap();
        assert_eq!(Token::Identifier(String::from("test")), result[0]);
        assert_eq!(Token::Colon, result[1]);
        assert_eq!(Token::Identifier(String::from("Test")), result[2]);
    }

    #[test]
    fn enum_decl() {
        let result = tokenize(&String::from("enum Test {}")).unwrap();
        assert_eq!(Token::Enum, result[0]);
        assert_eq!(Token::Identifier(String::from("Test")), result[1]);
        assert_eq!(Token::CurlyOpen, result[2]);
        assert_eq!(Token::CurlyClose, result[3]);
    }

    #[test]
    fn spread_operator() {
        let result = tokenize(&String::from("{ ...props }")).unwrap();
        assert_eq!(Token::CurlyOpen, result[0]);
        assert_eq!(Token::Ellipsis, result[1]);
        assert_eq!(Token::Identifier(String::from("props")), result[2]);
        assert_eq!(Token::CurlyClose, result[3]);
    }
}
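
// A minimal sketch of extra coverage for the peek-ahead disambiguation of the
// two-character operators ('&&', '||', '=>', '->') against their single-character
// counterparts. The module name `operator_tests` and the test input are
// illustrative additions, not part of the original test suite.
#[cfg(test)]
mod operator_tests {
    use super::*;

    #[test]
    fn two_char_operators() {
        let result = tokenize(&String::from("&& || => -> & | = -")).unwrap();
        assert_eq!(Token::And, result[0]);
        assert_eq!(Token::Or, result[1]);
        assert_eq!(Token::BigArrow, result[2]);
        assert_eq!(Token::LittleArrow, result[3]);
        assert_eq!(Token::Intersection, result[4]);
        assert_eq!(Token::Union, result[5]);
        assert_eq!(Token::Equals, result[6]);
        assert_eq!(Token::Minus, result[7]);
    }
}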