Add more keywords and symbols to the lexer.

This commit is contained in:
Jesse Brault 2024-11-24 20:50:27 -06:00
parent b3177a612f
commit 88119e3001

View File

@ -1,3 +1,6 @@
use std::iter::Peekable;
use std::str::Chars;
#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
pub enum Token { pub enum Token {
Namespace, Namespace,
@ -7,39 +10,127 @@ pub enum Token {
CurlyOpen, CurlyOpen,
CurlyClose, CurlyClose,
Interface, Interface,
Colon,
Function,
ParenOpen,
ParenClose,
Enum,
LessThan,
GreaterThan,
Intersection,
Union,
And,
Or,
Equals,
BigArrow,
LittleArrow,
Plus,
Minus,
Dot,
Ellipsis,
Abstract
} }
pub fn tokenize(input: &String) -> Vec<Token> { pub fn tokenize(input: &String) -> Vec<Token> {
let mut tokens: Vec<Token> = Vec::new(); let mut tokens: Vec<Token> = Vec::new();
let mut buffer = String::new();
let mut peekable = input.chars().peekable(); let mut peekable = input.chars().peekable();
while let Some(c) = peekable.next() { while let Some(c) = peekable.next() {
// if not whitespace, append to buffer match c {
// else, we should match a token ' ' | '\n' | '\r' | '\t' => { /* ignore */ }
if !c.is_whitespace() { '{' => tokens.push(Token::CurlyOpen),
buffer.push(c); '}' => tokens.push(Token::CurlyClose),
} else { ':' => tokens.push(Token::Colon),
tokens.push(match_buffer(&mut buffer)); '(' => tokens.push(Token::ParenOpen),
buffer.clear(); ')' => tokens.push(Token::ParenClose),
'<' => tokens.push(Token::LessThan),
'>' => tokens.push(Token::GreaterThan),
'&' => {
match peekable.peek() {
Some('&') => {
let _ = peekable.next();
tokens.push(Token::And);
},
Some(_) | None => tokens.push(Token::Intersection),
}
}
'|' => {
match peekable.next_if_eq(&'|') {
Some(_) => tokens.push(Token::Or),
None => tokens.push(Token::Union),
}
},
'=' => {
match peekable.next_if_eq(&'>') {
Some(_) => tokens.push(Token::BigArrow),
None => tokens.push(Token::Equals),
}
},
'+' => tokens.push(Token::Plus),
'-' => {
match peekable.next_if_eq(&'>') {
Some(_) => tokens.push(Token::LittleArrow),
None => tokens.push(Token::Minus),
}
},
'.' => {
let mut count = 1;
while let Some(_) = peekable.next_if_eq(&'.') {
count += 1;
}
match count {
1 => tokens.push(Token::Dot),
3 => tokens.push(Token::Ellipsis),
_ => panic!("Too many dots.")
}
}
_ => {
if let Some(token) = match_identifier_or_keyword(c, &mut peekable) {
tokens.push(token);
}
} }
// check if eof
if peekable.peek().is_none() {
tokens.push(match_buffer(&mut buffer));
} }
} }
tokens tokens
} }
fn match_buffer(buffer: &mut String) -> Token { fn match_identifier_or_keyword(start_char: char, peekable: &mut Peekable<Chars>) -> Option<Token> {
if !is_valid_identifier_start_char(start_char) {
return None
}
// append start char
let mut buffer = String::new();
buffer.push(start_char);
// munch while we have valid identifier chars
while let Some(c) = peekable.next_if(|next_char| is_valid_identifier_char(*next_char)) {
buffer.push(c);
}
// match to a keyword if possible, else identifier
match buffer.as_str() { match buffer.as_str() {
"int" => Token::Interface, "abs" => Some(Token::Abstract),
"mod" => Token::Module, "enum" => Some(Token::Enum),
"ns" => Token::Namespace, "fn" => Some(Token::Function),
"pub" => Token::Public, "int" => Some(Token::Interface),
"{" => Token::CurlyOpen, "mod" => Some(Token::Module),
"}" => Token::CurlyClose, "ns" => Some(Token::Namespace),
identifier => Token::Identifier(identifier.to_string()), "pub" => Some(Token::Public),
_ => Some(Token::Identifier(buffer)),
}
}
/// True when `c` may begin an identifier: an ASCII letter or underscore.
fn is_valid_identifier_start_char(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}
/// True when `c` may continue an identifier: an ASCII letter, digit,
/// or underscore (digits are allowed only after the first character).
fn is_valid_identifier_char(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
}
@ -100,4 +191,39 @@ mod tests {
assert_eq!(Token::CurlyOpen, result[5]); assert_eq!(Token::CurlyOpen, result[5]);
assert_eq!(Token::CurlyClose, result[6]); assert_eq!(Token::CurlyClose, result[6]);
} }
#[test]
fn curly_open_and_close_no_space() {
    // Adjacent symbols must tokenize without separating whitespace.
    let tokens = tokenize(&String::from("{}"));
    let expected = [Token::CurlyOpen, Token::CurlyClose];
    for (i, want) in expected.iter().enumerate() {
        assert_eq!(*want, tokens[i]);
    }
}
#[test]
fn interface_function() {
    // A function signature: keyword, name, parens, colon, return type.
    let tokens = tokenize(&String::from("fn test(): Test"));
    let expected = [
        Token::Function,
        Token::Identifier(String::from("test")),
        Token::ParenOpen,
        Token::ParenClose,
        Token::Colon,
        Token::Identifier(String::from("Test")),
    ];
    for (i, want) in expected.iter().enumerate() {
        assert_eq!(*want, tokens[i]);
    }
}
#[test]
fn interface_prop() {
    // A property declaration: name, colon, type.
    let tokens = tokenize(&String::from("test: Test"));
    let expected = [
        Token::Identifier(String::from("test")),
        Token::Colon,
        Token::Identifier(String::from("Test")),
    ];
    for (i, want) in expected.iter().enumerate() {
        assert_eq!(*want, tokens[i]);
    }
}
#[test]
fn enum_decl() {
    // An enum declaration: keyword, name, empty body.
    let tokens = tokenize(&String::from("enum Test {}"));
    let expected = [
        Token::Enum,
        Token::Identifier(String::from("Test")),
        Token::CurlyOpen,
        Token::CurlyClose,
    ];
    for (i, want) in expected.iter().enumerate() {
        assert_eq!(*want, tokens[i]);
    }
}
} }