Tokenizer working, more tests to do.

This commit is contained in:
Jesse Brault 2023-01-25 13:54:23 +01:00
parent 368e63656e
commit 3fee229003
6 changed files with 220 additions and 138 deletions

View File

@ -2,21 +2,29 @@ package com.jessebrault.gcp.tokenizer;
import java.util.LinkedList;
import java.util.Queue;
import java.util.regex.Pattern;
final class Accumulator {
Queue<Token> tokens = new LinkedList<>();
int line = 1;
int col = 1;
private static final Pattern newline = Pattern.compile("([\n\r])");
private final Queue<Token> tokens = new LinkedList<>();
private int line = 1;
private int col = 1;
public void accumulate(Token.Type type, String text) {
this.tokens.add(new Token(type, text, this.line, this.col));
if (type == Token.Type.NEWLINE) {
this.line++;
final var m = newline.matcher(text);
if (m.find()) {
this.line += m.groupCount();
this.col = 1;
} else {
this.col += text.length();
}
}
public Queue<Token> getTokens() {
return this.tokens;
}
}

View File

@ -5,26 +5,33 @@ import java.util.Collection;
public final class Token {
public enum Type {
LESS_THAN,
GREATER_THAN,
PERCENT,
EQUALS,
DOUBLE_QUOTE,
SINGLE_QUOTE,
FORWARD_SLASH,
IDENTIFIER,
CAPITALIZED_IDENTIFIER,
DOT,
WORD,
WHITESPACE,
NEWLINE,
STRING,
SCRIPTLET,
TEXT,
DOLLAR,
GROOVY_REFERENCE,
CURLY_OPEN,
CURLY_CLOSE
SCRIPTLET,
CURLY_CLOSE,
BLOCK_SCRIPTLET_OPEN,
EXPRESSION_SCRIPTLET_OPEN,
SCRIPTLET_CLOSE,
CLASS_NAME,
PACKAGE_NAME,
DOT,
WHITESPACE,
KEY,
EQUALS,
DOUBLE_QUOTE,
STRING,
SINGLE_QUOTE,
COMPONENT_START,
FORWARD_SLASH,
COMPONENT_END,
;
boolean isAnyOf(Collection<Type> types) {

View File

@ -1,82 +1,23 @@
package com.jessebrault.gcp.tokenizer;
import java.util.LinkedList;
import java.util.Queue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public final class Tokenizer {
private static final Pattern lessThan = Pattern.compile("^<");
private static final Pattern greaterThan = Pattern.compile("^>");
private static final Pattern percent = Pattern.compile("^%");
private static final Pattern equals = Pattern.compile("^=");
private static final Pattern doubleQuote = Pattern.compile("^\"");
private static final Pattern singleQuote = Pattern.compile("^'");
private static final Pattern forwardSlash = Pattern.compile("^/");
private static final Pattern identifier = Pattern.compile("^[\\p{Ll}0-9_$][\\p{L}0-9_$]*");
private static final Pattern capitalizedIdentifier = Pattern.compile("^\\p{Lu}[\\p{L}0-9_$]*");
private static final Pattern dot = Pattern.compile("^\\.");
private static final Pattern word = Pattern.compile("^[\\w\\W&&[^\\s\\n\\r]]+");
private static final Pattern whitespace = Pattern.compile("^[\\s&&[^\n\r]]");
private static final Pattern newline = Pattern.compile("^[\\n\\r]");
public static Queue<Token> tokenize(final String gcpSrc) {
Queue<Token> tokens = new LinkedList<>();
final var acc = new Accumulator();
final var fsm = TokenizerFsm.get(acc);
var line = 1;
var col = 1;
String remaining = gcpSrc;
while (remaining.length() > 0) {
Matcher m;
if ((m = lessThan.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.LESS_THAN, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = greaterThan.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.GREATER_THAN, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = percent.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.PERCENT, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = equals.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.EQUALS, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = doubleQuote.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.DOUBLE_QUOTE, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = singleQuote.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.SINGLE_QUOTE, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = identifier.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.IDENTIFIER, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = forwardSlash.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.FORWARD_SLASH, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = capitalizedIdentifier.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.CAPITALIZED_IDENTIFIER, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = dot.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.DOT, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = word.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.WORD, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = whitespace.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.WHITESPACE, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = newline.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.NEWLINE, m.group(0), line, col));
col = 1;
line++;
final var o = fsm.apply(remaining);
if (o == null) {
throw new IllegalStateException();
}
remaining = remaining.substring(m.group(0).length());
remaining = remaining.substring(o.entire().length());
}
return tokens;
return acc.getTokens();
}
}

View File

@ -8,39 +8,85 @@ import static com.jessebrault.gcp.tokenizer.Token.Type.*;
import java.util.regex.Pattern;
/**
 * Holds the regex-backed matchers used by the tokenizer and builds the state
 * machine whose handlers push tokens into an {@link Accumulator}.
 *
 * NOTE(review): this listing shows removed and added lines of the commit side
 * by side (for example two class declarations and two factory signatures);
 * confirm details against the checked-out file.
 */
class TokenizerFsm {
final class TokenizerFsm {
private static final PatternMatcher lessThan = new PatternMatcher(Pattern.compile("^<"));
private static final PatternMatcher greaterThan = new PatternMatcher(Pattern.compile("^>"));
private static final PatternMatcher percent = new PatternMatcher(Pattern.compile("^%"));
private static final PatternMatcher equals = new PatternMatcher(Pattern.compile("^="));
private static final PatternMatcher forwardSlash = new PatternMatcher(Pattern.compile("^/"));
private static final PatternMatcher identifier = new PatternMatcher(Pattern.compile("^[\\p{Ll}0-9_$][\\p{L}0-9_$]*"));
private static final PatternMatcher capitalizedIdentifier = new PatternMatcher(Pattern.compile("^\\p{Lu}[\\p{L}0-9_$]*"));
private static final PatternMatcher dot = new PatternMatcher(Pattern.compile("^\\."));
private static final PatternMatcher word = new PatternMatcher(Pattern.compile("^[\\w\\W&&[^\\s\\n\\r]]+"));
private static final PatternMatcher whitespace = new PatternMatcher(Pattern.compile("^[\\s&&[^\n\r]]"));
private static final PatternMatcher newline = new PatternMatcher(Pattern.compile("^[\\n\\r]"));
private static final PatternMatcher dollarReference = new PatternMatcher(Pattern.compile("^(\\$)([\\w$]+(?:\\.[\\w$]+)*)"));
private static final PatternMatcher openingComponentStart = new PatternMatcher(Pattern.compile("^<(?=\\p{Lu}|\\p{L}+(?:\\.\\p{L}+)+)"));
private static final PatternMatcher doubleQuote = new PatternMatcher(Pattern.compile("^\""));
private static final PatternMatcher singleQuote = new PatternMatcher(Pattern.compile("^'"));
/**
* Text
*/
// Consumes a run of characters up to the next '<' or '$' that could begin a
// scriptlet, a component tag, or a dollar reference (see the negative lookaheads).
private static final PatternMatcher text = new PatternMatcher(
Pattern.compile("^(?:[\\w\\W&&[^<$]]|<(?!%|/?\\p{Lu}|/?[\\p{L}0-9_$]+(?:\\.[\\p{L}0-9_$]+)+)|\\$(?![\\w$]+(?:\\.[\\w$]+)*))+")
);
/**
* Gsp dollar reference and scriptlets, also used as component values
*/
// Group 1 is the '$' sign, group 2 the dotted reference that follows it.
private static final PatternMatcher dollarReference = new PatternMatcher(
Pattern.compile("^(\\$)([\\w$]+(?:\\.[\\w$]+)*)")
);
private static final DollarScriptletMatcher dollarScriptlet = new DollarScriptletMatcher();
// NOTE(review): '.' does not match line terminators unless Pattern.DOTALL is set,
// so a scriptlet body spanning several lines would not match either pattern
// below — confirm whether that is intended.
private static final PatternMatcher blockScriptlet = new PatternMatcher(
Pattern.compile("^(<%)(.*?)(%>)")
);
private static final PatternMatcher expressionScriptlet = new PatternMatcher(
Pattern.compile("^(<%=)(.*?)(%>)")
);
/**
* Component starts
*/
// Both patterns only look ahead at the name: the '<' (and '/') are consumed,
// the capitalized or dotted name is left for the COMPONENT_NAME state.
private static final PatternMatcher openingComponentStart = new PatternMatcher(
Pattern.compile("^<(?=\\p{Lu}|[\\p{L}0-9_$]+(?:\\.[\\p{L}0-9_$]+)+)")
);
private static final PatternMatcher closingComponentStart = new PatternMatcher(
Pattern.compile("^(<)(/)(?=\\p{Lu}|[\\p{L}0-9_$]+(?:\\.[\\p{L}0-9_$]+)+)")
);
/**
* Component names
*/
private static final PatternMatcher className = new PatternMatcher(
Pattern.compile("^\\p{Lu}[\\p{L}0-9_$]*")
);
// A package segment is only recognized when a '.' follows it (lookahead).
private static final PatternMatcher packageName = new PatternMatcher(
Pattern.compile("^[\\p{L}0-9_$]+(?=\\.)")
);
private static final PatternMatcher dot = new PatternMatcher(
Pattern.compile("^\\.")
);
/**
* Whitespace
*/
// One or more whitespace characters, excluding '\n' and '\r'.
private static final PatternMatcher whitespace = new PatternMatcher(Pattern.compile("^[\\s&&[^\n\r]]+"));
/**
* Keys and values
*/
private static final PatternMatcher key = new PatternMatcher(
Pattern.compile("^[\\p{L}0-9_$]+")
);
private static final PatternMatcher equals = new PatternMatcher(Pattern.compile("^="));
/**
* Component ends
*/
private static final PatternMatcher forwardSlash = new PatternMatcher(Pattern.compile("^/"));
private static final PatternMatcher componentEnd = new PatternMatcher(Pattern.compile("^>"));
// Creates the builder used by the factory method below.
private static FunctionFsmBuilder<String, TokenizerState, FsmOutput> getFsmBuilder() {
return new FunctionFsmBuilderImpl<>();
}
/**
 * Builds a state machine that starts in NORMAL and whose handlers record
 * tokens in the given accumulator.
 *
 * @param acc the accumulator that receives every token produced by a handler
 * @return the built state machine
 */
public FunctionFsm<String, TokenizerState, FsmOutput> getFsm(Accumulator acc) {
public static FunctionFsm<String, TokenizerState, FsmOutput> get(Accumulator acc) {
return getFsmBuilder()
.setInitialState(TokenizerState.NORMAL)
// NORMAL: plain text, dollar references, scriptlets, and component starts.
.whileIn(TokenizerState.NORMAL, sc -> {
sc.on(dollarReference).exec(r -> {
acc.accumulate(DOLLAR, r.part(1));
acc.accumulate(GROOVY_REFERENCE, r.part(2));
sc.on(text).exec(o -> {
acc.accumulate(TEXT, o.entire());
});
sc.on(dollarReference).exec(o -> {
acc.accumulate(DOLLAR, o.part(1));
acc.accumulate(GROOVY_REFERENCE, o.part(2));
});
sc.on(dollarScriptlet).exec(o -> {
acc.accumulate(DOLLAR, o.part(1));
@ -48,11 +94,92 @@ class TokenizerFsm {
acc.accumulate(SCRIPTLET, o.part(3));
acc.accumulate(CURLY_CLOSE, o.part(4));
});
sc.on(openingComponentStart).shiftTo(TokenizerState.COMPONENT).exec(r -> {
acc.accumulate(LESS_THAN, r.entire());
// NOTE(review): the blockScriptlet pattern "^(<%)(.*?)(%>)" also matches input
// that begins with "<%="; if matchers are tried in registration order, the
// expressionScriptlet handler below would never fire — confirm the FSM's
// matching order.
sc.on(blockScriptlet).exec(o -> {
acc.accumulate(BLOCK_SCRIPTLET_OPEN, o.part(1));
acc.accumulate(SCRIPTLET, o.part(2));
acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
});
sc.on(expressionScriptlet).exec(o -> {
acc.accumulate(EXPRESSION_SCRIPTLET_OPEN, o.part(1));
acc.accumulate(SCRIPTLET, o.part(2));
acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
});
sc.on(openingComponentStart).shiftTo(TokenizerState.COMPONENT_NAME).exec(o ->
acc.accumulate(COMPONENT_START, o.entire())
);
sc.on(closingComponentStart).shiftTo(TokenizerState.COMPONENT_NAME).exec(o -> {
acc.accumulate(COMPONENT_START, o.part(1));
acc.accumulate(FORWARD_SLASH, o.part(2));
});
sc.onNoMatch().exec(input -> { throw new IllegalArgumentException(); });
})
// COMPONENT_NAME: package segments, dots, and the class name; whitespace moves
// on to keys and values, '>' returns to NORMAL.
.whileIn(TokenizerState.COMPONENT_NAME, sc -> {
sc.on(packageName).exec(o -> {
acc.accumulate(PACKAGE_NAME, o.entire());
});
sc.on(dot).exec(o -> {
acc.accumulate(DOT, o.entire());
});
sc.on(className).exec(o -> {
acc.accumulate(CLASS_NAME, o.entire());
});
sc.on(forwardSlash).exec(o -> {
acc.accumulate(FORWARD_SLASH, o.entire());
});
sc.on(componentEnd).shiftTo(TokenizerState.NORMAL).exec(o -> {
acc.accumulate(COMPONENT_END, o.entire());
});
sc.on(whitespace).shiftTo(TokenizerState.COMPONENT_KEYS_AND_VALUES).exec(o -> {
acc.accumulate(WHITESPACE, o.entire());
});
sc.onNoMatch().exec(input -> { throw new IllegalArgumentException(); });
})
// COMPONENT_KEYS_AND_VALUES: keys, '=', and values until '>' returns to NORMAL.
.whileIn(TokenizerState.COMPONENT_KEYS_AND_VALUES, sc -> {
sc.on(componentEnd).shiftTo(TokenizerState.NORMAL).exec(o -> {
acc.accumulate(COMPONENT_END, o.entire());
});
sc.on(whitespace).exec(o -> {
acc.accumulate(WHITESPACE, o.entire());
});
sc.on(key).exec(o -> {
acc.accumulate(KEY, o.entire());
});
sc.on(equals).exec(o -> {
acc.accumulate(EQUALS, o.entire());
});
// Quoted string values are not handled yet: both matchers are still commented out.
// sc.on(gString)
// sc.on(singleQuoteString)
sc.on(dollarReference).exec(o -> {
acc.accumulate(DOLLAR, o.part(1));
acc.accumulate(GROOVY_REFERENCE, o.part(2));
});
sc.on(dollarScriptlet).exec(o -> {
acc.accumulate(DOLLAR, o.part(1));
acc.accumulate(CURLY_OPEN, o.part(2));
acc.accumulate(SCRIPTLET, o.part(3));
acc.accumulate(CURLY_CLOSE, o.part(4));
});
// NOTE(review): same ordering question as in NORMAL — blockScriptlet is
// registered before expressionScriptlet.
sc.on(blockScriptlet).exec(o -> {
acc.accumulate(BLOCK_SCRIPTLET_OPEN, o.part(1));
acc.accumulate(SCRIPTLET, o.part(2));
acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
});
sc.on(expressionScriptlet).exec(o -> {
acc.accumulate(EXPRESSION_SCRIPTLET_OPEN, o.part(1));
acc.accumulate(SCRIPTLET, o.part(2));
acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
});
sc.on(forwardSlash).exec(o -> {
acc.accumulate(FORWARD_SLASH, o.entire());
});
// NOTE(review): componentEnd is already registered as the first handler of this
// state; this second registration looks redundant — confirm.
sc.on(componentEnd).shiftTo(TokenizerState.NORMAL).exec(o -> {
acc.accumulate(COMPONENT_END, o.entire());
});
sc.onNoMatch().exec(input -> { throw new IllegalArgumentException(); });
})
.build();
}
}

View File

@ -2,7 +2,6 @@ package com.jessebrault.gcp.tokenizer;
/**
 * States of the tokenizer state machine.
 *
 * NOTE(review): this listing shows removed and added constants together;
 * confirm the final set against the checked-out file.
 */
enum TokenizerState {
// Initial state set by TokenizerFsm.get.
NORMAL,
// NOTE(review): no whileIn handler is registered for COMPONENT, G_STRING, or
// SCRIPTLET in the visible TokenizerFsm code — presumably superseded; confirm.
COMPONENT,
G_STRING,
SCRIPTLET
// Entered from NORMAL when an opening or closing component start is matched.
COMPONENT_NAME,
// Entered from COMPONENT_NAME when whitespace is matched.
COMPONENT_KEYS_AND_VALUES
}

View File

@ -1,5 +1,6 @@
package com.jessebrault.gcp.tokenizer
import org.junit.jupiter.api.Disabled
import org.junit.jupiter.api.Test
import org.slf4j.Logger
import org.slf4j.LoggerFactory
@ -81,56 +82,55 @@ class TokenizerTests {
}
// Input is '<!DOCTYPE html>'; the TEXT expectation covers the entire input
// starting at line 1, column 1.
@Test
void doctypeHtml() {
void doctypeHtmlIsText() {
test('<!DOCTYPE html>') {
expect LESS_THAN, '<', 1, 1
expect WORD, '!DOCTYPE', 1, 2
expect WHITESPACE, ' ', 1, 10
expect IDENTIFIER, 'html', 1, 11
expect GREATER_THAN, '>', 1, 15
expect TEXT, '<!DOCTYPE html>', 1, 1
}
}
// Input is '<html lang="en">', a tag whose name is neither capitalized nor
// dotted; the TEXT expectation covers the entire input starting at line 1, column 1.
@Test
void htmlLangEn() {
void htmlLangEnIsText() {
test('<html lang="en">') {
expect LESS_THAN, '<', 1, 1
expect IDENTIFIER, 'html', 1, 2
expect WHITESPACE, ' ', 1, 6
expect IDENTIFIER, 'lang', 1, 7
expect EQUALS, '=', 1, 11
expect DOUBLE_QUOTE, '"', 1, 12
expect IDENTIFIER, 'en', 1, 13
expect DOUBLE_QUOTE, '"', 1, 15
expect GREATER_THAN, '>', 1, 16
expect TEXT, '<html lang="en">', 1, 1
}
}
// Input is the self-closing component '<Test />'; each expectation gives the
// token type, its text, and its 1-based line and column.
@Test
void component() {
test('<Test />') {
expect LESS_THAN, '<', 1, 1
expect CAPITALIZED_IDENTIFIER, 'Test', 1, 2
expect COMPONENT_START, '<', 1, 1
expect CLASS_NAME, 'Test', 1, 2
expect WHITESPACE, ' ', 1, 6
expect FORWARD_SLASH, '/', 1, 7
expect GREATER_THAN, '>', 1, 8
expect COMPONENT_END, '>', 1, 8
}
}
// Input is a component with one key="value" attribute; the expectations include
// DOUBLE_QUOTE and STRING tokens for the quoted value.
// NOTE(review): presumably disabled because the quoted-string matchers in
// TokenizerFsm's COMPONENT_KEYS_AND_VALUES state are still commented out — confirm.
@Test
@Disabled
void componentWithKeysAndValues() {
test('<Test test="test" />') {
expect LESS_THAN, '<', 1, 1
expect CAPITALIZED_IDENTIFIER, 'Test', 1, 2
expect COMPONENT_START, '<', 1, 1
expect CLASS_NAME, 'Test', 1, 2
expect WHITESPACE, ' ', 1, 6
expect IDENTIFIER, 'test', 1, 7
expect KEY, 'test', 1, 7
expect EQUALS, '=', 1, 11
expect DOUBLE_QUOTE, '"', 1, 12
expect STRING, 'test', 1, 13
expect DOUBLE_QUOTE, '"', 1, 17
expect WHITESPACE, ' ', 1, 18
expect FORWARD_SLASH, '/', 1, 19
expect GREATER_THAN, '>', 1, 20
expect COMPONENT_END, '>', 1, 20
}
}
// A newline inside a TEXT token must advance the line counter: every token
// after 'Hello,\n' is expected on line 2, with columns restarting at 1.
@Test
void newlinesCounted() {
test('Hello,\n$person!') {
expect TEXT, 'Hello,\n', 1, 1
expect DOLLAR, '$', 2, 1
expect GROOVY_REFERENCE, 'person', 2, 2
expect TEXT, '!', 2, 8
}
}