Working on Java Tokenizer.

This commit is contained in:
Jesse Brault 2023-01-24 23:20:17 +01:00
parent 67cbb11dc3
commit 368e63656e
13 changed files with 743 additions and 0 deletions

View File

@ -3,4 +3,7 @@
<Head val=${ frontMatter.test } val=$test val="Hello!" val='Hello!'>
<meta val=${ binding.someVar } />
</Head>
<body>
</body>
</html>

View File

@ -0,0 +1,22 @@
package com.jessebrault.gcp.tokenizer;
import java.util.LinkedList;
import java.util.Queue;
/**
 * Collects tokens in the order they are produced while tracking the line and column
 * at which the next token will start.
 */
final class Accumulator {

    /** Tokens accumulated so far, in insertion order. */
    Queue<Token> tokens = new LinkedList<>();

    /** Line assigned to the next accumulated token; starts at 1. */
    int line = 1;

    /** Column assigned to the next accumulated token; starts at 1. */
    int col = 1;

    /**
     * Appends a token located at the current line and column, then moves the position past it.
     * A {@code NEWLINE} token starts a new line at column 1; every other token advances the
     * column by the length of its text.
     *
     * @param type the type of the token to add
     * @param text the text of the token to add
     */
    public void accumulate(Token.Type type, String text) {
        final Token token = new Token(type, text, this.line, this.col);
        this.tokens.add(token);
        if (type != Token.Type.NEWLINE) {
            this.col += text.length();
            return;
        }
        this.line++;
        this.col = 1;
    }
}

View File

@ -0,0 +1,233 @@
package com.jessebrault.gcp.tokenizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Deque;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.function.Function;
/**
 * Matches a dollar scriptlet, {@code ${ ... }}, located at the very start of the input.
 * <p>
 * The scan keeps track of nested curly braces, double-quoted strings (including
 * {@code ${ ... }} expressions nested inside them) and single-quoted strings, so that a
 * closing curly inside a string or a nested closure does not end the scriptlet early.
 * <p>
 * All scanning state is local to each {@link #apply(String)} call; the matcher itself holds
 * no mutable instance state.
 */
final class DollarScriptletMatcher implements Function<String, FsmOutput> {

    private static final Logger logger = LoggerFactory.getLogger(DollarScriptletMatcher.class);

    /** Match result exposing the four parts of a scriptlet: dollar, opening curly, body, closing curly. */
    private static final class DollarScriptletMatcherOutput implements FsmOutput {

        private final String entire;
        private final String dollar;
        private final String openingCurly;
        private final String scriptlet;
        private final String closingCurly;

        public DollarScriptletMatcherOutput(
                String entire,
                String dollar,
                String openingCurly,
                String scriptlet,
                String closingCurly
        ) {
            this.entire = entire;
            this.dollar = dollar;
            this.openingCurly = openingCurly;
            this.scriptlet = scriptlet;
            this.closingCurly = closingCurly;
        }

        @Override
        public String entire() {
            return this.entire;
        }

        /**
         * @param index 1 for the dollar, 2 for the opening curly, 3 for the scriptlet body,
         *              4 for the closing curly
         * @throws IllegalArgumentException if index is not between 1 and 4
         */
        @Override
        public String part(int index) {
            return switch (index) {
                case 1 -> this.dollar;
                case 2 -> this.openingCurly;
                case 3 -> this.scriptlet;
                case 4 -> this.closingCurly;
                default -> throw new IllegalArgumentException("index must be between 1 and 4, given: " + index);
            };
        }

    }

    /** Lexical context of the character currently being scanned. */
    private enum State {
        NO_STRING, G_STRING, SINGLE_QUOTE_STRING
    }

    /** Number of currently open curly braces within one scriptlet nesting level. */
    private static final class Counter {

        private int count = 0;

        public void increment() {
            this.count++;
        }

        public void decrement() {
            this.count--;
        }

        public boolean isZero() {
            return this.count == 0;
        }

        @Override
        public String toString() {
            return "Counter(" + this.count + ")";
        }

    }

    /** Returns the counter of the innermost scriptlet level, failing loudly if there is none. */
    private static Counter currentCounter(Deque<Counter> counterStack) {
        final Counter current = counterStack.peek();
        if (current == null) {
            throw new IllegalStateException("currentCounter is null");
        }
        return current;
    }

    /**
     * Attempts to match a dollar scriptlet at the start of {@code s}.
     *
     * @param s the input to scan; only a prefix of it is consumed
     * @return the match, or {@code null} if {@code s} does not start with {@code ${} or if the
     *         scriptlet is never closed by a matching curly brace
     * @throws IllegalArgumentException if the input ends directly after a backslash inside a
     *         string, or directly after a dollar inside a double-quoted string
     */
    @Override
    public FsmOutput apply(String s) {
        if (!s.startsWith("${")) {
            return null;
        }

        final Deque<State> stateStack = new LinkedList<>();
        final Deque<Counter> counterStack = new LinkedList<>();
        stateStack.push(State.NO_STRING);
        final Counter outermost = new Counter();
        outermost.increment(); // accounts for the opening curly consumed above
        counterStack.push(outermost);

        final StringBuilder entireAcc = new StringBuilder("${");
        final int length = s.length();
        int cur = 2;
        boolean closed = false;

        scan:
        while (cur < length) {
            final State state = stateStack.peek();
            if (state == null) {
                throw new IllegalStateException("stateStack is empty");
            }
            if (counterStack.isEmpty()) {
                throw new IllegalStateException("counterStack is empty");
            }

            final char c0 = s.charAt(cur++);
            entireAcc.append(c0);

            logger.debug("----");
            logger.debug("c0: {}", c0);

            switch (state) {
                case NO_STRING -> {
                    switch (c0) {
                        case '{' -> currentCounter(counterStack).increment();
                        case '}' -> {
                            final Counter counter = currentCounter(counterStack);
                            counter.decrement();
                            if (counter.isZero()) {
                                if (counterStack.size() == 1) {
                                    // The outermost scriptlet has been closed: the match is complete.
                                    logger.debug("last Counter is zero; breaking while loop");
                                    closed = true;
                                    break scan;
                                }
                                // A scriptlet nested inside a double-quoted string has been closed:
                                // resume scanning the enclosing string.
                                logger.debug("counterStack.size() is greater than 1 and top Counter is zero; " +
                                        "popping state and counter stacks.");
                                stateStack.pop();
                                counterStack.pop();
                            }
                        }
                        case '"' -> stateStack.push(State.G_STRING);
                        case '\'' -> stateStack.push(State.SINGLE_QUOTE_STRING);
                        default -> {
                            // any other character is plain scriptlet text
                        }
                    }
                }
                case G_STRING -> {
                    switch (c0) {
                        case '\\' -> {
                            // The escaped character is consumed without being interpreted.
                            if (cur >= length) {
                                throw new IllegalArgumentException("Ill-formed dollarScriptlet (backslash followed by nothing)");
                            }
                            entireAcc.append(s.charAt(cur++));
                        }
                        case '$' -> {
                            if (cur >= length) {
                                throw new IllegalArgumentException("Ill-formed dollarScriptlet (ends with a dollar)");
                            }
                            final char c1 = s.charAt(cur++);
                            entireAcc.append(c1);
                            if (c1 == '{') {
                                // A nested scriptlet starts: it gets its own brace counter.
                                stateStack.push(State.NO_STRING);
                                final Counter nested = new Counter();
                                nested.increment();
                                counterStack.push(nested);
                            }
                        }
                        case '"' -> {
                            logger.debug("popping G_STRING state");
                            stateStack.pop();
                        }
                        default -> {
                            // any other character is plain string content
                        }
                    }
                }
                case SINGLE_QUOTE_STRING -> {
                    switch (c0) {
                        case '\\' -> {
                            if (cur >= length) {
                                throw new IllegalArgumentException("Ill-formed dollarScriptlet (backslash followed by nothing)");
                            }
                            entireAcc.append(s.charAt(cur++));
                        }
                        case '\'' -> {
                            logger.debug("popping SINGLE_QUOTE_STRING state");
                            stateStack.pop();
                        }
                        default -> {
                            // any other character is plain string content
                        }
                    }
                }
            }

            logger.debug("entireAcc: {}", entireAcc);
            logger.debug("stateStack: {}", stateStack);
            logger.debug("counterStack: {}", counterStack);
        }

        if (!closed) {
            // The input ran out before the outermost curly was closed. Reporting a match here
            // would cut off the last consumed character (or fail outright for the input "${"),
            // so this is reported as "no match" instead.
            return null;
        }

        return new DollarScriptletMatcherOutput(
                entireAcc.toString(),
                "$",
                "{",
                entireAcc.substring(2, entireAcc.length() - 1),
                "}"
        );
    }

}

View File

@ -0,0 +1,6 @@
package com.jessebrault.gcp.tokenizer;
/**
* Result of a successful match, as returned by the matcher functions in this package
* (PatternMatcher and DollarScriptletMatcher).
*/
interface FsmOutput {
/**
* Returns the complete matched text.
*/
String entire();
/**
* Returns one piece of the match, selected by index.
* The meaning of the index depends on the implementation: MatchResultFsmOutput passes it
* to MatchResult.group(int), while the output of DollarScriptletMatcher accepts 1 to 4
* (dollar, opening curly, scriptlet body, closing curly).
*/
String part(int index);
}

View File

@ -0,0 +1,23 @@
package com.jessebrault.gcp.tokenizer;
import java.util.regex.MatchResult;
/**
 * {@link FsmOutput} backed by a regular-expression {@link MatchResult}.
 */
public class MatchResultFsmOutput implements FsmOutput {

    private final MatchResult matchResult;

    /**
     * @param matchResult the match whose groups this output exposes
     */
    public MatchResultFsmOutput(MatchResult matchResult) {
        this.matchResult = matchResult;
    }

    /** Returns the text matched by the whole pattern. */
    @Override
    public String entire() {
        return matchResult.group();
    }

    /** Returns the text captured by the group numbered {@code index}. */
    @Override
    public String part(int index) {
        return matchResult.group(index);
    }

}

View File

@ -0,0 +1,25 @@
package com.jessebrault.gcp.tokenizer;
import java.util.function.Function;
import java.util.regex.Pattern;
/**
 * Matcher function that searches its input with a fixed regular expression.
 */
final class PatternMatcher implements Function<String, FsmOutput> {

    private final Pattern pattern;

    /**
     * @param pattern the regular expression to search inputs with
     */
    public PatternMatcher(Pattern pattern) {
        this.pattern = pattern;
    }

    /**
     * Searches {@code s} for the pattern.
     *
     * @return the first match wrapped as an {@link FsmOutput}, or {@code null} if there is none
     */
    @Override
    public FsmOutput apply(String s) {
        final var matcher = pattern.matcher(s);
        if (!matcher.find()) {
            return null;
        }
        return new MatchResultFsmOutput(matcher);
    }

    @Override
    public String toString() {
        return "MatcherFunction(" + pattern + ")";
    }

}

View File

@ -0,0 +1,69 @@
package com.jessebrault.gcp.tokenizer;
import java.util.Collection;
/**
 * An immutable token: a type, the matched text, and the 1-based line and column
 * recorded for it by its creator.
 */
public final class Token {

    /** The kinds of token that can be produced. */
    public enum Type {
        LESS_THAN,
        GREATER_THAN,
        PERCENT,
        EQUALS,
        DOUBLE_QUOTE,
        SINGLE_QUOTE,
        FORWARD_SLASH,
        IDENTIFIER,
        CAPITALIZED_IDENTIFIER,
        DOT,
        WORD,
        WHITESPACE,
        NEWLINE,
        STRING,
        SCRIPTLET,
        DOLLAR,
        GROOVY_REFERENCE,
        CURLY_OPEN,
        CURLY_CLOSE;

        /** Returns whether this type is contained in the given collection. */
        boolean isAnyOf(Collection<Type> types) {
            return types.contains(this);
        }
    }

    private final Type type;
    private final String text;
    private final int line;
    private final int col;

    /**
     * @param type the kind of token
     * @param text the text the token covers
     * @param line the line the token starts on
     * @param col  the column the token starts at
     */
    public Token(Type type, String text, int line, int col) {
        this.type = type;
        this.text = text;
        this.line = line;
        this.col = col;
    }

    /** Returns the kind of this token. */
    public Type getType() {
        return this.type;
    }

    /** Returns the text this token covers. */
    public String getText() {
        return this.text;
    }

    /** Returns the line this token starts on. */
    public int getLine() {
        return this.line;
    }

    /** Returns the column this token starts at. */
    public int getCol() {
        return this.col;
    }

    @Override
    public String toString() {
        return String.format("Token(%s, %s, %d, %d)", type, text, line, col);
    }

}

View File

@ -0,0 +1,82 @@
package com.jessebrault.gcp.tokenizer;
import java.util.LinkedList;
import java.util.Queue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Splits source text into a flat sequence of {@link Token}s, recording the 1-based line and
 * column at which each token starts.
 */
public final class Tokenizer {

    /** Pairs a token type with the anchored pattern that recognizes it. */
    private static final class Rule {

        private final Token.Type type;
        private final Pattern pattern;

        private Rule(Token.Type type, String regex) {
            this.type = type;
            this.pattern = Pattern.compile(regex);
        }

    }

    /**
     * Rules in priority order: at each position the first rule that matches wins.
     * Every pattern is anchored and consumes at least one character.
     */
    private static final Rule[] RULES = {
            new Rule(Token.Type.LESS_THAN, "^<"),
            new Rule(Token.Type.GREATER_THAN, "^>"),
            new Rule(Token.Type.PERCENT, "^%"),
            new Rule(Token.Type.EQUALS, "^="),
            new Rule(Token.Type.DOUBLE_QUOTE, "^\""),
            new Rule(Token.Type.SINGLE_QUOTE, "^'"),
            new Rule(Token.Type.IDENTIFIER, "^[\\p{Ll}0-9_$][\\p{L}0-9_$]*"),
            new Rule(Token.Type.FORWARD_SLASH, "^/"),
            new Rule(Token.Type.CAPITALIZED_IDENTIFIER, "^\\p{Lu}[\\p{L}0-9_$]*"),
            new Rule(Token.Type.DOT, "^\\."),
            new Rule(Token.Type.WORD, "^[\\w\\W&&[^\\s\\n\\r]]+"),
            new Rule(Token.Type.WHITESPACE, "^[\\s&&[^\n\r]]"),
            // A carriage return followed by a line feed is a single line break; treating the
            // two characters as separate NEWLINE tokens would advance the line number twice.
            new Rule(Token.Type.NEWLINE, "^(?:\\r\\n|[\\n\\r])")
    };

    /**
     * Tokenizes the given source.
     *
     * @param gcpSrc the source text to tokenize
     * @return the tokens in source order; empty if {@code gcpSrc} is empty
     * @throws IllegalStateException if no rule matches at some position
     */
    public static Queue<Token> tokenize(final String gcpSrc) {
        final Queue<Token> tokens = new LinkedList<>();
        final int length = gcpSrc.length();
        int line = 1;
        int col = 1;
        int pos = 0;

        while (pos < length) {
            Token.Type type = null;
            String text = null;
            for (final Rule rule : RULES) {
                // Restricting the matcher to the unread region makes '^' match at pos,
                // which avoids copying the rest of the input after every token.
                final Matcher m = rule.pattern.matcher(gcpSrc).region(pos, length);
                if (m.lookingAt()) {
                    type = rule.type;
                    text = m.group();
                    break;
                }
            }
            if (text == null || text.isEmpty()) {
                throw new IllegalStateException(
                        "No token rule matches the input at line " + line + ", col " + col
                );
            }

            tokens.add(new Token(type, text, line, col));
            if (type == Token.Type.NEWLINE) {
                line++;
                col = 1;
            } else {
                col += text.length();
            }
            pos += text.length();
        }

        return tokens;
    }

}

View File

@ -0,0 +1,58 @@
package com.jessebrault.gcp.tokenizer;
import com.jessebrault.fsm.function.FunctionFsm;
import com.jessebrault.fsm.function.FunctionFsmBuilder;
import com.jessebrault.fsm.function.FunctionFsmBuilderImpl;
import static com.jessebrault.gcp.tokenizer.Token.Type.*;
import java.util.regex.Pattern;
/**
* Builds the function-based finite state machine used for tokenizing.
* Declares one matcher function per lexical element; of these, only dollarReference,
* dollarScriptlet and openingComponentStart are wired into the machine built by getFsm.
* NOTE(review): the remaining matchers are not referenced by the code shown here;
* presumably they are meant for states that are not implemented yet. Confirm.
*/
class TokenizerFsm {
private static final PatternMatcher lessThan = new PatternMatcher(Pattern.compile("^<"));
private static final PatternMatcher greaterThan = new PatternMatcher(Pattern.compile("^>"));
private static final PatternMatcher percent = new PatternMatcher(Pattern.compile("^%"));
private static final PatternMatcher equals = new PatternMatcher(Pattern.compile("^="));
private static final PatternMatcher forwardSlash = new PatternMatcher(Pattern.compile("^/"));
private static final PatternMatcher identifier = new PatternMatcher(Pattern.compile("^[\\p{Ll}0-9_$][\\p{L}0-9_$]*"));
private static final PatternMatcher capitalizedIdentifier = new PatternMatcher(Pattern.compile("^\\p{Lu}[\\p{L}0-9_$]*"));
private static final PatternMatcher dot = new PatternMatcher(Pattern.compile("^\\."));
private static final PatternMatcher word = new PatternMatcher(Pattern.compile("^[\\w\\W&&[^\\s\\n\\r]]+"));
private static final PatternMatcher whitespace = new PatternMatcher(Pattern.compile("^[\\s&&[^\n\r]]"));
private static final PatternMatcher newline = new PatternMatcher(Pattern.compile("^[\\n\\r]"));
// Group 1 captures the dollar sign, group 2 the dotted reference that follows it.
private static final PatternMatcher dollarReference = new PatternMatcher(Pattern.compile("^(\\$)([\\w$]+(?:\\.[\\w$]+)*)"));
// Matches only the '<'; the lookahead requires either an uppercase letter or a dotted
// sequence of letters to follow, without consuming it.
private static final PatternMatcher openingComponentStart = new PatternMatcher(Pattern.compile("^<(?=\\p{Lu}|\\p{L}+(?:\\.\\p{L}+)+)"));
private static final PatternMatcher doubleQuote = new PatternMatcher(Pattern.compile("^\""));
private static final PatternMatcher singleQuote = new PatternMatcher(Pattern.compile("^'"));
private static final DollarScriptletMatcher dollarScriptlet = new DollarScriptletMatcher();
// Fixes the builder's type arguments in one place: String input, TokenizerState states, FsmOutput outputs.
private static FunctionFsmBuilder<String, TokenizerState, FsmOutput> getFsmBuilder() {
return new FunctionFsmBuilderImpl<>();
}
/**
* Builds a state machine whose handlers add tokens to the given accumulator.
* Handlers are registered for the NORMAL state only:
* a dollar reference adds DOLLAR and GROOVY_REFERENCE tokens;
* a dollar scriptlet adds DOLLAR, CURLY_OPEN, SCRIPTLET and CURLY_CLOSE tokens;
* an opening component start adds a LESS_THAN token and shifts to the COMPONENT state.
* NOTE(review): the exact semantics of whileIn/on/shiftTo/exec (e.g. the order in which
* matchers are tried) are defined by the external fsm library. Confirm there.
*
* @param acc the accumulator that receives the tokens
*/
public FunctionFsm<String, TokenizerState, FsmOutput> getFsm(Accumulator acc) {
return getFsmBuilder()
.whileIn(TokenizerState.NORMAL, sc -> {
sc.on(dollarReference).exec(r -> {
acc.accumulate(DOLLAR, r.part(1));
acc.accumulate(GROOVY_REFERENCE, r.part(2));
});
sc.on(dollarScriptlet).exec(o -> {
acc.accumulate(DOLLAR, o.part(1));
acc.accumulate(CURLY_OPEN, o.part(2));
acc.accumulate(SCRIPTLET, o.part(3));
acc.accumulate(CURLY_CLOSE, o.part(4));
});
sc.on(openingComponentStart).shiftTo(TokenizerState.COMPONENT).exec(r -> {
acc.accumulate(LESS_THAN, r.entire());
});
})
.build();
}
}

View File

@ -0,0 +1,8 @@
package com.jessebrault.gcp.tokenizer;
/**
* States of the tokenizer's finite state machine.
*/
enum TokenizerState {
// The only state for which TokenizerFsm registers handlers.
NORMAL,
// Entered from NORMAL when the start of an opening component tag is matched.
COMPONENT,
// NOTE(review): G_STRING and SCRIPTLET are not referenced by the code shown in this change;
// presumably reserved for tokenizing inside double-quoted strings and scriptlets. Confirm.
G_STRING,
SCRIPTLET
}

View File

@ -0,0 +1,60 @@
package com.jessebrault.gcp.tokenizer
import org.junit.jupiter.api.Test
import static org.junit.jupiter.api.Assertions.assertEquals
/**
 * Checks that DollarScriptletMatcher recognizes a dollar scriptlet at the start of its input
 * and splits it into dollar, opening curly, body and closing curly.
 */
class DollarScriptletMatcherTests {

    private final DollarScriptletMatcher matcher = new DollarScriptletMatcher()

    /**
     * Runs the matcher on the input and verifies both the entire match and its four parts.
     * The expected body is the expected match without its leading '${' and trailing '}'.
     */
    private void test(String expectedEntire, String input) {
        def expectedScriptlet = expectedEntire.substring(2, expectedEntire.length() - 1)
        def output = matcher.apply(input)
        assertEquals(expectedEntire, output.entire())
        assertEquals('$', output.part(1))
        assertEquals('{', output.part(2))
        assertEquals(expectedScriptlet, output.part(3))
        assertEquals('}', output.part(4))
    }

    @Test
    void empty() {
        test('${}', '${}')
    }

    @Test
    void simple() {
        test('${ 1 + 2 }', '${ 1 + 2 }')
    }

    @Test
    void nestedString() {
        test('${ "myString" }', '${ "myString" }')
    }

    @Test
    void nestedCurlyBraces() {
        test('${ [1, 2, 3].collect { it + 1 }.size() }', '${ [1, 2, 3].collect { it + 1 }.size() }')
    }

    @Test
    void nestedSingleQuoteString() {
        test('${ \'abc\' }', '${ \'abc\' }')
    }

    @Test
    void nestedGString() {
        test('${ "abc" }', '${ "abc" }')
    }

    @Test
    void nestedGStringWithClosure() {
        test('${ "abc${ it }" }', '${ "abc${ it }" }')
    }

    // Only the first scriptlet is matched; the rest of the input is left alone.
    @Test
    void takesOnlyAsNeeded() {
        test('${ 1 + 2 }', '${ 1 + 2 } someOther=${ 3 + 4 }')
    }

}

View File

@ -0,0 +1,137 @@
package com.jessebrault.gcp.tokenizer
import org.junit.jupiter.api.Test
import org.slf4j.Logger
import org.slf4j.LoggerFactory
import static com.jessebrault.gcp.tokenizer.Token.Type.*
import static org.junit.jupiter.api.Assertions.assertEquals
import static org.junit.jupiter.api.Assertions.assertTrue
/**
* Tests for Tokenizer.tokenize, written with a small expectation DSL:
* each test lists the tokens it expects, in order, inside a closure.
*/
class TokenizerTests {
private static final Logger logger = LoggerFactory.getLogger(TokenizerTests)
/**
* One expected token. A null text, or a line/col of 0, means "do not check this field".
*/
private static class TokenSpec {
Token.Type type
String text
int line
int col
TokenSpec(Token.Type type, String text = null, line = 0, col = 0) {
this.type = Objects.requireNonNull(type)
this.text = text
this.line = line
this.col = col
}
// The type is always compared; text, line and col only when they were specified.
void compare(Token actual) {
assertEquals(this.type, actual.type)
if (this.text != null) {
assertEquals(this.text, actual.text)
}
if (this.line != 0) {
assertEquals(this.line, actual.line)
}
if (this.col != 0) {
assertEquals(this.col, actual.col)
}
}
@Override
String toString() {
"TokenSpec(${ this.type }, ${ this.text }, ${ this.line }, ${ this.col })"
}
}
/**
* Delegate of the closure passed to test(); collects the expected tokens in order.
*/
private static class TesterConfigurator {
Queue<TokenSpec> specs = new LinkedList<>()
void expect(Token.Type type, String text = null, line = 0, col = 0) {
this.specs << new TokenSpec(type, text, line, col)
}
}
/**
* Runs the closure against a fresh TesterConfigurator to gather expectations, tokenizes src,
* then checks that the number of tokens matches and compares each token with its spec in order.
*/
private static void test(
String src,
@DelegatesTo(value = TesterConfigurator, strategy = Closure.DELEGATE_FIRST)
Closure<Void> configure
) {
def configurator = new TesterConfigurator()
configure.setDelegate(configurator)
configure.setResolveStrategy(Closure.DELEGATE_FIRST)
configure()
def r = Tokenizer.tokenize(src)
logger.debug('r: {}', r)
logger.debug('configurator.specs: {}', configurator.specs)
assertEquals(configurator.specs.size(), r.size())
def resultIterator = r.iterator()
configurator.specs.each {
assertTrue(resultIterator.hasNext())
it.compare(resultIterator.next())
}
}
@Test
void doctypeHtml() {
test('<!DOCTYPE html>') {
expect LESS_THAN, '<', 1, 1
expect WORD, '!DOCTYPE', 1, 2
expect WHITESPACE, ' ', 1, 10
expect IDENTIFIER, 'html', 1, 11
expect GREATER_THAN, '>', 1, 15
}
}
@Test
void htmlLangEn() {
test('<html lang="en">') {
expect LESS_THAN, '<', 1, 1
expect IDENTIFIER, 'html', 1, 2
expect WHITESPACE, ' ', 1, 6
expect IDENTIFIER, 'lang', 1, 7
expect EQUALS, '=', 1, 11
expect DOUBLE_QUOTE, '"', 1, 12
expect IDENTIFIER, 'en', 1, 13
expect DOUBLE_QUOTE, '"', 1, 15
expect GREATER_THAN, '>', 1, 16
}
}
@Test
void component() {
test('<Test />') {
expect LESS_THAN, '<', 1, 1
expect CAPITALIZED_IDENTIFIER, 'Test', 1, 2
expect WHITESPACE, ' ', 1, 6
expect FORWARD_SLASH, '/', 1, 7
expect GREATER_THAN, '>', 1, 8
}
}
// NOTE(review): this test expects a STRING token for the quoted attribute value, but
// Tokenizer.tokenize as shown in this change has no branch that produces STRING tokens
// (htmlLangEn above expects IDENTIFIER for the same construct). Presumably this describes
// the intended behavior of the tokenizer still being worked on. Confirm.
@Test
void componentWithKeysAndValues() {
test('<Test test="test" />') {
expect LESS_THAN, '<', 1, 1
expect CAPITALIZED_IDENTIFIER, 'Test', 1, 2
expect WHITESPACE, ' ', 1, 6
expect IDENTIFIER, 'test', 1, 7
expect EQUALS, '=', 1, 11
expect DOUBLE_QUOTE, '"', 1, 12
expect STRING, 'test', 1, 13
expect DOUBLE_QUOTE, '"', 1, 17
expect WHITESPACE, ' ', 1, 18
expect FORWARD_SLASH, '/', 1, 19
expect GREATER_THAN, '>', 1, 20
}
}
}

View File

@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8" ?>
<Configuration name="ssg" status="WARN">
<Appenders>
<Console name="standard" target="SYSTEM_OUT">
<PatternLayout>
<MarkerPatternSelector defaultPattern="%highlight{%-5level} %logger{1} %M %L: %msg%n%ex">
<PatternMatch key="FLOW" pattern="%highlight{%-5level} %logger{1} %M %L: %markerSimpleName %msg%n%ex" />
</MarkerPatternSelector>
</PatternLayout>
</Console>
</Appenders>
<Loggers>
<Root level="trace">
<AppenderRef ref="standard" />
</Root>
</Loggers>
</Configuration>