Tokenizer working, more tests to do.

This commit is contained in:
Jesse Brault 2023-01-25 13:54:23 +01:00
parent 368e63656e
commit 3fee229003
6 changed files with 220 additions and 138 deletions

View File

@ -2,21 +2,29 @@ package com.jessebrault.gcp.tokenizer;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.Queue; import java.util.Queue;
import java.util.regex.Pattern;
final class Accumulator { final class Accumulator {
Queue<Token> tokens = new LinkedList<>(); private static final Pattern newline = Pattern.compile("([\n\r])");
int line = 1;
int col = 1; private final Queue<Token> tokens = new LinkedList<>();
private int line = 1;
private int col = 1;
public void accumulate(Token.Type type, String text) { public void accumulate(Token.Type type, String text) {
this.tokens.add(new Token(type, text, this.line, this.col)); this.tokens.add(new Token(type, text, this.line, this.col));
if (type == Token.Type.NEWLINE) { final var m = newline.matcher(text);
this.line++; if (m.find()) {
this.line += m.groupCount();
this.col = 1; this.col = 1;
} else { } else {
this.col += text.length(); this.col += text.length();
} }
} }
public Queue<Token> getTokens() {
return this.tokens;
}
} }

View File

@ -5,26 +5,33 @@ import java.util.Collection;
public final class Token { public final class Token {
public enum Type { public enum Type {
LESS_THAN, TEXT,
GREATER_THAN,
PERCENT,
EQUALS,
DOUBLE_QUOTE,
SINGLE_QUOTE,
FORWARD_SLASH,
IDENTIFIER,
CAPITALIZED_IDENTIFIER,
DOT,
WORD,
WHITESPACE,
NEWLINE,
STRING,
SCRIPTLET,
DOLLAR, DOLLAR,
GROOVY_REFERENCE, GROOVY_REFERENCE,
CURLY_OPEN, CURLY_OPEN,
CURLY_CLOSE SCRIPTLET,
CURLY_CLOSE,
BLOCK_SCRIPTLET_OPEN,
EXPRESSION_SCRIPTLET_OPEN,
SCRIPTLET_CLOSE,
CLASS_NAME,
PACKAGE_NAME,
DOT,
WHITESPACE,
KEY,
EQUALS,
DOUBLE_QUOTE,
STRING,
SINGLE_QUOTE,
COMPONENT_START,
FORWARD_SLASH,
COMPONENT_END,
; ;
boolean isAnyOf(Collection<Type> types) { boolean isAnyOf(Collection<Type> types) {

View File

@ -1,82 +1,23 @@
package com.jessebrault.gcp.tokenizer; package com.jessebrault.gcp.tokenizer;
import java.util.LinkedList;
import java.util.Queue; import java.util.Queue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public final class Tokenizer { public final class Tokenizer {
private static final Pattern lessThan = Pattern.compile("^<");
private static final Pattern greaterThan = Pattern.compile("^>");
private static final Pattern percent = Pattern.compile("^%");
private static final Pattern equals = Pattern.compile("^=");
private static final Pattern doubleQuote = Pattern.compile("^\"");
private static final Pattern singleQuote = Pattern.compile("^'");
private static final Pattern forwardSlash = Pattern.compile("^/");
private static final Pattern identifier = Pattern.compile("^[\\p{Ll}0-9_$][\\p{L}0-9_$]*");
private static final Pattern capitalizedIdentifier = Pattern.compile("^\\p{Lu}[\\p{L}0-9_$]*");
private static final Pattern dot = Pattern.compile("^\\.");
private static final Pattern word = Pattern.compile("^[\\w\\W&&[^\\s\\n\\r]]+");
private static final Pattern whitespace = Pattern.compile("^[\\s&&[^\n\r]]");
private static final Pattern newline = Pattern.compile("^[\\n\\r]");
public static Queue<Token> tokenize(final String gcpSrc) { public static Queue<Token> tokenize(final String gcpSrc) {
Queue<Token> tokens = new LinkedList<>(); final var acc = new Accumulator();
final var fsm = TokenizerFsm.get(acc);
var line = 1;
var col = 1;
String remaining = gcpSrc; String remaining = gcpSrc;
while (remaining.length() > 0) { while (remaining.length() > 0) {
Matcher m; final var o = fsm.apply(remaining);
if (o == null) {
if ((m = lessThan.matcher(remaining)).find()) { throw new IllegalStateException();
tokens.add(new Token(Token.Type.LESS_THAN, m.group(0), line, col)); }
col += m.group(0).length(); remaining = remaining.substring(o.entire().length());
} else if ((m = greaterThan.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.GREATER_THAN, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = percent.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.PERCENT, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = equals.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.EQUALS, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = doubleQuote.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.DOUBLE_QUOTE, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = singleQuote.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.SINGLE_QUOTE, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = identifier.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.IDENTIFIER, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = forwardSlash.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.FORWARD_SLASH, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = capitalizedIdentifier.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.CAPITALIZED_IDENTIFIER, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = dot.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.DOT, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = word.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.WORD, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = whitespace.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.WHITESPACE, m.group(0), line, col));
col += m.group(0).length();
} else if ((m = newline.matcher(remaining)).find()) {
tokens.add(new Token(Token.Type.NEWLINE, m.group(0), line, col));
col = 1;
line++;
} }
remaining = remaining.substring(m.group(0).length()); return acc.getTokens();
}
return tokens;
} }
} }

View File

@ -8,39 +8,85 @@ import static com.jessebrault.gcp.tokenizer.Token.Type.*;
import java.util.regex.Pattern; import java.util.regex.Pattern;
class TokenizerFsm { final class TokenizerFsm {
private static final PatternMatcher lessThan = new PatternMatcher(Pattern.compile("^<")); /**
private static final PatternMatcher greaterThan = new PatternMatcher(Pattern.compile("^>")); * Text
private static final PatternMatcher percent = new PatternMatcher(Pattern.compile("^%")); */
private static final PatternMatcher equals = new PatternMatcher(Pattern.compile("^=")); private static final PatternMatcher text = new PatternMatcher(
private static final PatternMatcher forwardSlash = new PatternMatcher(Pattern.compile("^/")); Pattern.compile("^(?:[\\w\\W&&[^<$]]|<(?!%|/?\\p{Lu}|/?[\\p{L}0-9_$]+(?:\\.[\\p{L}0-9_$]+)+)|\\$(?![\\w$]+(?:\\.[\\w$]+)*))+")
private static final PatternMatcher identifier = new PatternMatcher(Pattern.compile("^[\\p{Ll}0-9_$][\\p{L}0-9_$]*")); );
private static final PatternMatcher capitalizedIdentifier = new PatternMatcher(Pattern.compile("^\\p{Lu}[\\p{L}0-9_$]*"));
private static final PatternMatcher dot = new PatternMatcher(Pattern.compile("^\\."));
private static final PatternMatcher word = new PatternMatcher(Pattern.compile("^[\\w\\W&&[^\\s\\n\\r]]+"));
private static final PatternMatcher whitespace = new PatternMatcher(Pattern.compile("^[\\s&&[^\n\r]]"));
private static final PatternMatcher newline = new PatternMatcher(Pattern.compile("^[\\n\\r]"));
private static final PatternMatcher dollarReference = new PatternMatcher(Pattern.compile("^(\\$)([\\w$]+(?:\\.[\\w$]+)*)"));
private static final PatternMatcher openingComponentStart = new PatternMatcher(Pattern.compile("^<(?=\\p{Lu}|\\p{L}+(?:\\.\\p{L}+)+)"));
private static final PatternMatcher doubleQuote = new PatternMatcher(Pattern.compile("^\""));
private static final PatternMatcher singleQuote = new PatternMatcher(Pattern.compile("^'"));
/**
* Gsp dollar reference and scriptlets, also used as component values
*/
private static final PatternMatcher dollarReference = new PatternMatcher(
Pattern.compile("^(\\$)([\\w$]+(?:\\.[\\w$]+)*)")
);
private static final DollarScriptletMatcher dollarScriptlet = new DollarScriptletMatcher(); private static final DollarScriptletMatcher dollarScriptlet = new DollarScriptletMatcher();
private static final PatternMatcher blockScriptlet = new PatternMatcher(
Pattern.compile("^(<%)(.*?)(%>)")
);
private static final PatternMatcher expressionScriptlet = new PatternMatcher(
Pattern.compile("^(<%=)(.*?)(%>)")
);
/**
* Component starts
*/
private static final PatternMatcher openingComponentStart = new PatternMatcher(
Pattern.compile("^<(?=\\p{Lu}|[\\p{L}0-9_$]+(?:\\.[\\p{L}0-9_$]+)+)")
);
private static final PatternMatcher closingComponentStart = new PatternMatcher(
Pattern.compile("^(<)(/)(?=\\p{Lu}|[\\p{L}0-9_$]+(?:\\.[\\p{L}0-9_$]+)+)")
);
/**
* Component names
*/
private static final PatternMatcher className = new PatternMatcher(
Pattern.compile("^\\p{Lu}[\\p{L}0-9_$]*")
);
private static final PatternMatcher packageName = new PatternMatcher(
Pattern.compile("^[\\p{L}0-9_$]+(?=\\.)")
);
private static final PatternMatcher dot = new PatternMatcher(
Pattern.compile("^\\.")
);
/**
* Whitespace
*/
private static final PatternMatcher whitespace = new PatternMatcher(Pattern.compile("^[\\s&&[^\n\r]]+"));
/**
* Keys and values
*/
private static final PatternMatcher key = new PatternMatcher(
Pattern.compile("^[\\p{L}0-9_$]+")
);
private static final PatternMatcher equals = new PatternMatcher(Pattern.compile("^="));
/**
* Component ends
*/
private static final PatternMatcher forwardSlash = new PatternMatcher(Pattern.compile("^/"));
private static final PatternMatcher componentEnd = new PatternMatcher(Pattern.compile("^>"));
private static FunctionFsmBuilder<String, TokenizerState, FsmOutput> getFsmBuilder() { private static FunctionFsmBuilder<String, TokenizerState, FsmOutput> getFsmBuilder() {
return new FunctionFsmBuilderImpl<>(); return new FunctionFsmBuilderImpl<>();
} }
public FunctionFsm<String, TokenizerState, FsmOutput> getFsm(Accumulator acc) { public static FunctionFsm<String, TokenizerState, FsmOutput> get(Accumulator acc) {
return getFsmBuilder() return getFsmBuilder()
.setInitialState(TokenizerState.NORMAL)
.whileIn(TokenizerState.NORMAL, sc -> { .whileIn(TokenizerState.NORMAL, sc -> {
sc.on(dollarReference).exec(r -> { sc.on(text).exec(o -> {
acc.accumulate(DOLLAR, r.part(1)); acc.accumulate(TEXT, o.entire());
acc.accumulate(GROOVY_REFERENCE, r.part(2)); });
sc.on(dollarReference).exec(o -> {
acc.accumulate(DOLLAR, o.part(1));
acc.accumulate(GROOVY_REFERENCE, o.part(2));
}); });
sc.on(dollarScriptlet).exec(o -> { sc.on(dollarScriptlet).exec(o -> {
acc.accumulate(DOLLAR, o.part(1)); acc.accumulate(DOLLAR, o.part(1));
@ -48,11 +94,92 @@ class TokenizerFsm {
acc.accumulate(SCRIPTLET, o.part(3)); acc.accumulate(SCRIPTLET, o.part(3));
acc.accumulate(CURLY_CLOSE, o.part(4)); acc.accumulate(CURLY_CLOSE, o.part(4));
}); });
sc.on(openingComponentStart).shiftTo(TokenizerState.COMPONENT).exec(r -> { sc.on(blockScriptlet).exec(o -> {
acc.accumulate(LESS_THAN, r.entire()); acc.accumulate(BLOCK_SCRIPTLET_OPEN, o.part(1));
acc.accumulate(SCRIPTLET, o.part(2));
acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
}); });
sc.on(expressionScriptlet).exec(o -> {
acc.accumulate(EXPRESSION_SCRIPTLET_OPEN, o.part(1));
acc.accumulate(SCRIPTLET, o.part(2));
acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
});
sc.on(openingComponentStart).shiftTo(TokenizerState.COMPONENT_NAME).exec(o ->
acc.accumulate(COMPONENT_START, o.entire())
);
sc.on(closingComponentStart).shiftTo(TokenizerState.COMPONENT_NAME).exec(o -> {
acc.accumulate(COMPONENT_START, o.part(1));
acc.accumulate(FORWARD_SLASH, o.part(2));
});
sc.onNoMatch().exec(input -> { throw new IllegalArgumentException(); });
})
.whileIn(TokenizerState.COMPONENT_NAME, sc -> {
sc.on(packageName).exec(o -> {
acc.accumulate(PACKAGE_NAME, o.entire());
});
sc.on(dot).exec(o -> {
acc.accumulate(DOT, o.entire());
});
sc.on(className).exec(o -> {
acc.accumulate(CLASS_NAME, o.entire());
});
sc.on(forwardSlash).exec(o -> {
acc.accumulate(FORWARD_SLASH, o.entire());
});
sc.on(componentEnd).shiftTo(TokenizerState.NORMAL).exec(o -> {
acc.accumulate(COMPONENT_END, o.entire());
});
sc.on(whitespace).shiftTo(TokenizerState.COMPONENT_KEYS_AND_VALUES).exec(o -> {
acc.accumulate(WHITESPACE, o.entire());
});
sc.onNoMatch().exec(input -> { throw new IllegalArgumentException(); });
})
.whileIn(TokenizerState.COMPONENT_KEYS_AND_VALUES, sc -> {
sc.on(componentEnd).shiftTo(TokenizerState.NORMAL).exec(o -> {
acc.accumulate(COMPONENT_END, o.entire());
});
sc.on(whitespace).exec(o -> {
acc.accumulate(WHITESPACE, o.entire());
});
sc.on(key).exec(o -> {
acc.accumulate(KEY, o.entire());
});
sc.on(equals).exec(o -> {
acc.accumulate(EQUALS, o.entire());
});
// sc.on(gString)
// sc.on(singleQuoteString)
sc.on(dollarReference).exec(o -> {
acc.accumulate(DOLLAR, o.part(1));
acc.accumulate(GROOVY_REFERENCE, o.part(2));
});
sc.on(dollarScriptlet).exec(o -> {
acc.accumulate(DOLLAR, o.part(1));
acc.accumulate(CURLY_OPEN, o.part(2));
acc.accumulate(SCRIPTLET, o.part(3));
acc.accumulate(CURLY_CLOSE, o.part(4));
});
sc.on(blockScriptlet).exec(o -> {
acc.accumulate(BLOCK_SCRIPTLET_OPEN, o.part(1));
acc.accumulate(SCRIPTLET, o.part(2));
acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
});
sc.on(expressionScriptlet).exec(o -> {
acc.accumulate(EXPRESSION_SCRIPTLET_OPEN, o.part(1));
acc.accumulate(SCRIPTLET, o.part(2));
acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
});
sc.on(forwardSlash).exec(o -> {
acc.accumulate(FORWARD_SLASH, o.entire());
});
sc.on(componentEnd).shiftTo(TokenizerState.NORMAL).exec(o -> {
acc.accumulate(COMPONENT_END, o.entire());
});
sc.onNoMatch().exec(input -> { throw new IllegalArgumentException(); });
}) })
.build(); .build();
} }
} }

View File

@ -2,7 +2,6 @@ package com.jessebrault.gcp.tokenizer;
enum TokenizerState { enum TokenizerState {
NORMAL, NORMAL,
COMPONENT, COMPONENT_NAME,
G_STRING, COMPONENT_KEYS_AND_VALUES
SCRIPTLET
} }

View File

@ -1,5 +1,6 @@
package com.jessebrault.gcp.tokenizer package com.jessebrault.gcp.tokenizer
import org.junit.jupiter.api.Disabled
import org.junit.jupiter.api.Test import org.junit.jupiter.api.Test
import org.slf4j.Logger import org.slf4j.Logger
import org.slf4j.LoggerFactory import org.slf4j.LoggerFactory
@ -81,56 +82,55 @@ class TokenizerTests {
} }
@Test @Test
void doctypeHtml() { void doctypeHtmlIsText() {
test('<!DOCTYPE html>') { test('<!DOCTYPE html>') {
expect LESS_THAN, '<', 1, 1 expect TEXT, '<!DOCTYPE html>', 1, 1
expect WORD, '!DOCTYPE', 1, 2
expect WHITESPACE, ' ', 1, 10
expect IDENTIFIER, 'html', 1, 11
expect GREATER_THAN, '>', 1, 15
} }
} }
@Test @Test
void htmlLangEn() { void htmlLangEnIsText() {
test('<html lang="en">') { test('<html lang="en">') {
expect LESS_THAN, '<', 1, 1 expect TEXT, '<html lang="en">', 1, 1
expect IDENTIFIER, 'html', 1, 2
expect WHITESPACE, ' ', 1, 6
expect IDENTIFIER, 'lang', 1, 7
expect EQUALS, '=', 1, 11
expect DOUBLE_QUOTE, '"', 1, 12
expect IDENTIFIER, 'en', 1, 13
expect DOUBLE_QUOTE, '"', 1, 15
expect GREATER_THAN, '>', 1, 16
} }
} }
@Test @Test
void component() { void component() {
test('<Test />') { test('<Test />') {
expect LESS_THAN, '<', 1, 1 expect COMPONENT_START, '<', 1, 1
expect CAPITALIZED_IDENTIFIER, 'Test', 1, 2 expect CLASS_NAME, 'Test', 1, 2
expect WHITESPACE, ' ', 1, 6 expect WHITESPACE, ' ', 1, 6
expect FORWARD_SLASH, '/', 1, 7 expect FORWARD_SLASH, '/', 1, 7
expect GREATER_THAN, '>', 1, 8 expect COMPONENT_END, '>', 1, 8
} }
} }
@Test @Test
@Disabled
void componentWithKeysAndValues() { void componentWithKeysAndValues() {
test('<Test test="test" />') { test('<Test test="test" />') {
expect LESS_THAN, '<', 1, 1 expect COMPONENT_START, '<', 1, 1
expect CAPITALIZED_IDENTIFIER, 'Test', 1, 2 expect CLASS_NAME, 'Test', 1, 2
expect WHITESPACE, ' ', 1, 6 expect WHITESPACE, ' ', 1, 6
expect IDENTIFIER, 'test', 1, 7 expect KEY, 'test', 1, 7
expect EQUALS, '=', 1, 11 expect EQUALS, '=', 1, 11
expect DOUBLE_QUOTE, '"', 1, 12 expect DOUBLE_QUOTE, '"', 1, 12
expect STRING, 'test', 1, 13 expect STRING, 'test', 1, 13
expect DOUBLE_QUOTE, '"', 1, 17 expect DOUBLE_QUOTE, '"', 1, 17
expect WHITESPACE, ' ', 1, 18 expect WHITESPACE, ' ', 1, 18
expect FORWARD_SLASH, '/', 1, 19 expect FORWARD_SLASH, '/', 1, 19
expect GREATER_THAN, '>', 1, 20 expect COMPONENT_END, '>', 1, 20
}
}
@Test
void newlinesCounted() {
test('Hello,\n$person!') {
expect TEXT, 'Hello,\n', 1, 1
expect DOLLAR, '$', 2, 1
expect GROOVY_REFERENCE, 'person', 2, 2
expect TEXT, '!', 2, 8
} }
} }