Tokenizer working, more tests to do.
This commit is contained in:
parent
368e63656e
commit
3fee229003
@ -2,21 +2,29 @@ package com.jessebrault.gcp.tokenizer;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.Queue;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
final class Accumulator {
|
||||
|
||||
Queue<Token> tokens = new LinkedList<>();
|
||||
int line = 1;
|
||||
int col = 1;
|
||||
private static final Pattern newline = Pattern.compile("([\n\r])");
|
||||
|
||||
private final Queue<Token> tokens = new LinkedList<>();
|
||||
private int line = 1;
|
||||
private int col = 1;
|
||||
|
||||
public void accumulate(Token.Type type, String text) {
|
||||
this.tokens.add(new Token(type, text, this.line, this.col));
|
||||
if (type == Token.Type.NEWLINE) {
|
||||
this.line++;
|
||||
final var m = newline.matcher(text);
|
||||
if (m.find()) {
|
||||
this.line += m.groupCount();
|
||||
this.col = 1;
|
||||
} else {
|
||||
this.col += text.length();
|
||||
}
|
||||
}
|
||||
|
||||
public Queue<Token> getTokens() {
|
||||
return this.tokens;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -5,26 +5,33 @@ import java.util.Collection;
|
||||
public final class Token {
|
||||
|
||||
public enum Type {
|
||||
LESS_THAN,
|
||||
GREATER_THAN,
|
||||
PERCENT,
|
||||
EQUALS,
|
||||
DOUBLE_QUOTE,
|
||||
SINGLE_QUOTE,
|
||||
FORWARD_SLASH,
|
||||
IDENTIFIER,
|
||||
CAPITALIZED_IDENTIFIER,
|
||||
DOT,
|
||||
WORD,
|
||||
WHITESPACE,
|
||||
NEWLINE,
|
||||
STRING,
|
||||
SCRIPTLET,
|
||||
TEXT,
|
||||
|
||||
DOLLAR,
|
||||
GROOVY_REFERENCE,
|
||||
CURLY_OPEN,
|
||||
CURLY_CLOSE
|
||||
SCRIPTLET,
|
||||
CURLY_CLOSE,
|
||||
BLOCK_SCRIPTLET_OPEN,
|
||||
EXPRESSION_SCRIPTLET_OPEN,
|
||||
SCRIPTLET_CLOSE,
|
||||
|
||||
CLASS_NAME,
|
||||
PACKAGE_NAME,
|
||||
DOT,
|
||||
|
||||
WHITESPACE,
|
||||
|
||||
KEY,
|
||||
EQUALS,
|
||||
|
||||
DOUBLE_QUOTE,
|
||||
STRING,
|
||||
SINGLE_QUOTE,
|
||||
|
||||
COMPONENT_START,
|
||||
FORWARD_SLASH,
|
||||
COMPONENT_END,
|
||||
;
|
||||
|
||||
boolean isAnyOf(Collection<Type> types) {
|
||||
|
@ -1,82 +1,23 @@
|
||||
package com.jessebrault.gcp.tokenizer;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.Queue;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public final class Tokenizer {
|
||||
|
||||
private static final Pattern lessThan = Pattern.compile("^<");
|
||||
private static final Pattern greaterThan = Pattern.compile("^>");
|
||||
private static final Pattern percent = Pattern.compile("^%");
|
||||
private static final Pattern equals = Pattern.compile("^=");
|
||||
private static final Pattern doubleQuote = Pattern.compile("^\"");
|
||||
private static final Pattern singleQuote = Pattern.compile("^'");
|
||||
private static final Pattern forwardSlash = Pattern.compile("^/");
|
||||
private static final Pattern identifier = Pattern.compile("^[\\p{Ll}0-9_$][\\p{L}0-9_$]*");
|
||||
private static final Pattern capitalizedIdentifier = Pattern.compile("^\\p{Lu}[\\p{L}0-9_$]*");
|
||||
private static final Pattern dot = Pattern.compile("^\\.");
|
||||
private static final Pattern word = Pattern.compile("^[\\w\\W&&[^\\s\\n\\r]]+");
|
||||
private static final Pattern whitespace = Pattern.compile("^[\\s&&[^\n\r]]");
|
||||
private static final Pattern newline = Pattern.compile("^[\\n\\r]");
|
||||
|
||||
public static Queue<Token> tokenize(final String gcpSrc) {
|
||||
Queue<Token> tokens = new LinkedList<>();
|
||||
final var acc = new Accumulator();
|
||||
final var fsm = TokenizerFsm.get(acc);
|
||||
|
||||
var line = 1;
|
||||
var col = 1;
|
||||
|
||||
String remaining = gcpSrc;
|
||||
while (remaining.length() > 0) {
|
||||
Matcher m;
|
||||
|
||||
if ((m = lessThan.matcher(remaining)).find()) {
|
||||
tokens.add(new Token(Token.Type.LESS_THAN, m.group(0), line, col));
|
||||
col += m.group(0).length();
|
||||
} else if ((m = greaterThan.matcher(remaining)).find()) {
|
||||
tokens.add(new Token(Token.Type.GREATER_THAN, m.group(0), line, col));
|
||||
col += m.group(0).length();
|
||||
} else if ((m = percent.matcher(remaining)).find()) {
|
||||
tokens.add(new Token(Token.Type.PERCENT, m.group(0), line, col));
|
||||
col += m.group(0).length();
|
||||
} else if ((m = equals.matcher(remaining)).find()) {
|
||||
tokens.add(new Token(Token.Type.EQUALS, m.group(0), line, col));
|
||||
col += m.group(0).length();
|
||||
} else if ((m = doubleQuote.matcher(remaining)).find()) {
|
||||
tokens.add(new Token(Token.Type.DOUBLE_QUOTE, m.group(0), line, col));
|
||||
col += m.group(0).length();
|
||||
} else if ((m = singleQuote.matcher(remaining)).find()) {
|
||||
tokens.add(new Token(Token.Type.SINGLE_QUOTE, m.group(0), line, col));
|
||||
col += m.group(0).length();
|
||||
} else if ((m = identifier.matcher(remaining)).find()) {
|
||||
tokens.add(new Token(Token.Type.IDENTIFIER, m.group(0), line, col));
|
||||
col += m.group(0).length();
|
||||
} else if ((m = forwardSlash.matcher(remaining)).find()) {
|
||||
tokens.add(new Token(Token.Type.FORWARD_SLASH, m.group(0), line, col));
|
||||
col += m.group(0).length();
|
||||
} else if ((m = capitalizedIdentifier.matcher(remaining)).find()) {
|
||||
tokens.add(new Token(Token.Type.CAPITALIZED_IDENTIFIER, m.group(0), line, col));
|
||||
col += m.group(0).length();
|
||||
} else if ((m = dot.matcher(remaining)).find()) {
|
||||
tokens.add(new Token(Token.Type.DOT, m.group(0), line, col));
|
||||
col += m.group(0).length();
|
||||
} else if ((m = word.matcher(remaining)).find()) {
|
||||
tokens.add(new Token(Token.Type.WORD, m.group(0), line, col));
|
||||
col += m.group(0).length();
|
||||
} else if ((m = whitespace.matcher(remaining)).find()) {
|
||||
tokens.add(new Token(Token.Type.WHITESPACE, m.group(0), line, col));
|
||||
col += m.group(0).length();
|
||||
} else if ((m = newline.matcher(remaining)).find()) {
|
||||
tokens.add(new Token(Token.Type.NEWLINE, m.group(0), line, col));
|
||||
col = 1;
|
||||
line++;
|
||||
final var o = fsm.apply(remaining);
|
||||
if (o == null) {
|
||||
throw new IllegalStateException();
|
||||
}
|
||||
|
||||
remaining = remaining.substring(m.group(0).length());
|
||||
remaining = remaining.substring(o.entire().length());
|
||||
}
|
||||
|
||||
return tokens;
|
||||
return acc.getTokens();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -8,39 +8,85 @@ import static com.jessebrault.gcp.tokenizer.Token.Type.*;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
class TokenizerFsm {
|
||||
final class TokenizerFsm {
|
||||
|
||||
private static final PatternMatcher lessThan = new PatternMatcher(Pattern.compile("^<"));
|
||||
private static final PatternMatcher greaterThan = new PatternMatcher(Pattern.compile("^>"));
|
||||
private static final PatternMatcher percent = new PatternMatcher(Pattern.compile("^%"));
|
||||
private static final PatternMatcher equals = new PatternMatcher(Pattern.compile("^="));
|
||||
private static final PatternMatcher forwardSlash = new PatternMatcher(Pattern.compile("^/"));
|
||||
private static final PatternMatcher identifier = new PatternMatcher(Pattern.compile("^[\\p{Ll}0-9_$][\\p{L}0-9_$]*"));
|
||||
private static final PatternMatcher capitalizedIdentifier = new PatternMatcher(Pattern.compile("^\\p{Lu}[\\p{L}0-9_$]*"));
|
||||
private static final PatternMatcher dot = new PatternMatcher(Pattern.compile("^\\."));
|
||||
private static final PatternMatcher word = new PatternMatcher(Pattern.compile("^[\\w\\W&&[^\\s\\n\\r]]+"));
|
||||
private static final PatternMatcher whitespace = new PatternMatcher(Pattern.compile("^[\\s&&[^\n\r]]"));
|
||||
private static final PatternMatcher newline = new PatternMatcher(Pattern.compile("^[\\n\\r]"));
|
||||
|
||||
private static final PatternMatcher dollarReference = new PatternMatcher(Pattern.compile("^(\\$)([\\w$]+(?:\\.[\\w$]+)*)"));
|
||||
|
||||
private static final PatternMatcher openingComponentStart = new PatternMatcher(Pattern.compile("^<(?=\\p{Lu}|\\p{L}+(?:\\.\\p{L}+)+)"));
|
||||
|
||||
private static final PatternMatcher doubleQuote = new PatternMatcher(Pattern.compile("^\""));
|
||||
private static final PatternMatcher singleQuote = new PatternMatcher(Pattern.compile("^'"));
|
||||
/**
|
||||
* Text
|
||||
*/
|
||||
private static final PatternMatcher text = new PatternMatcher(
|
||||
Pattern.compile("^(?:[\\w\\W&&[^<$]]|<(?!%|/?\\p{Lu}|/?[\\p{L}0-9_$]+(?:\\.[\\p{L}0-9_$]+)+)|\\$(?![\\w$]+(?:\\.[\\w$]+)*))+")
|
||||
);
|
||||
|
||||
/**
|
||||
* Gsp dollar reference and scriptlets, also used as component values
|
||||
*/
|
||||
private static final PatternMatcher dollarReference = new PatternMatcher(
|
||||
Pattern.compile("^(\\$)([\\w$]+(?:\\.[\\w$]+)*)")
|
||||
);
|
||||
private static final DollarScriptletMatcher dollarScriptlet = new DollarScriptletMatcher();
|
||||
private static final PatternMatcher blockScriptlet = new PatternMatcher(
|
||||
Pattern.compile("^(<%)(.*?)(%>)")
|
||||
);
|
||||
private static final PatternMatcher expressionScriptlet = new PatternMatcher(
|
||||
Pattern.compile("^(<%=)(.*?)(%>)")
|
||||
);
|
||||
|
||||
/**
|
||||
* Component starts
|
||||
*/
|
||||
private static final PatternMatcher openingComponentStart = new PatternMatcher(
|
||||
Pattern.compile("^<(?=\\p{Lu}|[\\p{L}0-9_$]+(?:\\.[\\p{L}0-9_$]+)+)")
|
||||
);
|
||||
private static final PatternMatcher closingComponentStart = new PatternMatcher(
|
||||
Pattern.compile("^(<)(/)(?=\\p{Lu}|[\\p{L}0-9_$]+(?:\\.[\\p{L}0-9_$]+)+)")
|
||||
);
|
||||
|
||||
/**
|
||||
* Component names
|
||||
*/
|
||||
private static final PatternMatcher className = new PatternMatcher(
|
||||
Pattern.compile("^\\p{Lu}[\\p{L}0-9_$]*")
|
||||
);
|
||||
private static final PatternMatcher packageName = new PatternMatcher(
|
||||
Pattern.compile("^[\\p{L}0-9_$]+(?=\\.)")
|
||||
);
|
||||
private static final PatternMatcher dot = new PatternMatcher(
|
||||
Pattern.compile("^\\.")
|
||||
);
|
||||
|
||||
/**
|
||||
* Whitespace
|
||||
*/
|
||||
private static final PatternMatcher whitespace = new PatternMatcher(Pattern.compile("^[\\s&&[^\n\r]]+"));
|
||||
|
||||
/**
|
||||
* Keys and values
|
||||
*/
|
||||
private static final PatternMatcher key = new PatternMatcher(
|
||||
Pattern.compile("^[\\p{L}0-9_$]+")
|
||||
);
|
||||
private static final PatternMatcher equals = new PatternMatcher(Pattern.compile("^="));
|
||||
|
||||
/**
|
||||
* Component ends
|
||||
*/
|
||||
private static final PatternMatcher forwardSlash = new PatternMatcher(Pattern.compile("^/"));
|
||||
private static final PatternMatcher componentEnd = new PatternMatcher(Pattern.compile("^>"));
|
||||
|
||||
private static FunctionFsmBuilder<String, TokenizerState, FsmOutput> getFsmBuilder() {
|
||||
return new FunctionFsmBuilderImpl<>();
|
||||
}
|
||||
|
||||
public FunctionFsm<String, TokenizerState, FsmOutput> getFsm(Accumulator acc) {
|
||||
public static FunctionFsm<String, TokenizerState, FsmOutput> get(Accumulator acc) {
|
||||
return getFsmBuilder()
|
||||
.setInitialState(TokenizerState.NORMAL)
|
||||
.whileIn(TokenizerState.NORMAL, sc -> {
|
||||
sc.on(dollarReference).exec(r -> {
|
||||
acc.accumulate(DOLLAR, r.part(1));
|
||||
acc.accumulate(GROOVY_REFERENCE, r.part(2));
|
||||
sc.on(text).exec(o -> {
|
||||
acc.accumulate(TEXT, o.entire());
|
||||
});
|
||||
sc.on(dollarReference).exec(o -> {
|
||||
acc.accumulate(DOLLAR, o.part(1));
|
||||
acc.accumulate(GROOVY_REFERENCE, o.part(2));
|
||||
});
|
||||
sc.on(dollarScriptlet).exec(o -> {
|
||||
acc.accumulate(DOLLAR, o.part(1));
|
||||
@ -48,11 +94,92 @@ class TokenizerFsm {
|
||||
acc.accumulate(SCRIPTLET, o.part(3));
|
||||
acc.accumulate(CURLY_CLOSE, o.part(4));
|
||||
});
|
||||
sc.on(openingComponentStart).shiftTo(TokenizerState.COMPONENT).exec(r -> {
|
||||
acc.accumulate(LESS_THAN, r.entire());
|
||||
sc.on(blockScriptlet).exec(o -> {
|
||||
acc.accumulate(BLOCK_SCRIPTLET_OPEN, o.part(1));
|
||||
acc.accumulate(SCRIPTLET, o.part(2));
|
||||
acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
|
||||
});
|
||||
sc.on(expressionScriptlet).exec(o -> {
|
||||
acc.accumulate(EXPRESSION_SCRIPTLET_OPEN, o.part(1));
|
||||
acc.accumulate(SCRIPTLET, o.part(2));
|
||||
acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
|
||||
});
|
||||
sc.on(openingComponentStart).shiftTo(TokenizerState.COMPONENT_NAME).exec(o ->
|
||||
acc.accumulate(COMPONENT_START, o.entire())
|
||||
);
|
||||
sc.on(closingComponentStart).shiftTo(TokenizerState.COMPONENT_NAME).exec(o -> {
|
||||
acc.accumulate(COMPONENT_START, o.part(1));
|
||||
acc.accumulate(FORWARD_SLASH, o.part(2));
|
||||
});
|
||||
sc.onNoMatch().exec(input -> { throw new IllegalArgumentException(); });
|
||||
})
|
||||
.whileIn(TokenizerState.COMPONENT_NAME, sc -> {
|
||||
sc.on(packageName).exec(o -> {
|
||||
acc.accumulate(PACKAGE_NAME, o.entire());
|
||||
});
|
||||
sc.on(dot).exec(o -> {
|
||||
acc.accumulate(DOT, o.entire());
|
||||
});
|
||||
sc.on(className).exec(o -> {
|
||||
acc.accumulate(CLASS_NAME, o.entire());
|
||||
});
|
||||
sc.on(forwardSlash).exec(o -> {
|
||||
acc.accumulate(FORWARD_SLASH, o.entire());
|
||||
});
|
||||
sc.on(componentEnd).shiftTo(TokenizerState.NORMAL).exec(o -> {
|
||||
acc.accumulate(COMPONENT_END, o.entire());
|
||||
});
|
||||
sc.on(whitespace).shiftTo(TokenizerState.COMPONENT_KEYS_AND_VALUES).exec(o -> {
|
||||
acc.accumulate(WHITESPACE, o.entire());
|
||||
});
|
||||
sc.onNoMatch().exec(input -> { throw new IllegalArgumentException(); });
|
||||
})
|
||||
.whileIn(TokenizerState.COMPONENT_KEYS_AND_VALUES, sc -> {
|
||||
sc.on(componentEnd).shiftTo(TokenizerState.NORMAL).exec(o -> {
|
||||
acc.accumulate(COMPONENT_END, o.entire());
|
||||
});
|
||||
sc.on(whitespace).exec(o -> {
|
||||
acc.accumulate(WHITESPACE, o.entire());
|
||||
});
|
||||
sc.on(key).exec(o -> {
|
||||
acc.accumulate(KEY, o.entire());
|
||||
});
|
||||
sc.on(equals).exec(o -> {
|
||||
acc.accumulate(EQUALS, o.entire());
|
||||
});
|
||||
// sc.on(gString)
|
||||
// sc.on(singleQuoteString)
|
||||
sc.on(dollarReference).exec(o -> {
|
||||
acc.accumulate(DOLLAR, o.part(1));
|
||||
acc.accumulate(GROOVY_REFERENCE, o.part(2));
|
||||
});
|
||||
sc.on(dollarScriptlet).exec(o -> {
|
||||
acc.accumulate(DOLLAR, o.part(1));
|
||||
acc.accumulate(CURLY_OPEN, o.part(2));
|
||||
acc.accumulate(SCRIPTLET, o.part(3));
|
||||
acc.accumulate(CURLY_CLOSE, o.part(4));
|
||||
});
|
||||
sc.on(blockScriptlet).exec(o -> {
|
||||
acc.accumulate(BLOCK_SCRIPTLET_OPEN, o.part(1));
|
||||
acc.accumulate(SCRIPTLET, o.part(2));
|
||||
acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
|
||||
});
|
||||
sc.on(expressionScriptlet).exec(o -> {
|
||||
acc.accumulate(EXPRESSION_SCRIPTLET_OPEN, o.part(1));
|
||||
acc.accumulate(SCRIPTLET, o.part(2));
|
||||
acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
|
||||
});
|
||||
sc.on(forwardSlash).exec(o -> {
|
||||
acc.accumulate(FORWARD_SLASH, o.entire());
|
||||
});
|
||||
sc.on(componentEnd).shiftTo(TokenizerState.NORMAL).exec(o -> {
|
||||
acc.accumulate(COMPONENT_END, o.entire());
|
||||
});
|
||||
sc.onNoMatch().exec(input -> { throw new IllegalArgumentException(); });
|
||||
})
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
@ -2,7 +2,6 @@ package com.jessebrault.gcp.tokenizer;
|
||||
|
||||
enum TokenizerState {
|
||||
NORMAL,
|
||||
COMPONENT,
|
||||
G_STRING,
|
||||
SCRIPTLET
|
||||
COMPONENT_NAME,
|
||||
COMPONENT_KEYS_AND_VALUES
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
package com.jessebrault.gcp.tokenizer
|
||||
|
||||
import org.junit.jupiter.api.Disabled
|
||||
import org.junit.jupiter.api.Test
|
||||
import org.slf4j.Logger
|
||||
import org.slf4j.LoggerFactory
|
||||
@ -81,56 +82,55 @@ class TokenizerTests {
|
||||
}
|
||||
|
||||
@Test
|
||||
void doctypeHtml() {
|
||||
void doctypeHtmlIsText() {
|
||||
test('<!DOCTYPE html>') {
|
||||
expect LESS_THAN, '<', 1, 1
|
||||
expect WORD, '!DOCTYPE', 1, 2
|
||||
expect WHITESPACE, ' ', 1, 10
|
||||
expect IDENTIFIER, 'html', 1, 11
|
||||
expect GREATER_THAN, '>', 1, 15
|
||||
expect TEXT, '<!DOCTYPE html>', 1, 1
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void htmlLangEn() {
|
||||
void htmlLangEnIsText() {
|
||||
test('<html lang="en">') {
|
||||
expect LESS_THAN, '<', 1, 1
|
||||
expect IDENTIFIER, 'html', 1, 2
|
||||
expect WHITESPACE, ' ', 1, 6
|
||||
expect IDENTIFIER, 'lang', 1, 7
|
||||
expect EQUALS, '=', 1, 11
|
||||
expect DOUBLE_QUOTE, '"', 1, 12
|
||||
expect IDENTIFIER, 'en', 1, 13
|
||||
expect DOUBLE_QUOTE, '"', 1, 15
|
||||
expect GREATER_THAN, '>', 1, 16
|
||||
expect TEXT, '<html lang="en">', 1, 1
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void component() {
|
||||
test('<Test />') {
|
||||
expect LESS_THAN, '<', 1, 1
|
||||
expect CAPITALIZED_IDENTIFIER, 'Test', 1, 2
|
||||
expect COMPONENT_START, '<', 1, 1
|
||||
expect CLASS_NAME, 'Test', 1, 2
|
||||
expect WHITESPACE, ' ', 1, 6
|
||||
expect FORWARD_SLASH, '/', 1, 7
|
||||
expect GREATER_THAN, '>', 1, 8
|
||||
expect COMPONENT_END, '>', 1, 8
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
void componentWithKeysAndValues() {
|
||||
test('<Test test="test" />') {
|
||||
expect LESS_THAN, '<', 1, 1
|
||||
expect CAPITALIZED_IDENTIFIER, 'Test', 1, 2
|
||||
expect COMPONENT_START, '<', 1, 1
|
||||
expect CLASS_NAME, 'Test', 1, 2
|
||||
expect WHITESPACE, ' ', 1, 6
|
||||
expect IDENTIFIER, 'test', 1, 7
|
||||
expect KEY, 'test', 1, 7
|
||||
expect EQUALS, '=', 1, 11
|
||||
expect DOUBLE_QUOTE, '"', 1, 12
|
||||
expect STRING, 'test', 1, 13
|
||||
expect DOUBLE_QUOTE, '"', 1, 17
|
||||
expect WHITESPACE, ' ', 1, 18
|
||||
expect FORWARD_SLASH, '/', 1, 19
|
||||
expect GREATER_THAN, '>', 1, 20
|
||||
expect COMPONENT_END, '>', 1, 20
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void newlinesCounted() {
|
||||
test('Hello,\n$person!') {
|
||||
expect TEXT, 'Hello,\n', 1, 1
|
||||
expect DOLLAR, '$', 2, 1
|
||||
expect GROOVY_REFERENCE, 'person', 2, 2
|
||||
expect TEXT, '!', 2, 8
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user