Tokenizer working, more tests to do.

2023-01-25 13:54:23 +01:00 · 2023-01-25 13:54:23 +01:00 · 3fee229003
commit 3fee229003
parent 368e63656e
6 changed files with 220 additions and 138 deletions
--- a/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/Accumulator.java
+++ b/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/Accumulator.java
@ -2,21 +2,29 @@ package com.jessebrault.gcp.tokenizer;
 import java.util.LinkedList;
 import java.util.Queue;
 import java.util.regex.Pattern;
 final class Accumulator {
-    Queue<Token> tokens = new LinkedList<>();
+    private static final Pattern newline = Pattern.compile("([\n\r])");
-    int line = 1;
+
-    int col = 1;
+    private final Queue<Token> tokens = new LinkedList<>();
    private int line = 1;
    private int col = 1;
    public void accumulate(Token.Type type, String text) {
        this.tokens.add(new Token(type, text, this.line, this.col));
-        if (type == Token.Type.NEWLINE) {
+        final var m = newline.matcher(text);
-            this.line++;
+        if (m.find()) {
            this.line += m.groupCount();
            this.col = 1;
        } else {
            this.col += text.length();
        }
    }
    public Queue<Token> getTokens() {
        return this.tokens;
    }
 }
--- a/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/Token.java
+++ b/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/Token.java
@ -5,26 +5,33 @@ import java.util.Collection;
 public final class Token {
    public enum Type {
-        LESS_THAN,
+        TEXT,
        GREATER_THAN,
        PERCENT,
        EQUALS,
        DOUBLE_QUOTE,
        SINGLE_QUOTE,
        FORWARD_SLASH,
        IDENTIFIER,
        CAPITALIZED_IDENTIFIER,
        DOT,
        WORD,
        WHITESPACE,
        NEWLINE,
        STRING,
        SCRIPTLET,
        DOLLAR,
        GROOVY_REFERENCE,
        CURLY_OPEN,
-        CURLY_CLOSE
+        SCRIPTLET,
        CURLY_CLOSE,
        BLOCK_SCRIPTLET_OPEN,
        EXPRESSION_SCRIPTLET_OPEN,
        SCRIPTLET_CLOSE,
        CLASS_NAME,
        PACKAGE_NAME,
        DOT,
        WHITESPACE,
        KEY,
        EQUALS,
        DOUBLE_QUOTE,
        STRING,
        SINGLE_QUOTE,
        COMPONENT_START,
        FORWARD_SLASH,
        COMPONENT_END,
        ;
        boolean isAnyOf(Collection<Type> types) {
--- a/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/Tokenizer.java
+++ b/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/Tokenizer.java
@ -1,82 +1,23 @@
 package com.jessebrault.gcp.tokenizer;
 import java.util.LinkedList;
 import java.util.Queue;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 public final class Tokenizer {
    private static final Pattern lessThan = Pattern.compile("^<");
    private static final Pattern greaterThan = Pattern.compile("^>");
    private static final Pattern percent = Pattern.compile("^%");
    private static final Pattern equals = Pattern.compile("^=");
    private static final Pattern doubleQuote = Pattern.compile("^\"");
    private static final Pattern singleQuote = Pattern.compile("^'");
    private static final Pattern forwardSlash = Pattern.compile("^/");
    private static final Pattern identifier = Pattern.compile("^[\\p{Ll}0-9_$][\\p{L}0-9_$]*");
    private static final Pattern capitalizedIdentifier = Pattern.compile("^\\p{Lu}[\\p{L}0-9_$]*");
    private static final Pattern dot = Pattern.compile("^\\.");
    private static final Pattern word = Pattern.compile("^[\\w\\W&&[^\\s\\n\\r]]+");
    private static final Pattern whitespace = Pattern.compile("^[\\s&&[^\n\r]]");
    private static final Pattern newline = Pattern.compile("^[\\n\\r]");
    public static Queue<Token> tokenize(final String gcpSrc) {
-        Queue<Token> tokens = new LinkedList<>();
+        final var acc = new Accumulator();
-
+        final var fsm = TokenizerFsm.get(acc);
        var line = 1;
        var col = 1;
        String remaining = gcpSrc;
        while (remaining.length() > 0) {
-            Matcher m;
+            final var o = fsm.apply(remaining);
-            
+            if (o == null) {
-            if ((m = lessThan.matcher(remaining)).find()) {
+                throw new IllegalStateException();
-                tokens.add(new Token(Token.Type.LESS_THAN, m.group(0), line, col));
+            }
-                col += m.group(0).length();
+            remaining = remaining.substring(o.entire().length());
            } else if ((m = greaterThan.matcher(remaining)).find()) {
                tokens.add(new Token(Token.Type.GREATER_THAN, m.group(0), line, col));
                col += m.group(0).length();
            } else if ((m = percent.matcher(remaining)).find()) {
                tokens.add(new Token(Token.Type.PERCENT, m.group(0), line, col));
                col += m.group(0).length();
            } else if ((m = equals.matcher(remaining)).find()) {
                tokens.add(new Token(Token.Type.EQUALS, m.group(0), line, col));
                col += m.group(0).length();
            } else if ((m = doubleQuote.matcher(remaining)).find()) {
                tokens.add(new Token(Token.Type.DOUBLE_QUOTE, m.group(0), line, col));
                col += m.group(0).length();
            } else if ((m = singleQuote.matcher(remaining)).find()) {
                tokens.add(new Token(Token.Type.SINGLE_QUOTE, m.group(0), line, col));
                col += m.group(0).length();
            } else if ((m = identifier.matcher(remaining)).find()) {
                tokens.add(new Token(Token.Type.IDENTIFIER, m.group(0), line, col));
                col += m.group(0).length();
            } else if ((m = forwardSlash.matcher(remaining)).find()) {
                tokens.add(new Token(Token.Type.FORWARD_SLASH, m.group(0), line, col));
                col += m.group(0).length();
            } else if ((m = capitalizedIdentifier.matcher(remaining)).find()) {
                tokens.add(new Token(Token.Type.CAPITALIZED_IDENTIFIER, m.group(0), line, col));
                col += m.group(0).length();
            } else if ((m = dot.matcher(remaining)).find()) {
                tokens.add(new Token(Token.Type.DOT, m.group(0), line, col));
                col += m.group(0).length();
            } else if ((m = word.matcher(remaining)).find()) {
                tokens.add(new Token(Token.Type.WORD, m.group(0), line, col));
                col += m.group(0).length();
            } else if ((m = whitespace.matcher(remaining)).find()) {
                tokens.add(new Token(Token.Type.WHITESPACE, m.group(0), line, col));
                col += m.group(0).length();
            } else if ((m = newline.matcher(remaining)).find()) {
                tokens.add(new Token(Token.Type.NEWLINE, m.group(0), line, col));
                col = 1;
                line++;
        }
-            remaining = remaining.substring(m.group(0).length());
+        return acc.getTokens();
        }
        return tokens;
    }
 }
--- a/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/TokenizerFsm.java
+++ b/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/TokenizerFsm.java
@ -8,39 +8,85 @@ import static com.jessebrault.gcp.tokenizer.Token.Type.*;
 import java.util.regex.Pattern;
-class TokenizerFsm {
+final class TokenizerFsm {
-    private static final PatternMatcher lessThan = new PatternMatcher(Pattern.compile("^<"));
+    /**
-    private static final PatternMatcher greaterThan = new PatternMatcher(Pattern.compile("^>"));
+     * Text
-    private static final PatternMatcher percent = new PatternMatcher(Pattern.compile("^%"));
+     */
-    private static final PatternMatcher equals = new PatternMatcher(Pattern.compile("^="));
+    private static final PatternMatcher text = new PatternMatcher(
-    private static final PatternMatcher forwardSlash = new PatternMatcher(Pattern.compile("^/"));
+            Pattern.compile("^(?:[\\w\\W&&[^<$]]|<(?!%|/?\\p{Lu}|/?[\\p{L}0-9_$]+(?:\\.[\\p{L}0-9_$]+)+)|\\$(?![\\w$]+(?:\\.[\\w$]+)*))+")
-    private static final PatternMatcher identifier = new PatternMatcher(Pattern.compile("^[\\p{Ll}0-9_$][\\p{L}0-9_$]*"));
+    );
    private static final PatternMatcher capitalizedIdentifier = new PatternMatcher(Pattern.compile("^\\p{Lu}[\\p{L}0-9_$]*"));
    private static final PatternMatcher dot = new PatternMatcher(Pattern.compile("^\\."));
    private static final PatternMatcher word = new PatternMatcher(Pattern.compile("^[\\w\\W&&[^\\s\\n\\r]]+"));
    private static final PatternMatcher whitespace = new PatternMatcher(Pattern.compile("^[\\s&&[^\n\r]]"));
    private static final PatternMatcher newline = new PatternMatcher(Pattern.compile("^[\\n\\r]"));
    private static final PatternMatcher dollarReference = new PatternMatcher(Pattern.compile("^(\\$)([\\w$]+(?:\\.[\\w$]+)*)"));
    private static final PatternMatcher openingComponentStart = new PatternMatcher(Pattern.compile("^<(?=\\p{Lu}|\\p{L}+(?:\\.\\p{L}+)+)"));
    private static final PatternMatcher doubleQuote = new PatternMatcher(Pattern.compile("^\""));
    private static final PatternMatcher singleQuote = new PatternMatcher(Pattern.compile("^'"));
    /**
     * Gsp dollar reference and scriptlets, also used as component values
     */
    private static final PatternMatcher dollarReference = new PatternMatcher(
            Pattern.compile("^(\\$)([\\w$]+(?:\\.[\\w$]+)*)")
    );
    private static final DollarScriptletMatcher dollarScriptlet = new DollarScriptletMatcher();
    private static final PatternMatcher blockScriptlet = new PatternMatcher(
            Pattern.compile("^(<%)(.*?)(%>)")
    );
    private static final PatternMatcher expressionScriptlet = new PatternMatcher(
            Pattern.compile("^(<%=)(.*?)(%>)")
    );
    /**
     * Component starts
     */
    private static final PatternMatcher openingComponentStart = new PatternMatcher(
            Pattern.compile("^<(?=\\p{Lu}|[\\p{L}0-9_$]+(?:\\.[\\p{L}0-9_$]+)+)")
    );
    private static final PatternMatcher closingComponentStart = new PatternMatcher(
            Pattern.compile("^(<)(/)(?=\\p{Lu}|[\\p{L}0-9_$]+(?:\\.[\\p{L}0-9_$]+)+)")
    );
    /**
     * Component names
     */
    private static final PatternMatcher className = new PatternMatcher(
            Pattern.compile("^\\p{Lu}[\\p{L}0-9_$]*")
    );
    private static final PatternMatcher packageName = new PatternMatcher(
            Pattern.compile("^[\\p{L}0-9_$]+(?=\\.)")
    );
    private static final PatternMatcher dot = new PatternMatcher(
            Pattern.compile("^\\.")
    );
    /**
     * Whitespace
     */
    private static final PatternMatcher whitespace = new PatternMatcher(Pattern.compile("^[\\s&&[^\n\r]]+"));
    /**
     * Keys and values
     */
    private static final PatternMatcher key = new PatternMatcher(
            Pattern.compile("^[\\p{L}0-9_$]+")
    );
    private static final PatternMatcher equals = new PatternMatcher(Pattern.compile("^="));
    /**
     * Component ends
     */
    private static final PatternMatcher forwardSlash = new PatternMatcher(Pattern.compile("^/"));
    private static final PatternMatcher componentEnd = new PatternMatcher(Pattern.compile("^>"));
    private static FunctionFsmBuilder<String, TokenizerState, FsmOutput> getFsmBuilder() {
        return new FunctionFsmBuilderImpl<>();
    }
-    public FunctionFsm<String, TokenizerState, FsmOutput> getFsm(Accumulator acc) {
+    public static FunctionFsm<String, TokenizerState, FsmOutput> get(Accumulator acc) {
        return getFsmBuilder()
                .setInitialState(TokenizerState.NORMAL)
                .whileIn(TokenizerState.NORMAL, sc -> {
-                    sc.on(dollarReference).exec(r -> {
+                    sc.on(text).exec(o -> {
-                        acc.accumulate(DOLLAR, r.part(1));
+                        acc.accumulate(TEXT, o.entire());
-                        acc.accumulate(GROOVY_REFERENCE, r.part(2));
+                    });
                    sc.on(dollarReference).exec(o -> {
                        acc.accumulate(DOLLAR, o.part(1));
                        acc.accumulate(GROOVY_REFERENCE, o.part(2));
                    });
                    sc.on(dollarScriptlet).exec(o -> {
                        acc.accumulate(DOLLAR, o.part(1));
@ -48,11 +94,92 @@ class TokenizerFsm {
                        acc.accumulate(SCRIPTLET, o.part(3));
                        acc.accumulate(CURLY_CLOSE, o.part(4));
                    });
-                    sc.on(openingComponentStart).shiftTo(TokenizerState.COMPONENT).exec(r -> {
+                    sc.on(blockScriptlet).exec(o -> {
-                        acc.accumulate(LESS_THAN, r.entire());
+                        acc.accumulate(BLOCK_SCRIPTLET_OPEN, o.part(1));
                        acc.accumulate(SCRIPTLET, o.part(2));
                        acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
                    });
                    sc.on(expressionScriptlet).exec(o -> {
                        acc.accumulate(EXPRESSION_SCRIPTLET_OPEN, o.part(1));
                        acc.accumulate(SCRIPTLET, o.part(2));
                        acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
                    });
                    sc.on(openingComponentStart).shiftTo(TokenizerState.COMPONENT_NAME).exec(o ->
                        acc.accumulate(COMPONENT_START, o.entire())
                    );
                    sc.on(closingComponentStart).shiftTo(TokenizerState.COMPONENT_NAME).exec(o -> {
                        acc.accumulate(COMPONENT_START, o.part(1));
                        acc.accumulate(FORWARD_SLASH, o.part(2));
                    });
                    sc.onNoMatch().exec(input -> { throw new IllegalArgumentException(); });
                })
                .whileIn(TokenizerState.COMPONENT_NAME, sc -> {
                    sc.on(packageName).exec(o -> {
                       acc.accumulate(PACKAGE_NAME, o.entire());
                    });
                    sc.on(dot).exec(o -> {
                        acc.accumulate(DOT, o.entire());
                    });
                    sc.on(className).exec(o -> {
                        acc.accumulate(CLASS_NAME, o.entire());
                    });
                    sc.on(forwardSlash).exec(o -> {
                        acc.accumulate(FORWARD_SLASH, o.entire());
                    });
                    sc.on(componentEnd).shiftTo(TokenizerState.NORMAL).exec(o -> {
                       acc.accumulate(COMPONENT_END, o.entire());
                    });
                    sc.on(whitespace).shiftTo(TokenizerState.COMPONENT_KEYS_AND_VALUES).exec(o -> {
                        acc.accumulate(WHITESPACE, o.entire());
                    });
                    sc.onNoMatch().exec(input -> { throw new IllegalArgumentException(); });
                })
                .whileIn(TokenizerState.COMPONENT_KEYS_AND_VALUES, sc -> {
                    sc.on(componentEnd).shiftTo(TokenizerState.NORMAL).exec(o -> {
                        acc.accumulate(COMPONENT_END, o.entire());
                    });
                    sc.on(whitespace).exec(o -> {
                        acc.accumulate(WHITESPACE, o.entire());
                    });
                    sc.on(key).exec(o -> {
                        acc.accumulate(KEY, o.entire());
                    });
                    sc.on(equals).exec(o -> {
                       acc.accumulate(EQUALS, o.entire());
                    });
                    // sc.on(gString)
                    // sc.on(singleQuoteString)
                    sc.on(dollarReference).exec(o -> {
                        acc.accumulate(DOLLAR, o.part(1));
                        acc.accumulate(GROOVY_REFERENCE, o.part(2));
                    });
                    sc.on(dollarScriptlet).exec(o -> {
                        acc.accumulate(DOLLAR, o.part(1));
                        acc.accumulate(CURLY_OPEN, o.part(2));
                        acc.accumulate(SCRIPTLET, o.part(3));
                        acc.accumulate(CURLY_CLOSE, o.part(4));
                    });
                    sc.on(blockScriptlet).exec(o -> {
                        acc.accumulate(BLOCK_SCRIPTLET_OPEN, o.part(1));
                        acc.accumulate(SCRIPTLET, o.part(2));
                        acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
                    });
                    sc.on(expressionScriptlet).exec(o -> {
                        acc.accumulate(EXPRESSION_SCRIPTLET_OPEN, o.part(1));
                        acc.accumulate(SCRIPTLET, o.part(2));
                        acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
                    });
                    sc.on(forwardSlash).exec(o -> {
                        acc.accumulate(FORWARD_SLASH, o.entire());
                    });
                    sc.on(componentEnd).shiftTo(TokenizerState.NORMAL).exec(o -> {
                        acc.accumulate(COMPONENT_END, o.entire());
                    });
                    sc.onNoMatch().exec(input -> { throw new IllegalArgumentException(); });
                })
                .build();
    }
 }
--- a/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/TokenizerState.java
+++ b/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/TokenizerState.java
@ -2,7 +2,6 @@ package com.jessebrault.gcp.tokenizer;
 enum TokenizerState {
    NORMAL,
-    COMPONENT,
+    COMPONENT_NAME,
-    G_STRING,
+    COMPONENT_KEYS_AND_VALUES
    SCRIPTLET
 }
--- a/gcp-impl/src/test/groovy/com/jessebrault/gcp/tokenizer/TokenizerTests.groovy
+++ b/gcp-impl/src/test/groovy/com/jessebrault/gcp/tokenizer/TokenizerTests.groovy
@ -1,5 +1,6 @@
 package com.jessebrault.gcp.tokenizer
 import org.junit.jupiter.api.Disabled
 import org.junit.jupiter.api.Test
 import org.slf4j.Logger
 import org.slf4j.LoggerFactory
@ -81,56 +82,55 @@ class TokenizerTests {
    }
    @Test
-    void doctypeHtml() {
+    void doctypeHtmlIsText() {
        test('<!DOCTYPE html>') {
-            expect LESS_THAN, '<', 1, 1
+            expect TEXT, '<!DOCTYPE html>', 1, 1
            expect WORD, '!DOCTYPE', 1, 2
            expect WHITESPACE, ' ', 1, 10
            expect IDENTIFIER, 'html', 1, 11
            expect GREATER_THAN, '>', 1, 15
        }
    }
    @Test
-    void htmlLangEn() {
+    void htmlLangEnIsText() {
        test('<html lang="en">') {
-            expect LESS_THAN, '<', 1, 1
+            expect TEXT, '<html lang="en">', 1, 1
            expect IDENTIFIER, 'html', 1, 2
            expect WHITESPACE, ' ', 1, 6
            expect IDENTIFIER, 'lang', 1, 7
            expect EQUALS, '=', 1, 11
            expect DOUBLE_QUOTE, '"', 1, 12
            expect IDENTIFIER, 'en', 1, 13
            expect DOUBLE_QUOTE, '"', 1, 15
            expect GREATER_THAN, '>', 1, 16
        }
    }
    @Test
    void component() {
        test('<Test />') {
-            expect LESS_THAN, '<', 1, 1
+            expect COMPONENT_START, '<', 1, 1
-            expect CAPITALIZED_IDENTIFIER, 'Test', 1, 2
+            expect CLASS_NAME, 'Test', 1, 2
            expect WHITESPACE, ' ', 1, 6
            expect FORWARD_SLASH, '/', 1, 7
-            expect GREATER_THAN, '>', 1, 8
+            expect COMPONENT_END, '>', 1, 8
        }
    }
    @Test
    @Disabled
    void componentWithKeysAndValues() {
        test('<Test test="test" />') {
-            expect LESS_THAN, '<', 1, 1
+            expect COMPONENT_START, '<', 1, 1
-            expect CAPITALIZED_IDENTIFIER, 'Test', 1, 2
+            expect CLASS_NAME, 'Test', 1, 2
            expect WHITESPACE, ' ', 1, 6
-            expect IDENTIFIER, 'test', 1, 7
+            expect KEY, 'test', 1, 7
            expect EQUALS, '=', 1, 11
            expect DOUBLE_QUOTE, '"', 1, 12
            expect STRING, 'test', 1, 13
            expect DOUBLE_QUOTE, '"', 1, 17
            expect WHITESPACE, ' ', 1, 18
            expect FORWARD_SLASH, '/', 1, 19
-            expect GREATER_THAN, '>', 1, 20
+            expect COMPONENT_END, '>', 1, 20
        }
    }
    @Test
    void newlinesCounted() {
        test('Hello,\n$person!') {
            expect TEXT, 'Hello,\n', 1, 1
            expect DOLLAR, '$', 2, 1
            expect GROOVY_REFERENCE, 'person', 2, 2
            expect TEXT, '!', 2, 8
        }
    }