Tokenizer working, more tests to do.

2023-01-25 13:54:23 +01:00 · 2023-01-25 13:54:23 +01:00 · 3fee229003
commit 3fee229003
parent 368e63656e
6 changed files with 220 additions and 138 deletions
--- a/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/Accumulator.java
+++ b/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/Accumulator.java
@ -2,21 +2,29 @@ package com.jessebrault.gcp.tokenizer;

 import java.util.LinkedList;
 import java.util.Queue;
+import java.util.regex.Pattern;

 final class Accumulator {

-    Queue<Token> tokens = new LinkedList<>();
-    int line = 1;
-    int col = 1;
+    private static final Pattern newline = Pattern.compile("([\n\r])");
+
+    private final Queue<Token> tokens = new LinkedList<>();
+    private int line = 1;
+    private int col = 1;

    public void accumulate(Token.Type type, String text) {
        this.tokens.add(new Token(type, text, this.line, this.col));
-        if (type == Token.Type.NEWLINE) {
-            this.line++;
+        final var m = newline.matcher(text);
+        if (m.find()) {
+            this.line += m.groupCount();
            this.col = 1;
        } else {
            this.col += text.length();
        }
    }

+    public Queue<Token> getTokens() {
+        return this.tokens;
+    }
+
 }
--- a/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/Token.java
+++ b/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/Token.java
@ -5,26 +5,33 @@ import java.util.Collection;
 public final class Token {

    public enum Type {
-        LESS_THAN,
-        GREATER_THAN,
-        PERCENT,
-        EQUALS,
-        DOUBLE_QUOTE,
-        SINGLE_QUOTE,
-        FORWARD_SLASH,
-        IDENTIFIER,
-        CAPITALIZED_IDENTIFIER,
-        DOT,
-        WORD,
-        WHITESPACE,
-        NEWLINE,
-        STRING,
-        SCRIPTLET,
+        TEXT,

        DOLLAR,
        GROOVY_REFERENCE,
        CURLY_OPEN,
-        CURLY_CLOSE
+        SCRIPTLET,
+        CURLY_CLOSE,
+        BLOCK_SCRIPTLET_OPEN,
+        EXPRESSION_SCRIPTLET_OPEN,
+        SCRIPTLET_CLOSE,
+
+        CLASS_NAME,
+        PACKAGE_NAME,
+        DOT,
+
+        WHITESPACE,
+
+        KEY,
+        EQUALS,
+
+        DOUBLE_QUOTE,
+        STRING,
+        SINGLE_QUOTE,
+
+        COMPONENT_START,
+        FORWARD_SLASH,
+        COMPONENT_END,
        ;

        boolean isAnyOf(Collection<Type> types) {
--- a/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/Tokenizer.java
+++ b/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/Tokenizer.java
@ -1,82 +1,23 @@
 package com.jessebrault.gcp.tokenizer;

-import java.util.LinkedList;
 import java.util.Queue;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;

 public final class Tokenizer {

-    private static final Pattern lessThan = Pattern.compile("^<");
-    private static final Pattern greaterThan = Pattern.compile("^>");
-    private static final Pattern percent = Pattern.compile("^%");
-    private static final Pattern equals = Pattern.compile("^=");
-    private static final Pattern doubleQuote = Pattern.compile("^\"");
-    private static final Pattern singleQuote = Pattern.compile("^'");
-    private static final Pattern forwardSlash = Pattern.compile("^/");
-    private static final Pattern identifier = Pattern.compile("^[\\p{Ll}0-9_$][\\p{L}0-9_$]*");
-    private static final Pattern capitalizedIdentifier = Pattern.compile("^\\p{Lu}[\\p{L}0-9_$]*");
-    private static final Pattern dot = Pattern.compile("^\\.");
-    private static final Pattern word = Pattern.compile("^[\\w\\W&&[^\\s\\n\\r]]+");
-    private static final Pattern whitespace = Pattern.compile("^[\\s&&[^\n\r]]");
-    private static final Pattern newline = Pattern.compile("^[\\n\\r]");
-
    public static Queue<Token> tokenize(final String gcpSrc) {
-        Queue<Token> tokens = new LinkedList<>();
+        final var acc = new Accumulator();
+        final var fsm = TokenizerFsm.get(acc);

-        var line = 1;
-        var col = 1;
-        
        String remaining = gcpSrc;
        while (remaining.length() > 0) {
-            Matcher m;
-            
-            if ((m = lessThan.matcher(remaining)).find()) {
-                tokens.add(new Token(Token.Type.LESS_THAN, m.group(0), line, col));
-                col += m.group(0).length();
-            } else if ((m = greaterThan.matcher(remaining)).find()) {
-                tokens.add(new Token(Token.Type.GREATER_THAN, m.group(0), line, col));
-                col += m.group(0).length();
-            } else if ((m = percent.matcher(remaining)).find()) {
-                tokens.add(new Token(Token.Type.PERCENT, m.group(0), line, col));
-                col += m.group(0).length();
-            } else if ((m = equals.matcher(remaining)).find()) {
-                tokens.add(new Token(Token.Type.EQUALS, m.group(0), line, col));
-                col += m.group(0).length();
-            } else if ((m = doubleQuote.matcher(remaining)).find()) {
-                tokens.add(new Token(Token.Type.DOUBLE_QUOTE, m.group(0), line, col));
-                col += m.group(0).length();
-            } else if ((m = singleQuote.matcher(remaining)).find()) {
-                tokens.add(new Token(Token.Type.SINGLE_QUOTE, m.group(0), line, col));
-                col += m.group(0).length();
-            } else if ((m = identifier.matcher(remaining)).find()) {
-                tokens.add(new Token(Token.Type.IDENTIFIER, m.group(0), line, col));
-                col += m.group(0).length();
-            } else if ((m = forwardSlash.matcher(remaining)).find()) {
-                tokens.add(new Token(Token.Type.FORWARD_SLASH, m.group(0), line, col));
-                col += m.group(0).length();
-            } else if ((m = capitalizedIdentifier.matcher(remaining)).find()) {
-                tokens.add(new Token(Token.Type.CAPITALIZED_IDENTIFIER, m.group(0), line, col));
-                col += m.group(0).length();
-            } else if ((m = dot.matcher(remaining)).find()) {
-                tokens.add(new Token(Token.Type.DOT, m.group(0), line, col));
-                col += m.group(0).length();
-            } else if ((m = word.matcher(remaining)).find()) {
-                tokens.add(new Token(Token.Type.WORD, m.group(0), line, col));
-                col += m.group(0).length();
-            } else if ((m = whitespace.matcher(remaining)).find()) {
-                tokens.add(new Token(Token.Type.WHITESPACE, m.group(0), line, col));
-                col += m.group(0).length();
-            } else if ((m = newline.matcher(remaining)).find()) {
-                tokens.add(new Token(Token.Type.NEWLINE, m.group(0), line, col));
-                col = 1;
-                line++;
+            final var o = fsm.apply(remaining);
+            if (o == null) {
+                throw new IllegalStateException();
            }
-
-            remaining = remaining.substring(m.group(0).length());
+            remaining = remaining.substring(o.entire().length());
        }

-        return tokens;
+        return acc.getTokens();
    }

 }
--- a/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/TokenizerFsm.java
+++ b/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/TokenizerFsm.java
@ -8,39 +8,85 @@ import static com.jessebrault.gcp.tokenizer.Token.Type.*;

 import java.util.regex.Pattern;

-class TokenizerFsm {
+final class TokenizerFsm {

-    private static final PatternMatcher lessThan = new PatternMatcher(Pattern.compile("^<"));
-    private static final PatternMatcher greaterThan = new PatternMatcher(Pattern.compile("^>"));
-    private static final PatternMatcher percent = new PatternMatcher(Pattern.compile("^%"));
-    private static final PatternMatcher equals = new PatternMatcher(Pattern.compile("^="));
-    private static final PatternMatcher forwardSlash = new PatternMatcher(Pattern.compile("^/"));
-    private static final PatternMatcher identifier = new PatternMatcher(Pattern.compile("^[\\p{Ll}0-9_$][\\p{L}0-9_$]*"));
-    private static final PatternMatcher capitalizedIdentifier = new PatternMatcher(Pattern.compile("^\\p{Lu}[\\p{L}0-9_$]*"));
-    private static final PatternMatcher dot = new PatternMatcher(Pattern.compile("^\\."));
-    private static final PatternMatcher word = new PatternMatcher(Pattern.compile("^[\\w\\W&&[^\\s\\n\\r]]+"));
-    private static final PatternMatcher whitespace = new PatternMatcher(Pattern.compile("^[\\s&&[^\n\r]]"));
-    private static final PatternMatcher newline = new PatternMatcher(Pattern.compile("^[\\n\\r]"));
-
-    private static final PatternMatcher dollarReference = new PatternMatcher(Pattern.compile("^(\\$)([\\w$]+(?:\\.[\\w$]+)*)"));
-
-    private static final PatternMatcher openingComponentStart = new PatternMatcher(Pattern.compile("^<(?=\\p{Lu}|\\p{L}+(?:\\.\\p{L}+)+)"));
-
-    private static final PatternMatcher doubleQuote = new PatternMatcher(Pattern.compile("^\""));
-    private static final PatternMatcher singleQuote = new PatternMatcher(Pattern.compile("^'"));
+    /**
+     * Text
+     */
+    private static final PatternMatcher text = new PatternMatcher(
+            Pattern.compile("^(?:[\\w\\W&&[^<$]]|<(?!%|/?\\p{Lu}|/?[\\p{L}0-9_$]+(?:\\.[\\p{L}0-9_$]+)+)|\\$(?![\\w$]+(?:\\.[\\w$]+)*))+")
+    );

+    /**
+     * Gsp dollar reference and scriptlets, also used as component values
+     */
+    private static final PatternMatcher dollarReference = new PatternMatcher(
+            Pattern.compile("^(\\$)([\\w$]+(?:\\.[\\w$]+)*)")
+    );
    private static final DollarScriptletMatcher dollarScriptlet = new DollarScriptletMatcher();
+    private static final PatternMatcher blockScriptlet = new PatternMatcher(
+            Pattern.compile("^(<%)(.*?)(%>)")
+    );
+    private static final PatternMatcher expressionScriptlet = new PatternMatcher(
+            Pattern.compile("^(<%=)(.*?)(%>)")
+    );
+
+    /**
+     * Component starts
+     */
+    private static final PatternMatcher openingComponentStart = new PatternMatcher(
+            Pattern.compile("^<(?=\\p{Lu}|[\\p{L}0-9_$]+(?:\\.[\\p{L}0-9_$]+)+)")
+    );
+    private static final PatternMatcher closingComponentStart = new PatternMatcher(
+            Pattern.compile("^(<)(/)(?=\\p{Lu}|[\\p{L}0-9_$]+(?:\\.[\\p{L}0-9_$]+)+)")
+    );
+
+    /**
+     * Component names
+     */
+    private static final PatternMatcher className = new PatternMatcher(
+            Pattern.compile("^\\p{Lu}[\\p{L}0-9_$]*")
+    );
+    private static final PatternMatcher packageName = new PatternMatcher(
+            Pattern.compile("^[\\p{L}0-9_$]+(?=\\.)")
+    );
+    private static final PatternMatcher dot = new PatternMatcher(
+            Pattern.compile("^\\.")
+    );
+
+    /**
+     * Whitespace
+     */
+    private static final PatternMatcher whitespace = new PatternMatcher(Pattern.compile("^[\\s&&[^\n\r]]+"));
+
+    /**
+     * Keys and values
+     */
+    private static final PatternMatcher key = new PatternMatcher(
+            Pattern.compile("^[\\p{L}0-9_$]+")
+    );
+    private static final PatternMatcher equals = new PatternMatcher(Pattern.compile("^="));
+
+    /**
+     * Component ends
+     */
+    private static final PatternMatcher forwardSlash = new PatternMatcher(Pattern.compile("^/"));
+    private static final PatternMatcher componentEnd = new PatternMatcher(Pattern.compile("^>"));

    private static FunctionFsmBuilder<String, TokenizerState, FsmOutput> getFsmBuilder() {
        return new FunctionFsmBuilderImpl<>();
    }

-    public FunctionFsm<String, TokenizerState, FsmOutput> getFsm(Accumulator acc) {
+    public static FunctionFsm<String, TokenizerState, FsmOutput> get(Accumulator acc) {
        return getFsmBuilder()
+                .setInitialState(TokenizerState.NORMAL)
                .whileIn(TokenizerState.NORMAL, sc -> {
-                    sc.on(dollarReference).exec(r -> {
-                        acc.accumulate(DOLLAR, r.part(1));
-                        acc.accumulate(GROOVY_REFERENCE, r.part(2));
+                    sc.on(text).exec(o -> {
+                        acc.accumulate(TEXT, o.entire());
+                    });
+                    sc.on(dollarReference).exec(o -> {
+                        acc.accumulate(DOLLAR, o.part(1));
+                        acc.accumulate(GROOVY_REFERENCE, o.part(2));
                    });
                    sc.on(dollarScriptlet).exec(o -> {
                        acc.accumulate(DOLLAR, o.part(1));
@ -48,11 +94,92 @@ class TokenizerFsm {
                        acc.accumulate(SCRIPTLET, o.part(3));
                        acc.accumulate(CURLY_CLOSE, o.part(4));
                    });
-                    sc.on(openingComponentStart).shiftTo(TokenizerState.COMPONENT).exec(r -> {
-                        acc.accumulate(LESS_THAN, r.entire());
+                    sc.on(blockScriptlet).exec(o -> {
+                        acc.accumulate(BLOCK_SCRIPTLET_OPEN, o.part(1));
+                        acc.accumulate(SCRIPTLET, o.part(2));
+                        acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
                    });
+                    sc.on(expressionScriptlet).exec(o -> {
+                        acc.accumulate(EXPRESSION_SCRIPTLET_OPEN, o.part(1));
+                        acc.accumulate(SCRIPTLET, o.part(2));
+                        acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
+                    });
+                    sc.on(openingComponentStart).shiftTo(TokenizerState.COMPONENT_NAME).exec(o ->
+                        acc.accumulate(COMPONENT_START, o.entire())
+                    );
+                    sc.on(closingComponentStart).shiftTo(TokenizerState.COMPONENT_NAME).exec(o -> {
+                        acc.accumulate(COMPONENT_START, o.part(1));
+                        acc.accumulate(FORWARD_SLASH, o.part(2));
+                    });
+                    sc.onNoMatch().exec(input -> { throw new IllegalArgumentException(); });
+                })
+                .whileIn(TokenizerState.COMPONENT_NAME, sc -> {
+                    sc.on(packageName).exec(o -> {
+                       acc.accumulate(PACKAGE_NAME, o.entire());
+                    });
+                    sc.on(dot).exec(o -> {
+                        acc.accumulate(DOT, o.entire());
+                    });
+                    sc.on(className).exec(o -> {
+                        acc.accumulate(CLASS_NAME, o.entire());
+                    });
+                    sc.on(forwardSlash).exec(o -> {
+                        acc.accumulate(FORWARD_SLASH, o.entire());
+                    });
+                    sc.on(componentEnd).shiftTo(TokenizerState.NORMAL).exec(o -> {
+                       acc.accumulate(COMPONENT_END, o.entire());
+                    });
+                    sc.on(whitespace).shiftTo(TokenizerState.COMPONENT_KEYS_AND_VALUES).exec(o -> {
+                        acc.accumulate(WHITESPACE, o.entire());
+                    });
+                    sc.onNoMatch().exec(input -> { throw new IllegalArgumentException(); });
+                })
+                .whileIn(TokenizerState.COMPONENT_KEYS_AND_VALUES, sc -> {
+                    sc.on(componentEnd).shiftTo(TokenizerState.NORMAL).exec(o -> {
+                        acc.accumulate(COMPONENT_END, o.entire());
+                    });
+                    sc.on(whitespace).exec(o -> {
+                        acc.accumulate(WHITESPACE, o.entire());
+                    });
+                    sc.on(key).exec(o -> {
+                        acc.accumulate(KEY, o.entire());
+                    });
+                    sc.on(equals).exec(o -> {
+                       acc.accumulate(EQUALS, o.entire());
+                    });
+                    // sc.on(gString)
+                    // sc.on(singleQuoteString)
+                    sc.on(dollarReference).exec(o -> {
+                        acc.accumulate(DOLLAR, o.part(1));
+                        acc.accumulate(GROOVY_REFERENCE, o.part(2));
+                    });
+                    sc.on(dollarScriptlet).exec(o -> {
+                        acc.accumulate(DOLLAR, o.part(1));
+                        acc.accumulate(CURLY_OPEN, o.part(2));
+                        acc.accumulate(SCRIPTLET, o.part(3));
+                        acc.accumulate(CURLY_CLOSE, o.part(4));
+                    });
+                    sc.on(blockScriptlet).exec(o -> {
+                        acc.accumulate(BLOCK_SCRIPTLET_OPEN, o.part(1));
+                        acc.accumulate(SCRIPTLET, o.part(2));
+                        acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
+                    });
+                    sc.on(expressionScriptlet).exec(o -> {
+                        acc.accumulate(EXPRESSION_SCRIPTLET_OPEN, o.part(1));
+                        acc.accumulate(SCRIPTLET, o.part(2));
+                        acc.accumulate(SCRIPTLET_CLOSE, o.part(3));
+                    });
+                    sc.on(forwardSlash).exec(o -> {
+                        acc.accumulate(FORWARD_SLASH, o.entire());
+                    });
+                    sc.on(componentEnd).shiftTo(TokenizerState.NORMAL).exec(o -> {
+                        acc.accumulate(COMPONENT_END, o.entire());
+                    });
+                    sc.onNoMatch().exec(input -> { throw new IllegalArgumentException(); });
                })
                .build();
    }
+
+
    
 }
--- a/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/TokenizerState.java
+++ b/gcp-impl/src/main/groovy/com/jessebrault/gcp/tokenizer/TokenizerState.java
@ -2,7 +2,6 @@ package com.jessebrault.gcp.tokenizer;

 enum TokenizerState {
    NORMAL,
-    COMPONENT,
-    G_STRING,
-    SCRIPTLET
+    COMPONENT_NAME,
+    COMPONENT_KEYS_AND_VALUES
 }
--- a/gcp-impl/src/test/groovy/com/jessebrault/gcp/tokenizer/TokenizerTests.groovy
+++ b/gcp-impl/src/test/groovy/com/jessebrault/gcp/tokenizer/TokenizerTests.groovy
@ -1,5 +1,6 @@
 package com.jessebrault.gcp.tokenizer

+import org.junit.jupiter.api.Disabled
 import org.junit.jupiter.api.Test
 import org.slf4j.Logger
 import org.slf4j.LoggerFactory
@ -81,56 +82,55 @@ class TokenizerTests {
    }

    @Test
-    void doctypeHtml() {
+    void doctypeHtmlIsText() {
        test('<!DOCTYPE html>') {
-            expect LESS_THAN, '<', 1, 1
-            expect WORD, '!DOCTYPE', 1, 2
-            expect WHITESPACE, ' ', 1, 10
-            expect IDENTIFIER, 'html', 1, 11
-            expect GREATER_THAN, '>', 1, 15
+            expect TEXT, '<!DOCTYPE html>', 1, 1
        }
    }

    @Test
-    void htmlLangEn() {
+    void htmlLangEnIsText() {
        test('<html lang="en">') {
-            expect LESS_THAN, '<', 1, 1
-            expect IDENTIFIER, 'html', 1, 2
-            expect WHITESPACE, ' ', 1, 6
-            expect IDENTIFIER, 'lang', 1, 7
-            expect EQUALS, '=', 1, 11
-            expect DOUBLE_QUOTE, '"', 1, 12
-            expect IDENTIFIER, 'en', 1, 13
-            expect DOUBLE_QUOTE, '"', 1, 15
-            expect GREATER_THAN, '>', 1, 16
+            expect TEXT, '<html lang="en">', 1, 1
        }
    }

    @Test
    void component() {
        test('<Test />') {
-            expect LESS_THAN, '<', 1, 1
-            expect CAPITALIZED_IDENTIFIER, 'Test', 1, 2
+            expect COMPONENT_START, '<', 1, 1
+            expect CLASS_NAME, 'Test', 1, 2
            expect WHITESPACE, ' ', 1, 6
            expect FORWARD_SLASH, '/', 1, 7
-            expect GREATER_THAN, '>', 1, 8
+            expect COMPONENT_END, '>', 1, 8
        }
    }

    @Test
+    @Disabled
    void componentWithKeysAndValues() {
        test('<Test test="test" />') {
-            expect LESS_THAN, '<', 1, 1
-            expect CAPITALIZED_IDENTIFIER, 'Test', 1, 2
+            expect COMPONENT_START, '<', 1, 1
+            expect CLASS_NAME, 'Test', 1, 2
            expect WHITESPACE, ' ', 1, 6
-            expect IDENTIFIER, 'test', 1, 7
+            expect KEY, 'test', 1, 7
            expect EQUALS, '=', 1, 11
            expect DOUBLE_QUOTE, '"', 1, 12
            expect STRING, 'test', 1, 13
            expect DOUBLE_QUOTE, '"', 1, 17
            expect WHITESPACE, ' ', 1, 18
            expect FORWARD_SLASH, '/', 1, 19
-            expect GREATER_THAN, '>', 1, 20
+            expect COMPONENT_END, '>', 1, 20
+        }
+    }
+
+    @Test
+    void newlinesCounted() {
+        test('Hello,\n$person!') {
+            expect TEXT, 'Hello,\n', 1, 1
+            expect DOLLAR, '$', 2, 1
+            expect GROOVY_REFERENCE, 'person', 2, 2
+            expect TEXT, '!', 2, 8
        }
    }