SWAT-engineering · sungshik · Sep 6, 2024 · Aug 23, 2024 · Aug 23, 2024 · Aug 23, 2024
diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc
@@ -31,15 +31,6 @@ bool tryParse(Grammar g, Symbol s, str input, bool allowAmbiguity = false) {
     return false;
 }
 
-@synopsis{
-    Gets the terminals that occur in production `p`, possibly recursively
-    (default: `true`)
-}
-
-set[Symbol] getTerminals(Grammar g, Production p, bool recur = true)
-    = {s | s <- p.symbols, !isNonTerminalType(s)}
-    + {*getTerminals(g, child) | recur, s <- p.symbols, child <- lookup(g, s)};
-
 @synopsis{
     Lookups a list of productions for symbol `s` in grammar `g`, replacing
     formal parameters with actual parameters when needed
@@ -84,21 +75,26 @@ Symbol expand(\iter-star-seps(symbol, separators))
     Removes the label from symbol `s`, if any
 }
 
-Symbol delabel(label(_, Symbol s)) = s;
-default Symbol delabel(Symbol s)   = s;
+Symbol delabel(\label(_, Symbol s)) = delabel(s); // recurse so that nested labels are stripped as well
+default Symbol delabel(Symbol s)    = s; // not a label: returned unchanged
 
 @synopsis{
     Removes operators `?` and `*` from symbol `s`, if any
 }
 
-Symbol destar(label(name, symbol))
+Symbol destar(\label(name, symbol))
     = label(name, destar(symbol));
+
 Symbol destar(\opt(symbol))
     = destar(symbol);
 Symbol destar(\iter-star(symbol))
     = \iter(destar(symbol));
 Symbol destar(\iter-star-seps(symbol, separators))
     = \iter-seps(destar(symbol), separators);
+Symbol destar(\seq([symbol])) // the list pattern matches one-element sequences only
+    = \seq([destar(symbol)]);
+Symbol destar(\alt({symbol})) // the set pattern matches one-element alternatives only
+    = \alt({destar(symbol)});
 
 default Symbol destar(Symbol s) = s;
 

diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc
@@ -7,17 +7,129 @@ module lang::rascal::grammar::analyze::Newlines
 import Grammar;
 import ParseTree;
 import String;
+import util::Maybe;
 
 import lang::rascal::grammar::Util;
+import util::MaybeUtil;
+
+@synopsis{
+    Representation of a *newline-free* segment of symbols
+}
+
+alias Segment = list[Symbol];
+
+@synopsis{
+    Gets the (newline-free) segments of a production/list of symbols in grammar
+    `g`, separated by symbols that have a newline (not part of any segment),
+    recursively for non-terminals. For instance, the segments of
+    `[lit("foo"), lit("bar"), lit("\n"), lit("baz")]` are:
+      - `[lit("foo"), lit("bar")]`;
+      - `[lit("baz")]`.
+}
+
+// Segments of production `p`, read from the memoized per-production table of `g`.
+set[Segment] getSegments(Grammar g, Production p)
+    = unmaybe(getSegmentsByProduction(g)[p]);
+
+// Segments of a symbol list; non-terminals use the per-production table of `g`.
+set[Segment] getSegments(Grammar g, list[Symbol] symbols)
+    = unmaybe(getSegmentsWithEnvironment(
+        g, symbols, getSegmentsByProduction(g)));
+
+@memo
+private map[Production, Maybe[set[Segment]]] getSegmentsByProduction(Grammar g) {
+    map[Production, Maybe[set[Segment]]] ret = (p : nothing() | /p: prod(_, _, _) := g); // every production in `g` starts unresolved
+
+    solve (ret) { // repeat until a pass leaves `ret` unchanged
+        for (p <- ret, nothing() == ret[p]) { // revisit only the productions that are still unresolved
+            ret[p] = getSegmentsWithEnvironment(g, p.symbols, ret); // stays `nothing()` while a referenced production is unresolved
+        }
+    }
+
+    return ret; // entries that are still `nothing()` are turned into `{}` by `unmaybe` in `getSegments`
+}
+
+private Maybe[set[Segment]] getSegmentsWithEnvironment(
+        Grammar g, list[Symbol] symbols, 
+        map[Production, Maybe[set[Segment]]] env) {
+    // Returns the segments of `symbols`, or `nothing()` if `env` has no segments yet for a referenced production.
+    // General idea: Recursively traverse `symbols` from left to right, while
+    // keeping track of a "running segment" (initially empty). Each time a
+    // symbol that has a newline is encountered, finish/collect the running
+    // segment, and start a new one for the remainder of `symbols`.
+
+    // Final case: No symbols remaining
+    Maybe[set[Segment]] get(Segment runningSegment, []) {
+        return just(_ <- runningSegment ? {runningSegment} : {}); // an empty running segment yields no segment
+    }
+
+    // Recursive case: At least one symbol remaining
+    Maybe[set[Segment]] get(Segment segment, [Symbol head, *Symbol tail]) {
+        set[Symbol] nested = {s | /Symbol s := head}; // `head` itself and all symbols nested in it
+
+        // If the head contains a non-terminal, then: (1) finish the running
+        // segment; (2) lookup the segments of the non-terminals in the
+        // environment, if any; (3) compute the segments of the tail. Return the
+        // union of 1-3.
+        if (any(s <- nested, isNonTerminalType(s))) {
+
+            list[Maybe[set[Segment]]] sets
+                = [get(segment, [])] // (1)
+                + [env[p] | s <- nested, isNonTerminalType(s), p <- lookup(g, s)] // (2)
+                + [get([], tail)]; // (3)
+
+            return (sets[0] | union(it, \set) | \set <- sets[1..]); // a single `nothing()` makes the whole result `nothing()`
+
+        }
+
+        // If the head doesn't contain a non-terminal, but it has a newline,
+        // then: (1) finish the running segment; (2) compute the segments of the
+        // tail. Return the union of 1-2. Note: the head is ignored and won't be
+        // part of any segment.
+        else if (any(s <- nested, hasNewline(g, s))) { // NOTE(review): only effective if `hasNewline(g, s)` recognizes terminals such as `lit("\n")` -- confirm
+            return union(get(segment, []), get([], tail));
+        }
+
+        // If the head doesn't contain a non-terminal, and if it doesn't have a
+        // newline, then add the head to the running segment and proceed with
+        // the tail.
+        else {
+            return get(segment + head, tail);
+        }
+    }
+
+    return get([], symbols);
+}
+
+@synopsis{
+    Checks if a symbol has a newline character
+}
+
+bool hasNewline(Grammar g, Symbol s) // literals and character ranges in `s` are checked directly, as `g` may not define productions for terminals
+    = any(/\lit(str text) := s, contains(text, "\n")) || any(/r: range(_, _) := s, hasNewline(r))
+    || any(p <- lookup(g, delabel(s)), hasNewline(g, p));
 
 @synopsis{
     Checks if a production has a newline character
 }
 
-bool hasNewline(Grammar g, prod(_, symbols, _)) {
-    set[Symbol] nonTerminals = {s | /Symbol s := symbols, isNonTerminalType(s)};
-    return any(/r: range(_, _) := symbols, hasNewline(r)) ||
-        any(s <- nonTerminals, Production p <- lookup(g, s), hasNewline(g, p));
+// Answered from the memoized per-production table of `g`.
+bool hasNewline(Grammar g, Production p)
+    = hasNewlineByProduction(g)[p];
+
+@memo
+private map[Production, bool] hasNewlineByProduction(Grammar g) {
+    // Fixed point: every production starts as "no newline"; flags only ever turn true
+    map[Production, bool] table = (p: false | /p: prod(_, _, _) := g);
+    solve (table) {
+        for (p <- table, !table[p]) {
+            // Newline in `p`'s own character ranges, or in a production of a non-terminal in `p`
+            set[Symbol] referenced = {s | /Symbol s := p.symbols, isNonTerminalType(s)};
+            bool own = any(/r: range(_, _) := p.symbols, hasNewline(r));
+            table[p] = own || any(s <- referenced, Production child <- lookup(g, s), table[child]);
+        }
+    }
+    return table;
 }
 
 @synopsis{

diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc
@@ -20,6 +20,7 @@ import ParseTree;
 import util::Maybe;
 
 import lang::rascal::grammar::Util;
+import util::MaybeUtil;
 
 @synopsis{
     Representation of a traversal direction along a list of symbols
@@ -112,16 +113,6 @@ private map[Symbol, Maybe[set[Symbol]]] followBySymbol(Grammar g, bool(Symbol) p
     return ret;
 }
 
-private set[Symbol] unmaybe(just(set[Symbol] \set))
-    = \set;
-private set[Symbol] unmaybe(nothing())
-    = {};
-
-private Maybe[set[Symbol]] union(just(set[Symbol] \set1), just(set[Symbol] \set2))
-    = just(\set1 + \set2);
-private default Maybe[set[Symbol]] union(Maybe[set[Symbol]] _, Maybe[set[Symbol]] _)
-    = nothing();
-
 @synopsis{
     Checks if symbol `s` is a terminal
 }

diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
@@ -215,11 +215,17 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
             // Simple case: each unit does have an `end` inner delimiter
             if (_ <- group && all(u <- group, just(_) := u.innerDelimiters.end)) {
 
-                // Compute a list of terminals that need to be consumed between
+                // Compute a list of segments that need to be consumed between
                 // the `begin` delimiter and the `end` delimiters. Each of these
-                // terminals will be converted to a match pattern.
-                list[Symbol] terminals = [*getTerminals(rsc, u.prod) | u <- group];
-                terminals = [s | s <- terminals, s notin begins && s notin ends];
+                // segments will be converted to a match pattern.
+                set[list[Symbol]] segments = {*getSegments(rsc, u.prod) | u <- group};
+
+                list[Symbol] terminals
+                    = [\seq([   *ys   ]) | [x, *ys, z] <- segments, x == begin, z    in ends]
+                    + [\seq([   *ys, z]) | [x, *ys, z] <- segments, x == begin, z notin ends]
+                    + [\seq([x, *ys   ]) | [x, *ys, z] <- segments, x != begin, z    in ends]
+                    + [\seq([x, *ys, z]) | [x, *ys, z] <- segments, x != begin, z notin ends];
+
                 terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
                 terminals = dup(terminals);
                 terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)

diff --git a/rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc b/rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc
@@ -0,0 +1,28 @@
+@synopsis{
+    Utility functions for `Maybe` values
+}
+
+module util::MaybeUtil
+
+import util::Maybe;
+
+@synopsis{
+    Returns the set of a `Maybe` value when present. Returns the empty set when
+    absent.
+}
+
+set[&T] unmaybe(Maybe[set[&T]] _: just(set[&T] elements))
+    = elements;
+set[&T] unmaybe(Maybe[set[&T]] _: nothing())
+    = {};
+
+@synopsis{
+    Returns just the union of the sets of two `Maybe` values when both are
+    present. Returns nothing if either of them is absent.
+}
+
+Maybe[set[&T]] union(just(set[&T] left), just(set[&T] right))
+    = just(left + right);
+// Fallback: at least one of the two operands is `nothing()`
+default Maybe[set[&T]] union(Maybe[set[&T]] _, Maybe[set[&T]] _)
+    = nothing();
diff --git a/vscode-extension/syntaxes/pico.tmLanguage.json b/vscode-extension/syntaxes/pico.tmLanguage.json
@@ -14,7 +14,7 @@
       "end": "(\\\")",
       "patterns": [
         {
-          "match": "((?:\\\")[\\x{01}-\\!\\#-\\x{10FFFF}]*?(?:\\\"))",
+          "match": "([\\x{01}-\\!\\#-\\x{10FFFF}]+?)",
           "captures": {
             "1": {
               "name": "string.quoted.double"

diff --git a/vscode-extension/syntaxes/rascal.tmLanguage.json b/vscode-extension/syntaxes/rascal.tmLanguage.json
@@ -221,7 +221,39 @@
       "end": "((?:\\\")|(?:\\<))",
       "patterns": [
         {
-          "match": "((?:(?:(?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])|[\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}]|(?:(?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))|(?:(?:(?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)a[0-7][0-9A-Fa-f])))+?)",
+          "match": "((?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
+        {
+          "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)a[0-7][0-9A-Fa-f])",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -254,15 +286,39 @@
       "end": "((?:\\\")|(?:\\<))",
       "patterns": [
         {
-          "match": "((?:(?:(?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])|[\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}]|(?:(?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))|(?:(?:(?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)a[0-7][0-9A-Fa-f])))+?)",
+          "match": "((?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
             }
           }
         },
         {
-          "match": "((?:(?:(?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])|[\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}]|(?:(?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))|(?:(?:(?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)a[0-7][0-9A-Fa-f])))+?)",
+          "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)a[0-7][0-9A-Fa-f])",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -477,7 +533,39 @@
       "end": "(\\')",
       "patterns": [
         {
-          "match": "((?:(?:(?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])|[\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}]|(?:(?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))|(?:(?:(?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)a[0-7][0-9A-Fa-f])))+?)",
+          "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?)",
+          "captures": {
+            "1": {
+              "name": "string.quoted.single"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
+          "captures": {
+            "1": {
+              "name": "string.quoted.single"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])",
+          "captures": {
+            "1": {
+              "name": "string.quoted.single"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)a[0-7][0-9A-Fa-f])",
+          "captures": {
+            "1": {
+              "name": "string.quoted.single"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
           "captures": {
             "1": {
               "name": "string.quoted.single"