From dd9d24af6290e9f0f181f87834b688ad942f4923 Mon Sep 17 00:00:00 2001
From: Sung-Shik Jongmans <sung-shik.jongmans@swat.engineering>
Date: Fri, 23 Aug 2024 16:09:20 +0200
Subject: [PATCH 01/16] Extend `destar` with cases for `\seq` and `\alt`

---
 .../src/main/rascal/lang/rascal/grammar/Util.rsc      | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc
index a538be6..7830b76 100644
--- a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc
+++ b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc
@@ -84,21 +84,26 @@ Symbol expand(\iter-star-seps(symbol, separators))
     Removes the label from symbol `s`, if any
 }
 
-Symbol delabel(label(_, Symbol s)) = s;
-default Symbol delabel(Symbol s)   = s;
+Symbol delabel(\label(_, Symbol s)) = delabel(s);
+default Symbol delabel(Symbol s)    = s;
 
 @synopsis{
     Removes operators `?` and `*` from symbol `s`, if any
 }
 
-Symbol destar(label(name, symbol))
+Symbol destar(\label(name, symbol))
     = label(name, destar(symbol));
+
 Symbol destar(\opt(symbol))
     = destar(symbol);
 Symbol destar(\iter-star(symbol))
     = \iter(destar(symbol));
 Symbol destar(\iter-star-seps(symbol, separators))
     = \iter-seps(destar(symbol), separators);
+Symbol destar(\seq([symbol]))
+    = \seq([destar(symbol)]);
+Symbol destar(\alt({symbol}))
+    = \alt({destar(symbol)});
 
 default Symbol destar(Symbol s) = s;
 

From 8155fb6618c0b0c32b51f9587e1be3b1fec4ed5c Mon Sep 17 00:00:00 2001
From: Sung-Shik Jongmans <sung-shik.jongmans@swat.engineering>
Date: Fri, 23 Aug 2024 16:09:49 +0200
Subject: [PATCH 02/16] Add module `MaybeUtil`

---
 .../src/main/rascal/util/MaybeUtil.rsc        | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc

diff --git a/rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc b/rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc
new file mode 100644
index 0000000..bd34f33
--- /dev/null
+++ b/rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc
@@ -0,0 +1,28 @@
+@synopsis{
+    Utility functions for `Maybe` values
+}
+
+module util::MaybeUtil
+
+import util::Maybe;
+
+@synopsis{
+    Returns the set of a `Maybe` value when present. Returns the empty set when
+    absent.
+}
+
+set[&T] unmaybe(Maybe[set[&T]] _: nothing())
+    = {};
+set[&T] unmaybe(Maybe[set[&T]] _: just(set[&T] \set))
+    = \set;
+
+@synopsis{
+    Returns just the union of the sets of two `Maybe` values when both are
+    present. Returns nothing if either is absent.
+}
+
+Maybe[set[&T]] union(just(set[&T] set1), just(set[&T] set2))
+    = just(set1 + set2);
+
+default Maybe[set[&T]] union(Maybe[set[&T]] _, Maybe[set[&T]] _)
+    = nothing();
\ No newline at end of file

From 689545cb4f7bdbce118be246afea406fcdfb9e0f Mon Sep 17 00:00:00 2001
From: Sung-Shik Jongmans <sung-shik.jongmans@swat.engineering>
Date: Fri, 23 Aug 2024 16:10:57 +0200
Subject: [PATCH 03/16] Use new module `MaybeUtil` in existing code

---
 .../rascal/lang/rascal/grammar/analyze/Symbols.rsc    | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc
index a1b01b8..320252e 100644
--- a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc
+++ b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc
@@ -20,6 +20,7 @@ import ParseTree;
 import util::Maybe;
 
 import lang::rascal::grammar::Util;
+import util::MaybeUtil;
 
 @synopsis{
     Representation of a traversal direction along a list of symbols
@@ -112,16 +113,6 @@ private map[Symbol, Maybe[set[Symbol]]] followBySymbol(Grammar g, bool(Symbol) p
     return ret;
 }
 
-private set[Symbol] unmaybe(just(set[Symbol] \set))
-    = \set;
-private set[Symbol] unmaybe(nothing())
-    = {};
-
-private Maybe[set[Symbol]] union(just(set[Symbol] \set1), just(set[Symbol] \set2))
-    = just(\set1 + \set2);
-private default Maybe[set[Symbol]] union(Maybe[set[Symbol]] _, Maybe[set[Symbol]] _)
-    = nothing();
-
 @synopsis{
     Checks if symbol `s` is a terminal
 }

From c464799d0110b8128c4f082afa2f7291b4e757a7 Mon Sep 17 00:00:00 2001
From: Sung-Shik Jongmans <sung-shik.jongmans@swat.engineering>
Date: Fri, 23 Aug 2024 16:17:22 +0200
Subject: [PATCH 04/16] Add function to compute the newline-separated segments
 of a list of symbols

---
 .../lang/rascal/grammar/analyze/Newlines.rsc  | 120 +++++++++++++++++-
 1 file changed, 116 insertions(+), 4 deletions(-)

diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc
index 6bdb3ca..c0f5d6d 100644
--- a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc
+++ b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc
@@ -7,17 +7,129 @@ module lang::rascal::grammar::analyze::Newlines
 import Grammar;
 import ParseTree;
 import String;
+import util::Maybe;
 
 import lang::rascal::grammar::Util;
+import util::MaybeUtil;
+
+@synopsis{
+    Representation of a *newline-free* segment of symbols
+}
+
+alias Segment = list[Symbol];
+
+@synopsis{
+    Gets the (newline-free) segments of a production/list of symbols in grammar
+    `g`, separated by symbols that have a newline (not part of any segment),
+    recursively for non-terminals. For instance, the segments of
+    `[lit("foo"), lit("bar"), lit("\n"), lit("baz")]` are:
+      - `[lit("foo"), lit("bar")]`;
+      - `[lit("baz")]`.
+}
+
+set[Segment] getSegments(Grammar g, Production p) {
+    return unmaybe(getSegmentsByProduction(g)[p]);
+}
+
+set[Segment] getSegments(Grammar g, list[Symbol] symbols) {
+    map[Production, Maybe[set[Segment]]] env = getSegmentsByProduction(g);
+    return unmaybe(getSegmentsWithEnvironment(g, symbols, env));
+}
+
+@memo
+private map[Production, Maybe[set[Segment]]] getSegmentsByProduction(Grammar g) {
+    map[Production, Maybe[set[Segment]]] ret = (p : nothing() | /p: prod(_, _, _) := g);
+
+    solve (ret) {
+        for (p <- ret, nothing() == ret[p]) {
+            ret[p] = getSegmentsWithEnvironment(g, p.symbols, ret);
+        }
+    }
+
+    return ret;
+}
+
+private Maybe[set[Segment]] getSegmentsWithEnvironment(
+        Grammar g, list[Symbol] symbols, 
+        map[Production, Maybe[set[Segment]]] env) {
+
+    // General idea: Recursively traverse `symbols` from left to right, while
+    // keeping track of a "running segment" (initially empty). Each time a
+    // symbol that has a newline is encountered, finish/collect the running
+    // segment, and start a new one for the remainder of `symbols`.
+
+    // Final case: No symbols remaining
+    Maybe[set[Segment]] get(Segment runningSegment, []) {
+        return just(_ <- runningSegment ? {runningSegment} : {});
+    }
+
+    // Recursive case: At least one symbol remaining
+    Maybe[set[Segment]] get(Segment segment, [Symbol head, *Symbol tail]) {
+        set[Symbol] nested = {s | /Symbol s := head};
+        
+        // If the head contains a non-terminal, then: (1) finish the running
+        // segment; (2) lookup the segments of the non-terminals in the
+        // environment, if any; (3) compute the segments of the tail. Return the
+        // union of 1-3.
+        if (any(s <- nested, isNonTerminalType(s))) {
+
+            list[Maybe[set[Segment]]] sets
+                = [get(segment, [])] // (1)
+                + [env[p] | s <- nested, isNonTerminalType(s), p <- lookup(g, s)] // (2)
+                + [get([], tail)]; // (3)
+            
+            return (sets[0] | union(it, \set) | \set <- sets[1..]);
+        
+        }
+        
+        // If the head doesn't contain a non-terminal, but it has a newline,
+        // then: (1) finish the running segment; (2) compute the segments of the
+        // tail. Return the union of 1-2. Note: the head is ignored and won't be
+        // part of any segment.
+        else if (any(s <- nested, hasNewline(g, s))) {
+            return union(get(segment, []), get([], tail));
+        }
+        
+        // If the head doesn't contain a non-terminal, and if it doesn't have a
+        // newline, then add the head to the running segment and proceed with
+        // the tail.
+        else {
+            return get(segment + head, tail);
+        }
+    }
+
+    return get([], symbols);
+}
+
+@synopsis{
+    Checks if a symbol has a newline character
+}
+
+bool hasNewline(Grammar g, Symbol s) {
+    return any(p <- lookup(g, delabel(s)), hasNewline(g, p));
+}
 
 @synopsis{
     Checks if a production has a newline character
 }
 
-bool hasNewline(Grammar g, prod(_, symbols, _)) {
-    set[Symbol] nonTerminals = {s | /Symbol s := symbols, isNonTerminalType(s)};
-    return any(/r: range(_, _) := symbols, hasNewline(r)) ||
-        any(s <- nonTerminals, Production p <- lookup(g, s), hasNewline(g, p));
+bool hasNewline(Grammar g, Production p) {
+    return hasNewlineByProduction(g)[p];
+}
+
+@memo
+private map[Production, bool] hasNewlineByProduction(Grammar g) {
+    map[Production, bool] ret = (p: false | /p: prod(_, _, _) := g);
+
+    solve (ret) {
+        for (p <- ret, !ret[p]) {
+            set[Symbol] nonTerminals = {s | /Symbol s := p.symbols, isNonTerminalType(s)};
+            ret[p] = ret[p] || any(/r: range(_, _) := p.symbols, hasNewline(r))
+                            || any(s <- nonTerminals, Production child <- lookup(g, s), ret[child]);
+        }
+    }
+
+    return ret;
 }
 
 @synopsis{

From 0c3b83bf54008c05a7cdcf164ead9899a0d3a95b Mon Sep 17 00:00:00 2001
From: Sung-Shik Jongmans <sung-shik.jongmans@swat.engineering>
Date: Fri, 23 Aug 2024 16:20:52 +0200
Subject: [PATCH 05/16] Use segments (instead of terminals) in the generation
 of begin/end patterns

---
 .../src/main/rascal/lang/rascal/grammar/Util.rsc   |  9 ---------
 .../src/main/rascal/lang/textmate/Conversion.rsc   | 14 ++++++++++----
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc
index 7830b76..a7ad3ad 100644
--- a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc
+++ b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc
@@ -31,15 +31,6 @@ bool tryParse(Grammar g, Symbol s, str input, bool allowAmbiguity = false) {
     return false;
 }
 
-@synopsis{
-    Gets the terminals that occur in production `p`, possibly recursively
-    (default: `true`)
-}
-
-set[Symbol] getTerminals(Grammar g, Production p, bool recur = true)
-    = {s | s <- p.symbols, !isNonTerminalType(s)}
-    + {*getTerminals(g, child) | recur, s <- p.symbols, child <- lookup(g, s)};
-
 @synopsis{
     Lookups a list of productions for symbol `s` in grammar `g`, replacing
     formal parameters with actual parameters when needed
diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
index 2bab444..e6b28c9 100644
--- a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
+++ b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
@@ -215,11 +215,17 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
             // Simple case: each unit does have an `end` inner delimiter
             if (_ <- group && all(u <- group, just(_) := u.innerDelimiters.end)) {
 
-                // Compute a list of terminals that need to be consumed between
+                // Compute a list of segments that need to be consumed between
                 // the `begin` delimiter and the `end` delimiters. Each of these
-                // terminals will be converted to a match pattern.
-                list[Symbol] terminals = [*getTerminals(rsc, u.prod) | u <- group];
-                terminals = [s | s <- terminals, s notin begins && s notin ends];
+                // segments will be converted to a match pattern.
+                set[list[Symbol]] segments = {*getSegments(rsc, u.prod) | u <- group};
+
+                list[Symbol] terminals
+                    = [\seq([   *ys   ]) | [x, *ys, z] <- segments, x == begin, z    in ends]
+                    + [\seq([   *ys, z]) | [x, *ys, z] <- segments, x == begin, z notin ends]
+                    + [\seq([x, *ys   ]) | [x, *ys, z] <- segments, x != begin, z    in ends]
+                    + [\seq([x, *ys, z]) | [x, *ys, z] <- segments, x != begin, z notin ends];
+
                 terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
                 terminals = dup(terminals);
                 terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)

From 91b7ca9f98dca90b9b305c43b1107785a97c0ad0 Mon Sep 17 00:00:00 2001
From: Sung-Shik Jongmans <sung-shik.jongmans@swat.engineering>
Date: Fri, 23 Aug 2024 16:21:23 +0200
Subject: [PATCH 06/16] Update generated TextMate grammar for Rascal/Pico

---
 .../syntaxes/pico.tmLanguage.json             |  2 +-
 .../syntaxes/rascal.tmLanguage.json           | 96 ++++++++++++++++++-
 2 files changed, 93 insertions(+), 5 deletions(-)

diff --git a/vscode-extension/syntaxes/pico.tmLanguage.json b/vscode-extension/syntaxes/pico.tmLanguage.json
index 2a57152..dda2f4d 100644
--- a/vscode-extension/syntaxes/pico.tmLanguage.json
+++ b/vscode-extension/syntaxes/pico.tmLanguage.json
@@ -14,7 +14,7 @@
       "end": "(\\\")",
       "patterns": [
         {
-          "match": "((?:\\\")[\\x{01}-\\!\\#-\\x{10FFFF}]*?(?:\\\"))",
+          "match": "([\\x{01}-\\!\\#-\\x{10FFFF}]+?)",
           "captures": {
             "1": {
               "name": "string.quoted.double"
diff --git a/vscode-extension/syntaxes/rascal.tmLanguage.json b/vscode-extension/syntaxes/rascal.tmLanguage.json
index 1216e9b..7f37239 100644
--- a/vscode-extension/syntaxes/rascal.tmLanguage.json
+++ b/vscode-extension/syntaxes/rascal.tmLanguage.json
@@ -221,7 +221,39 @@
       "end": "((?:\\\")|(?:\\<))",
       "patterns": [
         {
-          "match": "((?:(?:(?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])|[\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}]|(?:(?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))|(?:(?:(?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)a[0-7][0-9A-Fa-f])))+?)",
+          "match": "((?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
+        {
+          "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)a[0-7][0-9A-Fa-f])",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -254,7 +286,7 @@
       "end": "((?:\\\")|(?:\\<))",
       "patterns": [
         {
-          "match": "((?:(?:(?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])|[\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}]|(?:(?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))|(?:(?:(?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)a[0-7][0-9A-Fa-f])))+?)",
+          "match": "((?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -262,7 +294,31 @@
           }
         },
         {
-          "match": "((?:(?:(?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])|[\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}]|(?:(?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))|(?:(?:(?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)a[0-7][0-9A-Fa-f])))+?)",
+          "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)a[0-7][0-9A-Fa-f])",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -477,7 +533,39 @@
       "end": "(\\')",
       "patterns": [
         {
-          "match": "((?:(?:(?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])|[\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}]|(?:(?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))|(?:(?:(?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)a[0-7][0-9A-Fa-f])))+?)",
+          "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?)",
+          "captures": {
+            "1": {
+              "name": "string.quoted.single"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
+          "captures": {
+            "1": {
+              "name": "string.quoted.single"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])",
+          "captures": {
+            "1": {
+              "name": "string.quoted.single"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)a[0-7][0-9A-Fa-f])",
+          "captures": {
+            "1": {
+              "name": "string.quoted.single"
+            }
+          }
+        },
+        {
+          "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
           "captures": {
             "1": {
               "name": "string.quoted.single"

From 7951a7ed3b1de648618ed14a8bc0e2ab6da92392 Mon Sep 17 00:00:00 2001
From: Sung-Shik Jongmans <sung-shik.jongmans@swat.engineering>
Date: Mon, 2 Sep 2024 11:27:16 +0200
Subject: [PATCH 07/16] Add a few clarifying comments

---
 .../src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc | 5 ++---
 .../src/main/rascal/lang/textmate/Conversion.rsc             | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc
index c0f5d6d..e864dda 100644
--- a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc
+++ b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc
@@ -79,13 +79,12 @@ private Maybe[set[Segment]] getSegmentsWithEnvironment(
                 + [get([], tail)]; // (3)
             
             return (sets[0] | union(it, \set) | \set <- sets[1..]);
-        
         }
         
         // If the head doesn't contain a non-terminal, but it has a newline,
         // then: (1) finish the running segment; (2) compute the segments of the
-        // tail. Return the union of 1-2. Note: the head is ignored and won't be
-        // part of any segment.
+        // tail. Return the union of 1-2. Note: the head, as it has a newline,
+        // is ignored and won't be part of any segment.
         else if (any(s <- nested, hasNewline(g, s))) {
             return union(get(segment, []), get([], tail));
         }
diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
index e6b28c9..1d6922f 100644
--- a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
+++ b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
@@ -220,7 +220,7 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
                 // segments will be converted to a match pattern.
                 set[list[Symbol]] segments = {*getSegments(rsc, u.prod) | u <- group};
 
-                list[Symbol] terminals
+                list[Symbol] terminals // Remove `begin` and `end` from the segments
                     = [\seq([   *ys   ]) | [x, *ys, z] <- segments, x == begin, z    in ends]
                     + [\seq([   *ys, z]) | [x, *ys, z] <- segments, x == begin, z notin ends]
                     + [\seq([x, *ys   ]) | [x, *ys, z] <- segments, x != begin, z    in ends]

From d805b5a11db5ea4b95c13a64a697243578059f35 Mon Sep 17 00:00:00 2001
From: Sung-Shik Jongmans <sung-shik.jongmans@swat.engineering>
Date: Mon, 2 Sep 2024 12:01:20 +0200
Subject: [PATCH 08/16] Add tests

---
 .../textmate/conversiontests/PicoWithCategories.rsc |  8 ++++++--
 .../conversiontests/PicoWithCategories.test         | 13 ++++++++++---
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc
index 36db90d..9a81068 100644
--- a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc
+++ b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc
@@ -44,7 +44,11 @@ syntax Expression
            
 lexical Id = ([a-z][a-z0-9]*) !>> [a-z0-9] \ Keyword;
 lexical Natural = [0-9]+ !>> [0-9];
-lexical String = "\"" ![\"]* "\"";
+lexical String = "\"" Char* "\"";
+
+lexical Char
+    = ![\"]
+    | "\\" [\"];
 
 keyword Keyword
     = "begin"
@@ -70,7 +74,7 @@ lexical WhitespaceAndComment
 Grammar rsc = preprocess(grammar(#Program));
 
 list[ConversionUnit] units = [
-    unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":=")})],{}), false, <nothing(),nothing()>, <nothing(),nothing()>),
+    unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":="),lit("\\")})],{}), false, <nothing(),nothing()>, <nothing(),nothing()>),
     unit(rsc, prod(label("natural",sort("Type")),[lit("natural")],{\tag("category"("storage.type"))}), false, <just(lit(":")),just(lit(";"))>, <nothing(),nothing()>),
     unit(rsc, prod(label("nil",sort("Type")),[lit("nil-type")],{\tag("category"("storage.type"))}), false, <just(lit(":")),just(lit(";"))>, <just(lit("nil-type")),just(lit("nil-type"))>),
     unit(rsc, prod(label("string",sort("Type")),[lit("string")],{\tag("category"("storage.type"))}), false, <just(lit(":")),just(lit(";"))>, <nothing(),nothing()>),
diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test
index 8a4aabe..a5a6459 100644
--- a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test
+++ b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test
@@ -48,9 +48,16 @@
 #      ^      -string.quoted.double
 #       ^^^^^ string.quoted.double
 
-  "foo\" bar
-# ^^^^^^     string.quoted.double
-#       ^^^^ -string.quoted.double
+  "foo" bar
+# ^^^^^     string.quoted.double
+#      ^^^^ -string.quoted.double
+
+  "foo\" bar"
+# ^^^^^^^^^^^ string.quoted.double
+
+  "foo\\" bar
+# ^^^^^^^     string.quoted.double
+#        ^^^^ -string.quoted.double
 
   "foo
 # ^^^^ string.quoted.double

From 40a4712280ccd47e0d9c678f927163477c88ae6b Mon Sep 17 00:00:00 2001
From: Sung-Shik Jongmans <sung-shik.jongmans@swat.engineering>
Date: Mon, 2 Sep 2024 14:23:54 +0200
Subject: [PATCH 09/16] Add another test and fix a bug to make the test pass

---
 .../lang/rascal/grammar/analyze/Newlines.rsc  | 53 ++++++++++++++-----
 .../main/rascal/lang/textmate/Conversion.rsc  | 24 ++++++---
 .../conversiontests/PicoWithCategories.test   |  7 +++
 3 files changed, 62 insertions(+), 22 deletions(-)

diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc
index e864dda..b4e0d0b 100644
--- a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc
+++ b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc
@@ -13,10 +13,15 @@ import lang::rascal::grammar::Util;
 import util::MaybeUtil;
 
 @synopsis{
-    Representation of a *newline-free* segment of symbols
+    Representation of a *newline-free* segment of symbols. A segment is
+    *initial* when it occurs first in a production/list of symbols; it is
+    *final* when it occurs last.
 }
 
-alias Segment = list[Symbol];
+data Segment = segment(
+    list[Symbol] symbols,
+    bool initial = false,
+    bool final = false);
 
 @synopsis{
     Gets the (newline-free) segments of a production/list of symbols in grammar
@@ -58,26 +63,44 @@ private Maybe[set[Segment]] getSegmentsWithEnvironment(
     // symbol that has a newline is encountered, finish/collect the running
     // segment, and start a new one for the remainder of `symbols`.
 
-    // Final case: No symbols remaining
-    Maybe[set[Segment]] get(Segment runningSegment, []) {
-        return just(_ <- runningSegment ? {runningSegment} : {});
+    // Base case: No symbols remaining
+    Maybe[set[Segment]] get(Segment running, [], bool final = true) {
+        return just(_ <- running.symbols ? {running[final = final]} : {});
     }
 
     // Recursive case: At least one symbol remaining
-    Maybe[set[Segment]] get(Segment segment, [Symbol head, *Symbol tail]) {
+    Maybe[set[Segment]] get(Segment running, [Symbol head, *Symbol tail]) {
         set[Symbol] nested = {s | /Symbol s := head};
+
+        Maybe[set[Segment]] finished = get(running, [], final = tail == []);
         
         // If the head contains a non-terminal, then: (1) finish the running
         // segment; (2) lookup the segments of the non-terminals in the
         // environment, if any; (3) compute the segments of the tail. Return the
         // union of 1-3.
         if (any(s <- nested, isNonTerminalType(s))) {
+            list[Maybe[set[Segment]]] sets = [];
+
+            // (1)
+            sets += finished;
+
+            // (2)
+            sets += for (s <- nested, isNonTerminalType(s), p <- lookup(g, s)) {
+
+                bool isInitial(Segment seg)
+                    = seg.initial && running.initial && running.symbols == [];
+                bool isFinal(Segment seg)
+                    = seg.final && tail == [];
+                Segment update(Segment seg)
+                    = seg[initial = isInitial(seg)][final = isFinal(seg)];
+                
+                append just(segs) := env[p] ? just({update(seg) | seg <- segs}) : nothing();
+            }
+
+            // (3)
+            sets += get(segment([]), tail);
 
-            list[Maybe[set[Segment]]] sets
-                = [get(segment, [])] // (1)
-                + [env[p] | s <- nested, isNonTerminalType(s), p <- lookup(g, s)] // (2)
-                + [get([], tail)]; // (3)
-            
+            // Return union
             return (sets[0] | union(it, \set) | \set <- sets[1..]);
         }
         
@@ -86,18 +109,20 @@ private Maybe[set[Segment]] getSegmentsWithEnvironment(
         // tail. Return the union of 1-2. Note: the head, as it has a newline,
         // is ignored and won't be part of any segment.
         else if (any(s <- nested, hasNewline(g, s))) {
-            return union(get(segment, []), get([], tail));
+            return union(finished, get(segment([]), tail));
         }
         
         // If the head doesn't contain a non-terminal, and if it doesn't have a
         // newline, then add the head to the running segment and proceed with
         // the tail.
         else {
-            return get(segment + head, tail);
+            Segment old = running;
+            Segment new = old[symbols = old.symbols + head]; 
+            return get(new, tail);
         }
     }
 
-    return get([], symbols);
+    return get(segment([], initial = true), symbols);
 }
 
 @synopsis{
diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
index 1d6922f..81b19b0 100644
--- a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
+++ b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
@@ -218,14 +218,22 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
                 // Compute a list of segments that need to be consumed between
                 // the `begin` delimiter and the `end` delimiters. Each of these
                 // segments will be converted to a match pattern.
-                set[list[Symbol]] segments = {*getSegments(rsc, u.prod) | u <- group};
-
-                list[Symbol] terminals // Remove `begin` and `end` from the segments
-                    = [\seq([   *ys   ]) | [x, *ys, z] <- segments, x == begin, z    in ends]
-                    + [\seq([   *ys, z]) | [x, *ys, z] <- segments, x == begin, z notin ends]
-                    + [\seq([x, *ys   ]) | [x, *ys, z] <- segments, x != begin, z    in ends]
-                    + [\seq([x, *ys, z]) | [x, *ys, z] <- segments, x != begin, z notin ends];
-
+                list[Segment] segments = [*getSegments(rsc, u.prod) | u <- group];
+                
+                Segment removeBeginEnd(Segment seg) {
+                    list[Symbol] symbols = seg.symbols;
+                    if (seg.initial, _ <- symbols, symbols[0] == begin) {
+                        symbols = symbols[1..];
+                    }
+                    if (seg.final, _ <- symbols, symbols[-1] in ends) {
+                        symbols = symbols[..-1];
+                    }
+                    
+                    return seg[symbols = symbols];
+                }
+
+                list[Symbol] terminals = [\seq(removeBeginEnd(seg).symbols) | seg <- segments];
+                terminals = [s | s <- terminals, [] != s.symbols];
                 terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
                 terminals = dup(terminals);
                 terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)
diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test
index a5a6459..df5915e 100644
--- a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test
+++ b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test
@@ -62,4 +62,11 @@
   "foo
 # ^^^^ string.quoted.double
   bar"
+# ^^^^ string.quoted.double
+
+  "foo
+# ^^^^ string.quoted.double
+  \"
+# ^^   string.quoted.double
+  bar"
 # ^^^^ string.quoted.double
\ No newline at end of file

From 49547a4622dce01720eafa09d5571e3c5bb0afc6 Mon Sep 17 00:00:00 2001
From: Sung-Shik Jongmans <sung-shik.jongmans@swat.engineering>
Date: Tue, 3 Sep 2024 08:56:24 +0200
Subject: [PATCH 10/16] Update generated TextMate grammar for Rascal/Pico

---
 .../syntaxes/pico.tmLanguage.json             | 14 ++++++--
 .../syntaxes/rascal.tmLanguage.json           | 32 ++++++++++++++++---
 2 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/vscode-extension/syntaxes/pico.tmLanguage.json b/vscode-extension/syntaxes/pico.tmLanguage.json
index dda2f4d..a489f46 100644
--- a/vscode-extension/syntaxes/pico.tmLanguage.json
+++ b/vscode-extension/syntaxes/pico.tmLanguage.json
@@ -14,7 +14,15 @@
       "end": "(\\\")",
       "patterns": [
         {
-          "match": "([\\x{01}-\\!\\#-\\x{10FFFF}]+?)",
+          "match": "((?:\\\\)(?:\\\"))",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
+        {
+          "match": "([\\x{01}-\\!\\#-\\x{10FFFF}])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -52,7 +60,7 @@
       }
     },
     "/inner/single/$delimiters": {
-      "match": "(?:(?:\\-)|(?:\\,)|(?:\\))|(?:\\()|(?:\\+)|(?:\\|\\|)|(?:\\:\\=))",
+      "match": "(?:(?:\\-)|(?:\\,)|(?:\\))|(?:\\()|(?:\\+)|(?:\\|\\|)|(?:\\:\\=)|(?:\\\\))",
       "name": "/inner/single/$delimiters",
       "captures": {}
     },
@@ -189,7 +197,7 @@
       }
     },
     "/inner/single/expression.strcon": {
-      "match": "((?<=(?:[\\t-\\n\\r\\x{20}\\%]|(?:(?:^))))(?:(?:\\\")[\\x{01}-\\!\\#-\\x{10FFFF}]*?(?:\\\")))",
+      "match": "((?<=(?:[\\t-\\n\\r\\x{20}\\%]|(?:(?:^))))(?:(?:\\\")(?:(?:(?:\\\\)(?:\\\"))|[\\x{01}-\\!\\#-\\x{10FFFF}])*?(?:\\\")))",
       "name": "/inner/single/expression.strcon",
       "captures": {
         "1": {
diff --git a/vscode-extension/syntaxes/rascal.tmLanguage.json b/vscode-extension/syntaxes/rascal.tmLanguage.json
index 7f37239..4095f9d 100644
--- a/vscode-extension/syntaxes/rascal.tmLanguage.json
+++ b/vscode-extension/syntaxes/rascal.tmLanguage.json
@@ -229,7 +229,7 @@
           }
         },
         {
-          "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))",
+          "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -244,6 +244,14 @@
             }
           }
         },
+        {
+          "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
         {
           "match": "((?:\\\\)a[0-7][0-9A-Fa-f])",
           "captures": {
@@ -294,7 +302,7 @@
           }
         },
         {
-          "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))",
+          "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -309,6 +317,14 @@
             }
           }
         },
+        {
+          "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))",
+          "captures": {
+            "1": {
+              "name": "string.quoted.double"
+            }
+          }
+        },
         {
           "match": "((?:\\\\)a[0-7][0-9A-Fa-f])",
           "captures": {
@@ -533,7 +549,7 @@
       "end": "(\\')",
       "patterns": [
         {
-          "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?)",
+          "match": "((?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
           "captures": {
             "1": {
               "name": "string.quoted.single"
@@ -541,7 +557,7 @@
           }
         },
         {
-          "match": "((?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
+          "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])",
           "captures": {
             "1": {
               "name": "string.quoted.single"
@@ -556,6 +572,14 @@
             }
           }
         },
+        {
+          "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))",
+          "captures": {
+            "1": {
+              "name": "string.quoted.single"
+            }
+          }
+        },
         {
           "match": "((?:\\\\)a[0-7][0-9A-Fa-f])",
           "captures": {

From 05eb324fba5703fcbe54078bf977ec0ffb10b856 Mon Sep 17 00:00:00 2001
From: Sung-Shik Jongmans <sung-shik.jongmans@swat.engineering>
Date: Fri, 6 Sep 2024 11:41:10 +0200
Subject: [PATCH 11/16] Add utility functions to compute the expected min/max
 length of terminals

---
 .../lang/rascal/grammar/analyze/Symbols.rsc   | 60 ++++++++++++++++++-
 1 file changed, 57 insertions(+), 3 deletions(-)

diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc
index 320252e..27ba177 100644
--- a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc
+++ b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc
@@ -17,6 +17,8 @@ module lang::rascal::grammar::analyze::Symbols
 
 import Grammar;
 import ParseTree;
+import String;
+import util::Math;
 import util::Maybe;
 
 import lang::rascal::grammar::Util;
@@ -56,9 +58,9 @@ private map[Symbol, Maybe[set[Symbol]]] firstBySymbol(Grammar g, bool(Symbol) pr
 
     Maybe[set[Symbol]] firstOf([])
         = just({});
-    Maybe[set[Symbol]] firstOf([h, *t])
+    Maybe[set[Symbol]] firstOf([Symbol h, *Symbol t])
         = \set: just({\empty(), *_}) := ret[delabel(h)]
-        ? union(\set, firstOf(t))
+        ? util::MaybeUtil::union(\set, firstOf(t))
         : ret[delabel(h)];
 
     solve (ret) {
@@ -118,4 +120,56 @@ private map[Symbol, Maybe[set[Symbol]]] followBySymbol(Grammar g, bool(Symbol) p
 }
 
 bool isTerminal(Symbol s)
-    = !isNonTerminalType(s);
\ No newline at end of file
+    = !isNonTerminalType(s);
+
+@synopsis{
+    Sorts list of terminals `symbols` by minimum length (in ascending order)
+}
+
+list[Symbol] sortByMinimumLength(list[Symbol] symbols) {
+    bool less(Symbol s1, Symbol s2) = length(s1).min < length(s2).min;
+    return sort(symbols, less);
+}
+
+@synopsis{
+    Representation of the minimum length and the maximum length of the text
+    produced by a symbol. If `max` is `nothing()`, then the text produced is
+    statically unbounded.
+}
+
+alias Range = tuple[int min, Maybe[int] max];
+
+private Range ZERO = <0, just(0)>;
+private Range seq(Range r1, Range r2) = <r1.min + r2.min, add(r1.max, r2.max)>;
+private Range alt(Range r1, Range r2) = <min(r1.min, r2.min), max(r1.max, r2.max)>;
+
+private Maybe[int] add(just(int i), just(int j)) = just(i + j);
+private default Maybe[int] add(Maybe[int] _, Maybe[int] _) = nothing();
+
+private Maybe[int] max(just(int i), just(int j)) = just(max(i, j));
+private default Maybe[int] max(Maybe[int] _, Maybe[int] _) = nothing();
+
+@synopsis{
+    Computes the length of a terminal symbol as a range
+}
+
+Range length(\lit(string))   = <size(string), just(size(string))>;
+Range length(\cilit(string)) = <size(string), just(size(string))>;
+Range length(\char-class(_)) = <1, just(1)>;
+
+Range length(\empty())                   = ZERO;
+Range length(\opt(symbol))               = length(symbol)[min = 0];
+Range length(\iter(symbol))              = length(symbol)[max = issue2007];
+Range length(\iter-star(symbol))         = <0, max: just(0) := length(symbol).max ? max : nothing()>;
+Range length(\iter-seps(symbol, _))      = length(symbol)[max = issue2007];
+Range length(\iter-star-seps(symbol, _)) = <0, max: just(0) := length(symbol).max ? max : nothing()>;
+Range length(\alt(alternatives))         = {Symbol first, *Symbol rest} := alternatives
+                                         ? (length(first) | alt(it, length(s)) | s <- rest)
+                                         : ZERO;
+Range length(\seq(symbols))              = (ZERO | seq(it, length(s)) | s <- symbols);
+
+Range length(\conditional(symbol, _)) = length(symbol);
+
+// TODO: Remove this workaround when issue #2007 is fixed:
+//   - https://github.com/usethesource/rascal/issues/2007
+private Maybe[int] issue2007 = nothing();
\ No newline at end of file

From e869ffd9aa91ad0f306fdaf6f56e7e153a7b3a69 Mon Sep 17 00:00:00 2001
From: Sung-Shik Jongmans <sung-shik.jongmans@swat.engineering>
Date: Fri, 6 Sep 2024 11:47:08 +0200
Subject: [PATCH 12/16] Add sorting for terminals-to-gobble (based on segments)

---
 .../src/main/rascal/lang/textmate/Conversion.rsc               | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
index 81b19b0..03ea76f 100644
--- a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
+++ b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
@@ -16,6 +16,7 @@ import lang::rascal::grammar::Util;
 import lang::rascal::grammar::analyze::Delimiters;
 import lang::rascal::grammar::analyze::Dependencies;
 import lang::rascal::grammar::analyze::Newlines;
+import lang::rascal::grammar::analyze::Symbols;
 import lang::textmate::ConversionConstants;
 import lang::textmate::ConversionUnit;
 import lang::textmate::Grammar;
@@ -236,6 +237,8 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
                 terminals = [s | s <- terminals, [] != s.symbols];
                 terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
                 terminals = dup(terminals);
+                terminals = sortByMinimumLength(terminals); // Small symbols first
+                terminals = reverse(terminals); // Large symbols first
                 terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)
                 
                 TmRule r = toTmRule(

From 5603cc861ba6c12a532f91e63773a9e0580c725a Mon Sep 17 00:00:00 2001
From: Sung-Shik Jongmans <sung-shik.jongmans@swat.engineering>
Date: Fri, 6 Sep 2024 11:49:11 +0200
Subject: [PATCH 13/16] Move `removeBeginEnd` to a separate private function to
 improve readability

---
 .../main/rascal/lang/textmate/Conversion.rsc  | 31 ++++++++++---------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
index 03ea76f..f884c2a 100644
--- a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
+++ b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
@@ -219,21 +219,10 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
                 // Compute a list of segments that need to be consumed between
                 // the `begin` delimiter and the `end` delimiters. Each of these
                 // segments will be converted to a match pattern.
-                list[Segment] segments = [*getSegments(rsc, u.prod) | u <- group];
-                
-                Segment removeBeginEnd(Segment seg) {
-                    list[Symbol] symbols = seg.symbols;
-                    if (seg.initial, _ <- symbols, symbols[0] == begin) {
-                        symbols = symbols[1..];
-                    }
-                    if (seg.final, _ <- symbols, symbols[-1] in ends) {
-                        symbols = symbols[..-1];
-                    }
-                    
-                    return seg[symbols = symbols];
-                }
-
-                list[Symbol] terminals = [\seq(removeBeginEnd(seg).symbols) | seg <- segments];
+                set[Segment] segs = {*getSegments(rsc, u.prod) | u <- group};
+                segs = {removeBeginEnd(seg, begins, ends) | seg <- segs};
+
+                list[Symbol] terminals = [\seq(seg.symbols) | seg <- segs];
                 terminals = [s | s <- terminals, [] != s.symbols];
                 terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
                 terminals = dup(terminals);
@@ -305,6 +294,18 @@ private list[ConversionUnit] addOuterRules(list[ConversionUnit] units) {
     // precision than a unit-driven approach; I suspect it might.
 }
 
+private Segment removeBeginEnd(Segment seg, set[Symbol] begins, set[Symbol] ends) {
+    list[Symbol] symbols = seg.symbols;
+    if (seg.initial, _ <- symbols, symbols[0] in begins) {
+        symbols = symbols[1..];
+    }
+    if (seg.final, _ <- symbols, symbols[-1] in ends) {
+        symbols = symbols[..-1];
+    }
+    
+    return seg[symbols = symbols];
+}
+
 // TODO: This function could be moved to a separate, generic module
 private list[&T] dupLast(list[&T] l)
     = reverse(dup(reverse(l))); // TODO: Optimize/avoid `reverse`-ing?

From dafaa08014c7626a0b0a6ff5604b9e8591c86453 Mon Sep 17 00:00:00 2001
From: Sung-Shik Jongmans <sung-shik.jongmans@swat.engineering>
Date: Fri, 6 Sep 2024 11:50:48 +0200
Subject: [PATCH 14/16] Fix small issue in the escaping rules for strings in
 `PicoWithCategories`

---
 .../textmate/conversiontests/PicoWithCategories.rsc  |  4 ++--
 .../textmate/conversiontests/PicoWithCategories.test | 12 +++++++++++-
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc
index 9a81068..6737a91 100644
--- a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc
+++ b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc
@@ -47,8 +47,8 @@ lexical Natural = [0-9]+ !>> [0-9];
 lexical String = "\"" Char* "\"";
 
 lexical Char
-    = ![\"]
-    | "\\" [\"];
+    = ![\\\"]
+    | "\\" [\\\"];
 
 keyword Keyword
     = "begin"
diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test
index df5915e..8b33821 100644
--- a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test
+++ b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test
@@ -64,9 +64,19 @@
   bar"
 # ^^^^ string.quoted.double
 
+  "foo\"
+# ^^^^^^ string.quoted.double
+  bar"
+# ^^^^ string.quoted.double
+
   "foo
 # ^^^^ string.quoted.double
   \"
 # ^^   string.quoted.double
   bar"
-# ^^^^ string.quoted.double
\ No newline at end of file
+# ^^^^ string.quoted.double
+
+  "foo
+# ^^^^ string.quoted.double
+  \"bar"
+# ^^^^^^ string.quoted.double
\ No newline at end of file

From ed8b6b2952b23d7ad7f07636f8fe71b16d61a7e3 Mon Sep 17 00:00:00 2001
From: Sung-Shik Jongmans <sung-shik.jongmans@swat.engineering>
Date: Fri, 6 Sep 2024 12:25:40 +0200
Subject: [PATCH 15/16] Update generated TextMate grammar for Rascal/Pico

---
 .../syntaxes/pico.tmLanguage.json             |  6 ++---
 .../syntaxes/rascal.tmLanguage.json           | 26 +++++++++----------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/vscode-extension/syntaxes/pico.tmLanguage.json b/vscode-extension/syntaxes/pico.tmLanguage.json
index a489f46..527e06e 100644
--- a/vscode-extension/syntaxes/pico.tmLanguage.json
+++ b/vscode-extension/syntaxes/pico.tmLanguage.json
@@ -14,7 +14,7 @@
       "end": "(\\\")",
       "patterns": [
         {
-          "match": "((?:\\\\)(?:\\\"))",
+          "match": "((?:\\\\)[\\\"\\\\])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -22,7 +22,7 @@
           }
         },
         {
-          "match": "([\\x{01}-\\!\\#-\\x{10FFFF}])",
+          "match": "([\\x{01}-\\!\\#-\\[\\]-\\x{10FFFF}])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -197,7 +197,7 @@
       }
     },
     "/inner/single/expression.strcon": {
-      "match": "((?<=(?:[\\t-\\n\\r\\x{20}\\%]|(?:(?:^))))(?:(?:\\\")(?:(?:(?:\\\\)(?:\\\"))|[\\x{01}-\\!\\#-\\x{10FFFF}])*?(?:\\\")))",
+      "match": "((?<=(?:[\\t-\\n\\r\\x{20}\\%]|(?:(?:^))))(?:(?:\\\")(?:(?:(?:\\\\)[\\\"\\\\])|[\\x{01}-\\!\\#-\\[\\]-\\x{10FFFF}])*?(?:\\\")))",
       "name": "/inner/single/expression.strcon",
       "captures": {
         "1": {
diff --git a/vscode-extension/syntaxes/rascal.tmLanguage.json b/vscode-extension/syntaxes/rascal.tmLanguage.json
index 4095f9d..c462663 100644
--- a/vscode-extension/syntaxes/rascal.tmLanguage.json
+++ b/vscode-extension/syntaxes/rascal.tmLanguage.json
@@ -229,7 +229,7 @@
           }
         },
         {
-          "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])",
+          "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -237,7 +237,7 @@
           }
         },
         {
-          "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])",
+          "match": "((?:\\\\)a[0-7][0-9A-Fa-f])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -253,7 +253,7 @@
           }
         },
         {
-          "match": "((?:\\\\)a[0-7][0-9A-Fa-f])",
+          "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -261,7 +261,7 @@
           }
         },
         {
-          "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
+          "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -302,7 +302,7 @@
           }
         },
         {
-          "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])",
+          "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -310,7 +310,7 @@
           }
         },
         {
-          "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])",
+          "match": "((?:\\\\)a[0-7][0-9A-Fa-f])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -318,7 +318,7 @@
           }
         },
         {
-          "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))",
+          "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -326,7 +326,7 @@
           }
         },
         {
-          "match": "((?:\\\\)a[0-7][0-9A-Fa-f])",
+          "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -334,7 +334,7 @@
           }
         },
         {
-          "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
+          "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])",
           "captures": {
             "1": {
               "name": "string.quoted.double"
@@ -557,7 +557,7 @@
           }
         },
         {
-          "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])",
+          "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
           "captures": {
             "1": {
               "name": "string.quoted.single"
@@ -565,7 +565,7 @@
           }
         },
         {
-          "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])",
+          "match": "((?:\\\\)a[0-7][0-9A-Fa-f])",
           "captures": {
             "1": {
               "name": "string.quoted.single"
@@ -581,7 +581,7 @@
           }
         },
         {
-          "match": "((?:\\\\)a[0-7][0-9A-Fa-f])",
+          "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])",
           "captures": {
             "1": {
               "name": "string.quoted.single"
@@ -589,7 +589,7 @@
           }
         },
         {
-          "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])",
+          "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])",
           "captures": {
             "1": {
               "name": "string.quoted.single"

From 27c26ef2e69fdd834cd1de85d49a99e642a64868 Mon Sep 17 00:00:00 2001
From: Sung-Shik Jongmans <sung-shik.jongmans@swat.engineering>
Date: Fri, 6 Sep 2024 15:01:24 +0200
Subject: [PATCH 16/16] Improve documentation

---
 rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc b/rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc
index bd34f33..d5244e9 100644
--- a/rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc
+++ b/rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc
@@ -17,8 +17,8 @@ set[&T] unmaybe(Maybe[set[&T]] _: just(set[&T] \set))
     = \set;
 
 @synopsis{
-    Returns just the union of the sets of two `Maybe` values when present.
-    Returns nothing if absent.
+    Returns just the union of the sets of two `Maybe` values when both are
+    present. Returns nothing if at least one is absent.
 }
 
 Maybe[set[&T]] union(just(set[&T] set1), just(set[&T] set2))