Merge pull request #15 from SWAT-engineering/identify-newline-separated-segments

Identify newline separated segments
SWAT-engineering · Sep 6, 2024 · 17df6e6 · 17df6e6
2 parents c7ed45e + 27c26ef
commit 17df6e6
Show file tree

Hide file tree

Showing 9 changed files with 417 additions and 46 deletions.
diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc
@@ -31,15 +31,6 @@ bool tryParse(Grammar g, Symbol s, str input, bool allowAmbiguity = false) {
     return false;
 }
 
-@synopsis{
-    Gets the terminals that occur in production `p`, possibly recursively
-    (default: `true`)
-}
-
-set[Symbol] getTerminals(Grammar g, Production p, bool recur = true)
-    = {s | s <- p.symbols, !isNonTerminalType(s)}
-    + {*getTerminals(g, child) | recur, s <- p.symbols, child <- lookup(g, s)};
-
 @synopsis{
     Looks up a list of productions for symbol `s` in grammar `g`, replacing
     formal parameters with actual parameters when needed
@@ -84,21 +75,26 @@ Symbol expand(\iter-star-seps(symbol, separators))
     Removes the label from symbol `s`, if any
 }
 
-Symbol delabel(label(_, Symbol s)) = s;
-default Symbol delabel(Symbol s)   = s;
+// Recurses on the labelled symbol, so nested labels are removed as well
+Symbol delabel(\label(_, Symbol s)) = delabel(s);
+// Any other symbol is returned unchanged
+default Symbol delabel(Symbol s)    = s;
 
 @synopsis{
     Removes operators `?` and `*` from symbol `s`, if any
 }
 
-Symbol destar(label(name, symbol))
+Symbol destar(\label(name, symbol))
     = label(name, destar(symbol));
+
+// `?` is dropped; `*` becomes `+` (with or without separators)
 Symbol destar(\opt(symbol))
     = destar(symbol);
 Symbol destar(\iter-star(symbol))
     = \iter(destar(symbol));
 Symbol destar(\iter-star-seps(symbol, separators))
     = \iter-seps(destar(symbol), separators);
+// Only singleton sequences/alternatives are rewritten; all other symbols
+// (including longer sequences/alternatives) hit the default case below
+Symbol destar(\seq([symbol]))
+    = \seq([destar(symbol)]);
+Symbol destar(\alt({symbol}))
+    = \alt({destar(symbol)});
 
 default Symbol destar(Symbol s) = s;
 

diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc
@@ -7,17 +7,153 @@ module lang::rascal::grammar::analyze::Newlines
 import Grammar;
 import ParseTree;
 import String;
+import util::Maybe;
 
 import lang::rascal::grammar::Util;
+import util::MaybeUtil;
+
+@synopsis{
+    Representation of a *newline-free* segment of symbols. A segment is
+    *initial* when it occurs first in a production/list of symbols; it is
+    *final* when it occurs last.
+}
+
+data Segment = segment(
+    // The symbols of the segment, in order of occurrence
+    list[Symbol] symbols,
+    // Whether the segment occurs first in a production/list of symbols
+    // (default: `false`)
+    bool initial = false,
+    // Whether the segment occurs last in a production/list of symbols
+    // (default: `false`)
+    bool final = false);
+
+@synopsis{
+    Gets the (newline-free) segments of a production/list of symbols in grammar
+    `g`, separated by symbols that have a newline (not part of any segment),
+    recursively for non-terminals. For instance, the segments of
+    `[lit("foo"), lit("bar"), lit("\n"), lit("baz")]` are:
+      - `[lit("foo"), lit("bar")]`;
+      - `[lit("baz")]`.
+}
+
+// Segments of production `p`: looked up in the (memoized) per-production map
+// of grammar `g`; a `nothing()` entry is converted by `unmaybe`.
+set[Segment] getSegments(Grammar g, Production p)
+    = unmaybe(getSegmentsByProduction(g)[p]);
+
+// Segments of an arbitrary list of symbols: computed directly, using the
+// per-production map of grammar `g` as the environment for non-terminals.
+set[Segment] getSegments(Grammar g, list[Symbol] symbols)
+    = unmaybe(getSegmentsWithEnvironment(g, symbols, getSegmentsByProduction(g)));
+
+@memo
+private map[Production, Maybe[set[Segment]]] getSegmentsByProduction(Grammar g) {
+    // Start with `nothing()` for every production that occurs in `g`
+    map[Production, Maybe[set[Segment]]] ret = (p : nothing() | /p: prod(_, _, _) := g);
+
+    // Repeat until `ret` no longer changes. In each round, only productions
+    // that are still `nothing()` are (re)computed, using the current `ret` as
+    // the environment to look up the segments of non-terminals.
+    solve (ret) {
+        for (p <- ret, nothing() == ret[p]) {
+            ret[p] = getSegmentsWithEnvironment(g, p.symbols, ret);
+        }
+    }
+
+    return ret;
+}
+
+private Maybe[set[Segment]] getSegmentsWithEnvironment(
+        Grammar g, list[Symbol] symbols, 
+        map[Production, Maybe[set[Segment]]] env) {
+
+    // Computes the segments of `symbols`. Segments of non-terminals are not
+    // computed here, but looked up (per production) in `env`.
+    // NOTE(review): the result is presumably `nothing()` when a needed entry
+    // of `env` is still `nothing()`; this depends on `union` from
+    // `util::MaybeUtil`, which is not visible here -- confirm.
+
+    // General idea: Recursively traverse `symbols` from left to right, while
+    // keeping track of a "running segment" (initially empty). Each time a
+    // symbol that has a newline is encountered, finish/collect the running
+    // segment, and start a new one for the remainder of `symbols`.
+
+    // Base case: No symbols remaining
+    // (an empty running segment is dropped instead of collected)
+    Maybe[set[Segment]] get(Segment running, [], bool final = true) {
+        return just(_ <- running.symbols ? {running[final = final]} : {});
+    }
+
+    // Recursive case: At least one symbol remaining
+    Maybe[set[Segment]] get(Segment running, [Symbol head, *Symbol tail]) {
+        // All symbols that occur in the head (including the head itself)
+        set[Symbol] nested = {s | /Symbol s := head};
+
+        // The running segment, finished *before* the head; it is marked final
+        // only when no symbols follow the head
+        Maybe[set[Segment]] finished = get(running, [], final = tail == []);
+        
+        // If the head contains a non-terminal, then: (1) finish the running
+        // segment; (2) lookup the segments of the non-terminals in the
+        // environment, if any; (3) compute the segments of the tail. Return the
+        // union of 1-3.
+        if (any(s <- nested, isNonTerminalType(s))) {
+            list[Maybe[set[Segment]]] sets = [];
+
+            // (1)
+            sets += finished;
+
+            // (2)
+            sets += for (s <- nested, isNonTerminalType(s), p <- lookup(g, s)) {
+
+                // A looked-up segment stays initial only when the running
+                // segment is initial and still empty (i.e., nothing precedes
+                // the head); it stays final only when nothing follows the head
+                bool isInitial(Segment seg)
+                    = seg.initial && running.initial && running.symbols == [];
+                bool isFinal(Segment seg)
+                    = seg.final && tail == [];
+                Segment update(Segment seg)
+                    = seg[initial = isInitial(seg)][final = isFinal(seg)];
+                
+                // Propagate `nothing()` when `env` has no segments for `p` yet
+                append just(segs) := env[p] ? just({update(seg) | seg <- segs}) : nothing();
+            }
+
+            // (3)
+            sets += get(segment([]), tail);
+
+            // Return union
+            return (sets[0] | union(it, \set) | \set <- sets[1..]);
+        }
+        
+        // If the head doesn't contain a non-terminal, but it has a newline,
+        // then: (1) finish the running segment; (2) compute the segments of the
+        // tail. Return the union of 1-2. Note: the head, as it has a newline,
+        // is ignored and won't be part of any segment.
+        else if (any(s <- nested, hasNewline(g, s))) {
+            return union(finished, get(segment([]), tail));
+        }
+        
+        // If the head doesn't contain a non-terminal, and if it doesn't have a
+        // newline, then add the head to the running segment and proceed with
+        // the tail.
+        else {
+            Segment old = running;
+            Segment new = old[symbols = old.symbols + head]; 
+            return get(new, tail);
+        }
+    }
+
+    // Only the very first running segment is initial; segments started after
+    // a non-terminal/newline use the default (`initial = false`)
+    return get(segment([], initial = true), symbols);
+}
+
+@synopsis{
+    Checks if a symbol has a newline character
+}
+
+// A symbol has a newline when at least one of the productions found for it
+// (after removing its label) has a newline.
+bool hasNewline(Grammar g, Symbol s)
+    = any(p <- lookup(g, delabel(s)), hasNewline(g, p));
 
 @synopsis{
     Checks if a production has a newline character
 }
 
-bool hasNewline(Grammar g, prod(_, symbols, _)) {
-    set[Symbol] nonTerminals = {s | /Symbol s := symbols, isNonTerminalType(s)};
-    return any(/r: range(_, _) := symbols, hasNewline(r)) ||
-        any(s <- nonTerminals, Production p <- lookup(g, s), hasNewline(g, p));
+// Looks up the answer for `p` in the (memoized) per-production map of `g`
+bool hasNewline(Grammar g, Production p)
+    = hasNewlineByProduction(g)[p];
+
+@memo
+private map[Production, bool] hasNewlineByProduction(Grammar g) {
+    // Start with `false` for every production that occurs in `g`
+    map[Production, bool] ret = (p: false | /p: prod(_, _, _) := g);
+
+    // Repeat until `ret` no longer changes. Only productions that are still
+    // `false` are revisited, so entries can only flip from `false` to `true`.
+    solve (ret) {
+        for (p <- ret, !ret[p]) {
+            set[Symbol] nonTerminals = {s | /Symbol s := p.symbols, isNonTerminalType(s)};
+            // A production has a newline when: (a) one of its character ranges
+            // has a newline (checked by `hasNewline(r)`, defined elsewhere --
+            // presumably a range check for the newline character); or (b) a
+            // production of one of its non-terminals is already known to have
+            // a newline.
+            ret[p] = ret[p] || any(/r: range(_, _) := p.symbols, hasNewline(r))
+                            || any(s <- nonTerminals, Production child <- lookup(g, s), ret[child]);
+        }
+    }
+
+    return ret;
 }
 
 @synopsis{

diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc
@@ -17,9 +17,12 @@ module lang::rascal::grammar::analyze::Symbols
 
 import Grammar;
 import ParseTree;
+import String;
+import util::Math;
 import util::Maybe;
 
 import lang::rascal::grammar::Util;
+import util::MaybeUtil;
 
 @synopsis{
     Representation of a traversal direction along a list of symbols
@@ -55,9 +58,9 @@ private map[Symbol, Maybe[set[Symbol]]] firstBySymbol(Grammar g, bool(Symbol) pr
 
     Maybe[set[Symbol]] firstOf([])
         = just({});
-    Maybe[set[Symbol]] firstOf([h, *t])
+    Maybe[set[Symbol]] firstOf([Symbol h, *Symbol t])
         = \set: just({\empty(), *_}) := ret[delabel(h)]
-        ? union(\set, firstOf(t))
+        ? util::MaybeUtil::union(\set, firstOf(t))
         : ret[delabel(h)];
 
     solve (ret) {
@@ -112,19 +115,61 @@ private map[Symbol, Maybe[set[Symbol]]] followBySymbol(Grammar g, bool(Symbol) p
     return ret;
 }
 
-private set[Symbol] unmaybe(just(set[Symbol] \set))
-    = \set;
-private set[Symbol] unmaybe(nothing())
-    = {};
-
-private Maybe[set[Symbol]] union(just(set[Symbol] \set1), just(set[Symbol] \set2))
-    = just(\set1 + \set2);
-private default Maybe[set[Symbol]] union(Maybe[set[Symbol]] _, Maybe[set[Symbol]] _)
-    = nothing();
-
 @synopsis{
     Checks if symbol `s` is a terminal
 }
 
 bool isTerminal(Symbol s)
-    = !isNonTerminalType(s);
+    = !isNonTerminalType(s);
+
+@synopsis{
+    Sorts list of terminals `symbols` by minimum length (in ascending order).
+    The minimum length of a symbol `s` is `length(s).min`.
+}
+
+list[Symbol] sortByMinimumLength(list[Symbol] symbols) {
+    // Strict "less than" on the minimum lengths
+    bool less(Symbol s1, Symbol s2) = length(s1).min < length(s2).min;
+    return sort(symbols, less);
+}
+
+@synopsis{
+    Representation of the minimum length and the maximum length of the text
+    produced by a symbol. If `max` is `nothing()`, then the text produced is
+    statically unbounded.
+}
+
+alias Range = tuple[int min, Maybe[int] max];
+
+// Range of the empty text
+private Range ZERO = <0, just(0)>;
+// Sequential composition: minimum and maximum lengths are added
+private Range seq(Range r1, Range r2) = <r1.min + r2.min, add(r1.max, r2.max)>;
+// Alternative composition: smallest minimum, largest maximum
+private Range alt(Range r1, Range r2) = <min(r1.min, r2.min), max(r1.max, r2.max)>;
+
+// Addition/maximum on optional bounds: `nothing()` (unbounded) is absorbing
+private Maybe[int] add(just(int i), just(int j)) = just(i + j);
+private default Maybe[int] add(Maybe[int] _, Maybe[int] _) = nothing();
+
+private Maybe[int] max(just(int i), just(int j)) = just(max(i, j));
+private default Maybe[int] max(Maybe[int] _, Maybe[int] _) = nothing();
+
+@synopsis{
+    Computes the length of a terminal symbol as a range. Labels and conditions
+    don't contribute to the length: the range of the underlying symbol is
+    returned.
+}
+
+Range length(\lit(string))   = <size(string), just(size(string))>;
+Range length(\cilit(string)) = <size(string), just(size(string))>;
+Range length(\char-class(_)) = <1, just(1)>;
+
+Range length(\empty())                   = ZERO;
+Range length(\opt(symbol))               = length(symbol)[min = 0];
+Range length(\iter(symbol))              = length(symbol)[max = issue2007];
+// Zero-or-more repetitions are bounded only when each repetition is empty
+Range length(\iter-star(symbol))         = <0, max: just(0) := length(symbol).max ? max : nothing()>;
+Range length(\iter-seps(symbol, _))      = length(symbol)[max = issue2007];
+Range length(\iter-star-seps(symbol, _)) = <0, max: just(0) := length(symbol).max ? max : nothing()>;
+Range length(\alt(alternatives))         = {Symbol first, *Symbol rest} := alternatives
+                                         ? (length(first) | alt(it, length(s)) | s <- rest)
+                                         : ZERO;
+Range length(\seq(symbols))              = (ZERO | seq(it, length(s)) | s <- symbols);
+
+Range length(\conditional(symbol, _)) = length(symbol);
+// Labelled terminals can occur in symbol lists (labels are preserved by
+// `destar`); without this case, `length` has no matching alternative for them
+Range length(\label(_, symbol))       = length(symbol);
+
+// TODO: Remove this workaround when issue #2007 is fixed:
+//   - https://github.com/usethesource/rascal/issues/2007
+private Maybe[int] issue2007 = nothing();
diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
@@ -16,6 +16,7 @@ import lang::rascal::grammar::Util;
 import lang::rascal::grammar::analyze::Delimiters;
 import lang::rascal::grammar::analyze::Dependencies;
 import lang::rascal::grammar::analyze::Newlines;
+import lang::rascal::grammar::analyze::Symbols;
 import lang::textmate::ConversionConstants;
 import lang::textmate::ConversionUnit;
 import lang::textmate::Grammar;
@@ -215,13 +216,18 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
             // Simple case: each unit does have an `end` inner delimiter
             if (_ <- group && all(u <- group, just(_) := u.innerDelimiters.end)) {
 
-                // Compute a list of terminals that need to be consumed between
+                // Compute a list of segments that need to be consumed between
                 // the `begin` delimiter and the `end` delimiters. Each of these
-                // terminals will be converted to a match pattern.
-                list[Symbol] terminals = [*getTerminals(rsc, u.prod) | u <- group];
-                terminals = [s | s <- terminals, s notin begins && s notin ends];
+                // segments will be converted to a match pattern.
+                set[Segment] segs = {*getSegments(rsc, u.prod) | u <- group};
+                segs = {removeBeginEnd(seg, begins, ends) | seg <- segs};
+
+                list[Symbol] terminals = [\seq(seg.symbols) | seg <- segs];
+                terminals = [s | s <- terminals, [] != s.symbols];
                 terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
                 terminals = dup(terminals);
+                terminals = sortByMinimumLength(terminals); // Small symbols first
+                terminals = reverse(terminals); // Large symbols first
                 terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)
                 
                 TmRule r = toTmRule(
@@ -288,6 +294,18 @@ private list[ConversionUnit] addOuterRules(list[ConversionUnit] units) {
     // precision than a unit-driven approach; I suspect it might.
 }
 
+// Strips delimiters from the edges of segment `seg`: the first symbol is
+// removed when `seg` is initial and that symbol is in `begins`; subsequently,
+// the last remaining symbol is removed when `seg` is final and that symbol is
+// in `ends`. All other symbols (and the flags of `seg`) are kept as-is.
+private Segment removeBeginEnd(Segment seg, set[Symbol] begins, set[Symbol] ends) {
+    list[Symbol] remaining = seg.symbols;
+
+    // Leading `begin` delimiter
+    if (seg.initial && [Symbol firstSymbol, *Symbol others] := remaining && firstSymbol in begins) {
+        remaining = others;
+    }
+
+    // Trailing `end` delimiter (checked on what remains after the step above)
+    if (seg.final && [*Symbol others, Symbol lastSymbol] := remaining && lastSymbol in ends) {
+        remaining = others;
+    }
+
+    return seg[symbols = remaining];
+}
+
 // TODO: This function could be moved to a separate, generic module
 private list[&T] dupLast(list[&T] l)
     = reverse(dup(reverse(l))); // TODO: Optimize/avoid `reverse`-ing?

diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc
@@ -44,7 +44,11 @@ syntax Expression
 
 lexical Id = ([a-z][a-z0-9]*) !>> [a-z0-9] \ Keyword;
 lexical Natural = [0-9]+ !>> [0-9];
-lexical String = "\"" ![\"]* "\"";
+lexical String = "\"" Char* "\"";
+
+lexical Char
+    = ![\\\"]
+    | "\\" [\\\"];
 
 keyword Keyword
     = "begin"
@@ -70,7 +74,7 @@ lexical WhitespaceAndComment
 Grammar rsc = preprocess(grammar(#Program));
 
 list[ConversionUnit] units = [
-    unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":=")})],{}), false, <nothing(),nothing()>, <nothing(),nothing()>),
+    unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":="),lit("\\")})],{}), false, <nothing(),nothing()>, <nothing(),nothing()>),
     unit(rsc, prod(label("natural",sort("Type")),[lit("natural")],{\tag("category"("storage.type"))}), false, <just(lit(":")),just(lit(";"))>, <nothing(),nothing()>),
     unit(rsc, prod(label("nil",sort("Type")),[lit("nil-type")],{\tag("category"("storage.type"))}), false, <just(lit(":")),just(lit(";"))>, <just(lit("nil-type")),just(lit("nil-type"))>),
     unit(rsc, prod(label("string",sort("Type")),[lit("string")],{\tag("category"("storage.type"))}), false, <just(lit(":")),just(lit(";"))>, <nothing(),nothing()>),