From dd9d24af6290e9f0f181f87834b688ad942f4923 Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Fri, 23 Aug 2024 16:09:20 +0200 Subject: [PATCH 01/16] Extend `destar` with cases for `\seq` and `\alt` --- .../src/main/rascal/lang/rascal/grammar/Util.rsc | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc index a538be6..7830b76 100644 --- a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc @@ -84,21 +84,26 @@ Symbol expand(\iter-star-seps(symbol, separators)) Removes the label from symbol `s`, if any } -Symbol delabel(label(_, Symbol s)) = s; -default Symbol delabel(Symbol s) = s; +Symbol delabel(\label(_, Symbol s)) = delabel(s); +default Symbol delabel(Symbol s) = s; @synopsis{ Removes operators `?` and `*` from symbol `s`, if any } -Symbol destar(label(name, symbol)) +Symbol destar(\label(name, symbol)) = label(name, destar(symbol)); + Symbol destar(\opt(symbol)) = destar(symbol); Symbol destar(\iter-star(symbol)) = \iter(destar(symbol)); Symbol destar(\iter-star-seps(symbol, separators)) = \iter-seps(destar(symbol), separators); +Symbol destar(\seq([symbol])) + = \seq([destar(symbol)]); +Symbol destar(\alt({symbol})) + = \alt({destar(symbol)}); default Symbol destar(Symbol s) = s; From 8155fb6618c0b0c32b51f9587e1be3b1fec4ed5c Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Fri, 23 Aug 2024 16:09:49 +0200 Subject: [PATCH 02/16] Add module `MaybeUtil` --- .../src/main/rascal/util/MaybeUtil.rsc | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc diff --git a/rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc b/rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc new file mode 100644 index 0000000..bd34f33 --- /dev/null 
+++ b/rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc @@ -0,0 +1,28 @@ +@synopsis{ + Utility functions for `Maybe` values +} + +module util::MaybeUtil + +import util::Maybe; + +@synopsis{ + Returns the set of a `Maybe` value when present. Returns the empty set when + absent. +} + +set[&T] unmaybe(Maybe[set[&T]] _: nothing()) + = {}; +set[&T] unmaybe(Maybe[set[&T]] _: just(set[&T] \set)) + = \set; + +@synopsis{ + Returns just the union of the sets of two `Maybe` values when present. + Returns nothing if absent. +} + +Maybe[set[&T]] union(just(set[&T] set1), just(set[&T] set2)) + = just(set1 + set2); + +default Maybe[set[&T]] union(Maybe[set[&T]] _, Maybe[set[&T]] _) + = nothing(); \ No newline at end of file From 689545cb4f7bdbce118be246afea406fcdfb9e0f Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Fri, 23 Aug 2024 16:10:57 +0200 Subject: [PATCH 03/16] Use new module `MaybeUtil` in existing code --- .../rascal/lang/rascal/grammar/analyze/Symbols.rsc | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc index a1b01b8..320252e 100644 --- a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc @@ -20,6 +20,7 @@ import ParseTree; import util::Maybe; import lang::rascal::grammar::Util; +import util::MaybeUtil; @synopsis{ Representation of a traversal direction along a list of symbols @@ -112,16 +113,6 @@ private map[Symbol, Maybe[set[Symbol]]] followBySymbol(Grammar g, bool(Symbol) p return ret; } -private set[Symbol] unmaybe(just(set[Symbol] \set)) - = \set; -private set[Symbol] unmaybe(nothing()) - = {}; - -private Maybe[set[Symbol]] union(just(set[Symbol] \set1), just(set[Symbol] \set2)) - = just(\set1 + \set2); -private default Maybe[set[Symbol]] 
union(Maybe[set[Symbol]] _, Maybe[set[Symbol]] _) - = nothing(); - @synopsis{ Checks if symbol `s` is a terminal } From c464799d0110b8128c4f082afa2f7291b4e757a7 Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Fri, 23 Aug 2024 16:17:22 +0200 Subject: [PATCH 04/16] Add function to compute the newline-separated segments of a list of symbols --- .../lang/rascal/grammar/analyze/Newlines.rsc | 120 +++++++++++++++++- 1 file changed, 116 insertions(+), 4 deletions(-) diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc index 6bdb3ca..c0f5d6d 100644 --- a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc @@ -7,17 +7,129 @@ module lang::rascal::grammar::analyze::Newlines import Grammar; import ParseTree; import String; +import util::Maybe; import lang::rascal::grammar::Util; +import util::MaybeUtil; + +@synopsis{ + Representation of a *newline-free* segment of symbols +} + +alias Segment = list[Symbol]; + +@synopsis{ + Gets the (newline-free) segments of a production/list of symbols in grammar + `g`, separated by symbols that have a newline (not part of any segment), + recursively for non-terminals. For instance, the segments of + `[lit("foo"), lit("bar"), lit("\n"), lit("baz")]` are: + - `[lit("foo"), lit("bar")]`; + - `[lit("baz")]`. 
+} + +set[Segment] getSegments(Grammar g, Production p) { + return unmaybe(getSegmentsByProduction(g)[p]); +} + +set[Segment] getSegments(Grammar g, list[Symbol] symbols) { + map[Production, Maybe[set[Segment]]] env = getSegmentsByProduction(g); + return unmaybe(getSegmentsWithEnvironment(g, symbols, env)); +} + +@memo +private map[Production, Maybe[set[Segment]]] getSegmentsByProduction(Grammar g) { + map[Production, Maybe[set[Segment]]] ret = (p : nothing() | /p: prod(_, _, _) := g); + + solve (ret) { + for (p <- ret, nothing() == ret[p]) { + ret[p] = getSegmentsWithEnvironment(g, p.symbols, ret); + } + } + + return ret; +} + +private Maybe[set[Segment]] getSegmentsWithEnvironment( + Grammar g, list[Symbol] symbols, + map[Production, Maybe[set[Segment]]] env) { + + // General idea: Recursively traverse `symbols` from left to right, while + // keeping track of a "running segment" (initially empty). Each time a + // symbol that has a newline is encountered, finish/collect the running + // segment, and start a new one for the remainder of `symbols`. + + // Final case: No symbols remaining + Maybe[set[Segment]] get(Segment runningSegment, []) { + return just(_ <- runningSegment ? {runningSegment} : {}); + } + + // Recursive case: At least one symbol remaining + Maybe[set[Segment]] get(Segment segment, [Symbol head, *Symbol tail]) { + set[Symbol] nested = {s | /Symbol s := head}; + + // If the head contains a non-terminal, then: (1) finish the running + // segment; (2) lookup the segments of the non-terminals in the + // environment, if any; (3) compute the segments of the tail. Return the + // union of 1-3. 
+ if (any(s <- nested, isNonTerminalType(s))) { + + list[Maybe[set[Segment]]] sets + = [get(segment, [])] // (1) + + [env[p] | s <- nested, isNonTerminalType(s), p <- lookup(g, s)] // (2) + + [get([], tail)]; // (3) + + return (sets[0] | union(it, \set) | \set <- sets[1..]); + + } + + // If the head doesn't contain a non-terminal, but it has a newline, + // then: (1) finish the running segment; (2) compute the segments of the + // tail. Return the union of 1-2. Note: the head is ignored and won't be + // part of any segment. + else if (any(s <- nested, hasNewline(g, s))) { + return union(get(segment, []), get([], tail)); + } + + // If the head doesn't contain a non-terminal, and if it doesn't have a + // newline, then add the head to the running segment and proceed with + // the tail. + else { + return get(segment + head, tail); + } + } + + return get([], symbols); +} + +@synopsis{ + Checks if a symbol has a newline character +} + +bool hasNewline(Grammar g, Symbol s) { + return any(p <- lookup(g, delabel(s)), hasNewline(g, p)); +} @synopsis{ Checks if a production has a newline character } -bool hasNewline(Grammar g, prod(_, symbols, _)) { - set[Symbol] nonTerminals = {s | /Symbol s := symbols, isNonTerminalType(s)}; - return any(/r: range(_, _) := symbols, hasNewline(r)) || - any(s <- nonTerminals, Production p <- lookup(g, s), hasNewline(g, p)); +bool hasNewline(Grammar g, Production p) { + return hasNewlineByProduction(g)[p]; +} + +@memo +private map[Production, bool] hasNewlineByProduction(Grammar g) { + map[Production, bool] ret = (p: false | /p: prod(_, _, _) := g); + + solve (ret) { + for (p <- ret, !ret[p]) { + set[Symbol] nonTerminals = {s | /Symbol s := p.symbols, isNonTerminalType(s)}; + ret[p] = ret[p] || any(/r: range(_, _) := p.symbols, hasNewline(r)) + || any(s <- nonTerminals, Production child <- lookup(g, s), ret[child]); + } + } + + return ret; } @synopsis{ From 0c3b83bf54008c05a7cdcf164ead9899a0d3a95b Mon Sep 17 00:00:00 2001 From: Sung-Shik 
Jongmans Date: Fri, 23 Aug 2024 16:20:52 +0200 Subject: [PATCH 05/16] Use segments (instead of terminals) in the generation of begin/end patterns --- .../src/main/rascal/lang/rascal/grammar/Util.rsc | 9 --------- .../src/main/rascal/lang/textmate/Conversion.rsc | 14 ++++++++++---- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc index 7830b76..a7ad3ad 100644 --- a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc @@ -31,15 +31,6 @@ bool tryParse(Grammar g, Symbol s, str input, bool allowAmbiguity = false) { return false; } -@synopsis{ - Gets the terminals that occur in production `p`, possibly recursively - (default: `true`) -} - -set[Symbol] getTerminals(Grammar g, Production p, bool recur = true) - = {s | s <- p.symbols, !isNonTerminalType(s)} - + {*getTerminals(g, child) | recur, s <- p.symbols, child <- lookup(g, s)}; - @synopsis{ Lookups a list of productions for symbol `s` in grammar `g`, replacing formal parameters with actual parameters when needed diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc index 2bab444..e6b28c9 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc @@ -215,11 +215,17 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) { // Simple case: each unit does have an `end` inner delimiter if (_ <- group && all(u <- group, just(_) := u.innerDelimiters.end)) { - // Compute a list of terminals that need to be consumed between + // Compute a list of segments that need to be consumed between // the `begin` delimiter and the `end` delimiters. 
Each of these - // terminals will be converted to a match pattern. - list[Symbol] terminals = [*getTerminals(rsc, u.prod) | u <- group]; - terminals = [s | s <- terminals, s notin begins && s notin ends]; + // segments will be converted to a match pattern. + set[list[Symbol]] segments = {*getSegments(rsc, u.prod) | u <- group}; + + list[Symbol] terminals + = [\seq([ *ys ]) | [x, *ys, z] <- segments, x == begin, z in ends] + + [\seq([ *ys, z]) | [x, *ys, z] <- segments, x == begin, z notin ends] + + [\seq([x, *ys ]) | [x, *ys, z] <- segments, x != begin, z in ends] + + [\seq([x, *ys, z]) | [x, *ys, z] <- segments, x != begin, z notin ends]; + terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly terminals = dup(terminals); terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback) From 91b7ca9f98dca90b9b305c43b1107785a97c0ad0 Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Fri, 23 Aug 2024 16:21:23 +0200 Subject: [PATCH 06/16] Update generated TextMate grammar for Rascal/Pico --- .../syntaxes/pico.tmLanguage.json | 2 +- .../syntaxes/rascal.tmLanguage.json | 96 ++++++++++++++++++- 2 files changed, 93 insertions(+), 5 deletions(-) diff --git a/vscode-extension/syntaxes/pico.tmLanguage.json b/vscode-extension/syntaxes/pico.tmLanguage.json index 2a57152..dda2f4d 100644 --- a/vscode-extension/syntaxes/pico.tmLanguage.json +++ b/vscode-extension/syntaxes/pico.tmLanguage.json @@ -14,7 +14,7 @@ "end": "(\\\")", "patterns": [ { - "match": "((?:\\\")[\\x{01}-\\!\\#-\\x{10FFFF}]*?(?:\\\"))", + "match": "([\\x{01}-\\!\\#-\\x{10FFFF}]+?)", "captures": { "1": { "name": "string.quoted.double" diff --git a/vscode-extension/syntaxes/rascal.tmLanguage.json b/vscode-extension/syntaxes/rascal.tmLanguage.json index 1216e9b..7f37239 100644 --- a/vscode-extension/syntaxes/rascal.tmLanguage.json +++ b/vscode-extension/syntaxes/rascal.tmLanguage.json @@ -221,7 +221,39 @@ "end": "((?:\\\")|(?:\\<))", 
"patterns": [ { - "match": "((?:(?:(?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])|[\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}]|(?:(?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))|(?:(?:(?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)a[0-7][0-9A-Fa-f])))+?)", + "match": "((?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])", + "captures": { + "1": { + "name": "string.quoted.double" + } + } + }, + { + "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))", + "captures": { + "1": { + "name": "string.quoted.double" + } + } + }, + { + "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])", + "captures": { + "1": { + "name": "string.quoted.double" + } + } + }, + { + "match": "((?:\\\\)a[0-7][0-9A-Fa-f])", + "captures": { + "1": { + "name": "string.quoted.double" + } + } + }, + { + "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])", "captures": { "1": { "name": "string.quoted.double" @@ -254,7 +286,7 @@ "end": "((?:\\\")|(?:\\<))", "patterns": [ { - "match": "((?:(?:(?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])|[\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}]|(?:(?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))|(?:(?:(?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)a[0-7][0-9A-Fa-f])))+?)", + "match": "((?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])", "captures": { "1": { "name": "string.quoted.double" @@ -262,7 +294,31 @@ } }, { - "match": 
"((?:(?:(?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])|[\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}]|(?:(?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))|(?:(?:(?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)a[0-7][0-9A-Fa-f])))+?)", + "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))", + "captures": { + "1": { + "name": "string.quoted.double" + } + } + }, + { + "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])", + "captures": { + "1": { + "name": "string.quoted.double" + } + } + }, + { + "match": "((?:\\\\)a[0-7][0-9A-Fa-f])", + "captures": { + "1": { + "name": "string.quoted.double" + } + } + }, + { + "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])", "captures": { "1": { "name": "string.quoted.double" @@ -477,7 +533,39 @@ "end": "(\\')", "patterns": [ { - "match": "((?:(?:(?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])|[\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}]|(?:(?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))|(?:(?:(?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])|(?:(?:\\\\)a[0-7][0-9A-Fa-f])))+?)", + "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?)", + "captures": { + "1": { + "name": "string.quoted.single" + } + } + }, + { + "match": "((?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])", + "captures": { + "1": { + "name": "string.quoted.single" + } + } + }, + { + "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])", + "captures": { + "1": { + "name": "string.quoted.single" + } + } + }, + { + "match": "((?:\\\\)a[0-7][0-9A-Fa-f])", + "captures": { + "1": { + "name": "string.quoted.single" + } + } + }, + { + 
"match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])", "captures": { "1": { "name": "string.quoted.single" From 7951a7ed3b1de648618ed14a8bc0e2ab6da92392 Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Mon, 2 Sep 2024 11:27:16 +0200 Subject: [PATCH 07/16] Add a few clarifying comments --- .../src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc | 5 ++--- .../src/main/rascal/lang/textmate/Conversion.rsc | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc index c0f5d6d..e864dda 100644 --- a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc @@ -79,13 +79,12 @@ private Maybe[set[Segment]] getSegmentsWithEnvironment( + [get([], tail)]; // (3) return (sets[0] | union(it, \set) | \set <- sets[1..]); - } // If the head doesn't contain a non-terminal, but it has a newline, // then: (1) finish the running segment; (2) compute the segments of the - // tail. Return the union of 1-2. Note: the head is ignored and won't be - // part of any segment. + // tail. Return the union of 1-2. Note: the head, as it has a newline, + // is ignored and won't be part of any segment. else if (any(s <- nested, hasNewline(g, s))) { return union(get(segment, []), get([], tail)); } diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc index e6b28c9..1d6922f 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc @@ -220,7 +220,7 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) { // segments will be converted to a match pattern. 
set[list[Symbol]] segments = {*getSegments(rsc, u.prod) | u <- group}; - list[Symbol] terminals + list[Symbol] terminals // Remove `begin` and `end` from the segments = [\seq([ *ys ]) | [x, *ys, z] <- segments, x == begin, z in ends] + [\seq([ *ys, z]) | [x, *ys, z] <- segments, x == begin, z notin ends] + [\seq([x, *ys ]) | [x, *ys, z] <- segments, x != begin, z in ends] From d805b5a11db5ea4b95c13a64a697243578059f35 Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Mon, 2 Sep 2024 12:01:20 +0200 Subject: [PATCH 08/16] Add tests --- .../textmate/conversiontests/PicoWithCategories.rsc | 8 ++++++-- .../conversiontests/PicoWithCategories.test | 13 ++++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc index 36db90d..9a81068 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc @@ -44,7 +44,11 @@ syntax Expression lexical Id = ([a-z][a-z0-9]*) !>> [a-z0-9] \ Keyword; lexical Natural = [0-9]+ !>> [0-9]; -lexical String = "\"" ![\"]* "\""; +lexical String = "\"" Char* "\""; + +lexical Char + = ![\"] + | "\\" [\"]; keyword Keyword = "begin" @@ -70,7 +74,7 @@ lexical WhitespaceAndComment Grammar rsc = preprocess(grammar(#Program)); list[ConversionUnit] units = [ - unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":=")})],{}), false, , ), + unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":="),lit("\\")})],{}), false, , ), unit(rsc, prod(label("natural",sort("Type")),[lit("natural")],{\tag("category"("storage.type"))}), false, , ), unit(rsc, 
prod(label("nil",sort("Type")),[lit("nil-type")],{\tag("category"("storage.type"))}), false, , ), unit(rsc, prod(label("string",sort("Type")),[lit("string")],{\tag("category"("storage.type"))}), false, , ), diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test index 8a4aabe..a5a6459 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test @@ -48,9 +48,16 @@ # ^ -string.quoted.double # ^^^^^ string.quoted.double - "foo\" bar -# ^^^^^^ string.quoted.double -# ^^^^ -string.quoted.double + "foo" bar +# ^^^^^ string.quoted.double +# ^^^^ -string.quoted.double + + "foo\" bar" +# ^^^^^^^^^^^ string.quoted.double + + "foo\\" bar +# ^^^^^^^ string.quoted.double +# ^^^^ -string.quoted.double "foo # ^^^^ string.quoted.double From 40a4712280ccd47e0d9c678f927163477c88ae6b Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Mon, 2 Sep 2024 14:23:54 +0200 Subject: [PATCH 09/16] Add another test and fix a bug to make the test pass --- .../lang/rascal/grammar/analyze/Newlines.rsc | 53 ++++++++++++++----- .../main/rascal/lang/textmate/Conversion.rsc | 24 ++++++--- .../conversiontests/PicoWithCategories.test | 7 +++ 3 files changed, 62 insertions(+), 22 deletions(-) diff --git a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc index e864dda..b4e0d0b 100644 --- a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc @@ -13,10 +13,15 @@ import lang::rascal::grammar::Util; import util::MaybeUtil; @synopsis{ - Representation of a *newline-free* segment of symbols + Representation of a 
*newline-free* segment of symbols. A segment is + *initial* when it occurs first in a production/list of symbols; it is + *final* when it occurs last. } -alias Segment = list[Symbol]; +data Segment = segment( + list[Symbol] symbols, + bool initial = false, + bool final = false); @synopsis{ Gets the (newline-free) segments of a production/list of symbols in grammar @@ -58,26 +63,44 @@ private Maybe[set[Segment]] getSegmentsWithEnvironment( // symbol that has a newline is encountered, finish/collect the running // segment, and start a new one for the remainder of `symbols`. - // Final case: No symbols remaining - Maybe[set[Segment]] get(Segment runningSegment, []) { - return just(_ <- runningSegment ? {runningSegment} : {}); + // Base case: No symbols remaining + Maybe[set[Segment]] get(Segment running, [], bool final = true) { + return just(_ <- running.symbols ? {running[final = final]} : {}); } // Recursive case: At least one symbol remaining - Maybe[set[Segment]] get(Segment segment, [Symbol head, *Symbol tail]) { + Maybe[set[Segment]] get(Segment running, [Symbol head, *Symbol tail]) { set[Symbol] nested = {s | /Symbol s := head}; + + Maybe[set[Segment]] finished = get(running, [], final = tail == []); // If the head contains a non-terminal, then: (1) finish the running // segment; (2) lookup the segments of the non-terminals in the // environment, if any; (3) compute the segments of the tail. Return the // union of 1-3. if (any(s <- nested, isNonTerminalType(s))) { + list[Maybe[set[Segment]]] sets = []; + + // (1) + sets += finished; + + // (2) + sets += for (s <- nested, isNonTerminalType(s), p <- lookup(g, s)) { + + bool isInitial(Segment seg) + = seg.initial && running.initial && running.symbols == []; + bool isFinal(Segment seg) + = seg.final && tail == []; + Segment update(Segment seg) + = seg[initial = isInitial(seg)][final = isFinal(seg)]; + + append just(segs) := env[p] ? 
just({update(seg) | seg <- segs}) : nothing(); + } + + // (3) + sets += get(segment([]), tail); - list[Maybe[set[Segment]]] sets - = [get(segment, [])] // (1) - + [env[p] | s <- nested, isNonTerminalType(s), p <- lookup(g, s)] // (2) - + [get([], tail)]; // (3) - + // Return union return (sets[0] | union(it, \set) | \set <- sets[1..]); } @@ -86,18 +109,20 @@ private Maybe[set[Segment]] getSegmentsWithEnvironment( // tail. Return the union of 1-2. Note: the head, as it has a newline, // is ignored and won't be part of any segment. else if (any(s <- nested, hasNewline(g, s))) { - return union(get(segment, []), get([], tail)); + return union(finished, get(segment([]), tail)); } // If the head doesn't contain a non-terminal, and if it doesn't have a // newline, then add the head to the running segment and proceed with // the tail. else { - return get(segment + head, tail); + Segment old = running; + Segment new = old[symbols = old.symbols + head]; + return get(new, tail); } } - return get([], symbols); + return get(segment([], initial = true), symbols); } @synopsis{ diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc index 1d6922f..81b19b0 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc @@ -218,14 +218,22 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) { // Compute a list of segments that need to be consumed between // the `begin` delimiter and the `end` delimiters. Each of these // segments will be converted to a match pattern. 
- set[list[Symbol]] segments = {*getSegments(rsc, u.prod) | u <- group}; - - list[Symbol] terminals // Remove `begin` and `end` from the segments - = [\seq([ *ys ]) | [x, *ys, z] <- segments, x == begin, z in ends] - + [\seq([ *ys, z]) | [x, *ys, z] <- segments, x == begin, z notin ends] - + [\seq([x, *ys ]) | [x, *ys, z] <- segments, x != begin, z in ends] - + [\seq([x, *ys, z]) | [x, *ys, z] <- segments, x != begin, z notin ends]; - + list[Segment] segments = [*getSegments(rsc, u.prod) | u <- group]; + + Segment removeBeginEnd(Segment seg) { + list[Symbol] symbols = seg.symbols; + if (seg.initial, _ <- symbols, symbols[0] == begin) { + symbols = symbols[1..]; + } + if (seg.final, _ <- symbols, symbols[-1] in ends) { + symbols = symbols[..-1]; + } + + return seg[symbols = symbols]; + } + + list[Symbol] terminals = [\seq(removeBeginEnd(seg).symbols) | seg <- segments]; + terminals = [s | s <- terminals, [] != s.symbols]; terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly terminals = dup(terminals); terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback) diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test index a5a6459..df5915e 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test @@ -62,4 +62,11 @@ "foo # ^^^^ string.quoted.double bar" +# ^^^^ string.quoted.double + + "foo +# ^^^^ string.quoted.double + \" +# ^^ string.quoted.double + bar" # ^^^^ string.quoted.double \ No newline at end of file From 49547a4622dce01720eafa09d5571e3c5bb0afc6 Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Tue, 3 Sep 2024 08:56:24 +0200 Subject: [PATCH 10/16] Update generated TextMate grammar for 
Rascal/Pico --- .../syntaxes/pico.tmLanguage.json | 14 ++++++-- .../syntaxes/rascal.tmLanguage.json | 32 ++++++++++++++++--- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/vscode-extension/syntaxes/pico.tmLanguage.json b/vscode-extension/syntaxes/pico.tmLanguage.json index dda2f4d..a489f46 100644 --- a/vscode-extension/syntaxes/pico.tmLanguage.json +++ b/vscode-extension/syntaxes/pico.tmLanguage.json @@ -14,7 +14,15 @@ "end": "(\\\")", "patterns": [ { - "match": "([\\x{01}-\\!\\#-\\x{10FFFF}]+?)", + "match": "((?:\\\\)(?:\\\"))", + "captures": { + "1": { + "name": "string.quoted.double" + } + } + }, + { + "match": "([\\x{01}-\\!\\#-\\x{10FFFF}])", "captures": { "1": { "name": "string.quoted.double" @@ -52,7 +60,7 @@ } }, "/inner/single/$delimiters": { - "match": "(?:(?:\\-)|(?:\\,)|(?:\\))|(?:\\()|(?:\\+)|(?:\\|\\|)|(?:\\:\\=))", + "match": "(?:(?:\\-)|(?:\\,)|(?:\\))|(?:\\()|(?:\\+)|(?:\\|\\|)|(?:\\:\\=)|(?:\\\\))", "name": "/inner/single/$delimiters", "captures": {} }, @@ -189,7 +197,7 @@ } }, "/inner/single/expression.strcon": { - "match": "((?<=(?:[\\t-\\n\\r\\x{20}\\%]|(?:(?:^))))(?:(?:\\\")[\\x{01}-\\!\\#-\\x{10FFFF}]*?(?:\\\")))", + "match": "((?<=(?:[\\t-\\n\\r\\x{20}\\%]|(?:(?:^))))(?:(?:\\\")(?:(?:(?:\\\\)(?:\\\"))|[\\x{01}-\\!\\#-\\x{10FFFF}])*?(?:\\\")))", "name": "/inner/single/expression.strcon", "captures": { "1": { diff --git a/vscode-extension/syntaxes/rascal.tmLanguage.json b/vscode-extension/syntaxes/rascal.tmLanguage.json index 7f37239..4095f9d 100644 --- a/vscode-extension/syntaxes/rascal.tmLanguage.json +++ b/vscode-extension/syntaxes/rascal.tmLanguage.json @@ -229,7 +229,7 @@ } }, { - "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))", + "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])", "captures": { "1": { "name": "string.quoted.double" @@ -244,6 +244,14 @@ } } }, + { + "match": 
"((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))", + "captures": { + "1": { + "name": "string.quoted.double" + } + } + }, { "match": "((?:\\\\)a[0-7][0-9A-Fa-f])", "captures": { @@ -294,7 +302,7 @@ } }, { - "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))", + "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])", "captures": { "1": { "name": "string.quoted.double" @@ -309,6 +317,14 @@ } } }, + { + "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))", + "captures": { + "1": { + "name": "string.quoted.double" + } + } + }, { "match": "((?:\\\\)a[0-7][0-9A-Fa-f])", "captures": { @@ -533,7 +549,7 @@ "end": "(\\')", "patterns": [ { - "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?)", + "match": "((?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])", "captures": { "1": { "name": "string.quoted.single" @@ -541,7 +557,7 @@ } }, { - "match": "((?:\\\\)U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)[0-9A-Fa-f]))[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])", + "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])", "captures": { "1": { "name": "string.quoted.single" @@ -556,6 +572,14 @@ } } }, + { + "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))", + "captures": { + "1": { + "name": "string.quoted.single" + } + } + }, { "match": "((?:\\\\)a[0-7][0-9A-Fa-f])", "captures": { From 05eb324fba5703fcbe54078bf977ec0ffb10b856 Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Fri, 6 Sep 2024 11:41:10 +0200 Subject: [PATCH 11/16] Add utility functions to compute the expected min/max length of terminals --- .../lang/rascal/grammar/analyze/Symbols.rsc | 60 ++++++++++++++++++- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git 
a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc index 320252e..27ba177 100644 --- a/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc @@ -17,6 +17,8 @@ module lang::rascal::grammar::analyze::Symbols import Grammar; import ParseTree; +import String; +import util::Math; import util::Maybe; import lang::rascal::grammar::Util; @@ -56,9 +58,9 @@ private map[Symbol, Maybe[set[Symbol]]] firstBySymbol(Grammar g, bool(Symbol) pr Maybe[set[Symbol]] firstOf([]) = just({}); - Maybe[set[Symbol]] firstOf([h, *t]) + Maybe[set[Symbol]] firstOf([Symbol h, *Symbol t]) = \set: just({\empty(), *_}) := ret[delabel(h)] - ? union(\set, firstOf(t)) + ? util::MaybeUtil::union(\set, firstOf(t)) : ret[delabel(h)]; solve (ret) { @@ -118,4 +120,56 @@ private map[Symbol, Maybe[set[Symbol]]] followBySymbol(Grammar g, bool(Symbol) p } bool isTerminal(Symbol s) - = !isNonTerminalType(s); \ No newline at end of file + = !isNonTerminalType(s); + +@synopsis{ + Sorts list of terminals `symbols` by minimum length (in ascending order) +} + +list[Symbol] sortByMinimumLength(list[Symbol] symbols) { + bool less(Symbol s1, Symbol s2) = length(s1).min < length(s2).min; + return sort(symbols, less); +} + +@synopsis{ + Representation of the minimum length and the maximum length of the text + produced by a symbol. If `max` is `nothing()`, then the text produced is + statically unbounded. 
+} + +alias Range = tuple[int min, Maybe[int] max]; + +private Range ZERO = <0, just(0)>; +private Range seq(Range r1, Range r2) = <r1.min + r2.min, add(r1.max, r2.max)>; +private Range alt(Range r1, Range r2) = <min(r1.min, r2.min), max(r1.max, r2.max)>; + +private Maybe[int] add(just(int i), just(int j)) = just(i + j); +private default Maybe[int] add(Maybe[int] _, Maybe[int] _) = nothing(); + +private Maybe[int] max(just(int i), just(int j)) = just(max(i, j)); +private default Maybe[int] max(Maybe[int] _, Maybe[int] _) = nothing(); + +@synopsis{ + Computes the length of a terminal symbol as a range +} + +Range length(\lit(string)) = <size(string), just(size(string))>; +Range length(\cilit(string)) = <size(string), just(size(string))>; +Range length(\char-class(_)) = <1, just(1)>; + +Range length(\empty()) = ZERO; +Range length(\opt(symbol)) = length(symbol)[min = 0]; +Range length(\iter(symbol)) = length(symbol)[max = issue2007]; +Range length(\iter-star(symbol)) = <0, max: just(0) := length(symbol).max ? max : nothing()>; +Range length(\iter-seps(symbol, _)) = length(symbol)[max = issue2007]; +Range length(\iter-star-seps(symbol, _)) = <0, max: just(0) := length(symbol).max ? max : nothing()>; +Range length(\alt(alternatives)) = {Symbol first, *Symbol rest} := alternatives + ? 
(length(first) | alt(it, length(s)) | s <- rest) + : ZERO; +Range length(\seq(symbols)) = (ZERO | seq(it, length(s)) | s <- symbols); + +Range length(\conditional(symbol, _)) = length(symbol); + +// TODO: Remove this workaround when issue #2007 is fixed: +// - https://github.com/usethesource/rascal/issues/2007 +private Maybe[int] issue2007 = nothing(); \ No newline at end of file From e869ffd9aa91ad0f306fdaf6f56e7e153a7b3a69 Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Fri, 6 Sep 2024 11:47:08 +0200 Subject: [PATCH 12/16] Add sorting for terminals-to-gobble (based on segments) --- .../src/main/rascal/lang/textmate/Conversion.rsc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc index 81b19b0..03ea76f 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc @@ -16,6 +16,7 @@ import lang::rascal::grammar::Util; import lang::rascal::grammar::analyze::Delimiters; import lang::rascal::grammar::analyze::Dependencies; import lang::rascal::grammar::analyze::Newlines; +import lang::rascal::grammar::analyze::Symbols; import lang::textmate::ConversionConstants; import lang::textmate::ConversionUnit; import lang::textmate::Grammar; @@ -236,6 +237,8 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) { terminals = [s | s <- terminals, [] != s.symbols]; terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly terminals = dup(terminals); + terminals = sortByMinimumLength(terminals); // Small symbols first + terminals = reverse(terminals); // Large symbols first terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback) TmRule r = toTmRule( From 5603cc861ba6c12a532f91e63773a9e0580c725a Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Fri, 
6 Sep 2024 11:49:11 +0200 Subject: [PATCH 13/16] Move `removeBeginEnd` to a separate private function to improve readability --- .../main/rascal/lang/textmate/Conversion.rsc | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc index 03ea76f..f884c2a 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc @@ -219,21 +219,10 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) { // Compute a list of segments that need to be consumed between // the `begin` delimiter and the `end` delimiters. Each of these // segments will be converted to a match pattern. - list[Segment] segments = [*getSegments(rsc, u.prod) | u <- group]; - - Segment removeBeginEnd(Segment seg) { - list[Symbol] symbols = seg.symbols; - if (seg.initial, _ <- symbols, symbols[0] == begin) { - symbols = symbols[1..]; - } - if (seg.final, _ <- symbols, symbols[-1] in ends) { - symbols = symbols[..-1]; - } - - return seg[symbols = symbols]; - } - - list[Symbol] terminals = [\seq(removeBeginEnd(seg).symbols) | seg <- segments]; + set[Segment] segs = {*getSegments(rsc, u.prod) | u <- group}; + segs = {removeBeginEnd(seg, begins, ends) | seg <- segs}; + + list[Symbol] terminals = [\seq(seg.symbols) | seg <- segs]; terminals = [s | s <- terminals, [] != s.symbols]; terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly terminals = dup(terminals); @@ -305,6 +294,18 @@ private list[ConversionUnit] addOuterRules(list[ConversionUnit] units) { // precision than a unit-driven approach; I suspect it might. 
} +private Segment removeBeginEnd(Segment seg, set[Symbol] begins, set[Symbol] ends) { + list[Symbol] symbols = seg.symbols; + if (seg.initial, _ <- symbols, symbols[0] in begins) { + symbols = symbols[1..]; + } + if (seg.final, _ <- symbols, symbols[-1] in ends) { + symbols = symbols[..-1]; + } + + return seg[symbols = symbols]; +} + // TODO: This function could be moved to a separate, generic module private list[&T] dupLast(list[&T] l) = reverse(dup(reverse(l))); // TODO: Optimize/avoid `reverse`-ing? From dafaa08014c7626a0b0a6ff5604b9e8591c86453 Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Fri, 6 Sep 2024 11:50:48 +0200 Subject: [PATCH 14/16] Fix small issue in the escaping rules for strings in `PicoWithCategories` --- .../textmate/conversiontests/PicoWithCategories.rsc | 4 ++-- .../textmate/conversiontests/PicoWithCategories.test | 12 +++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc index 9a81068..6737a91 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc @@ -47,8 +47,8 @@ lexical Natural = [0-9]+ !>> [0-9]; lexical String = "\"" Char* "\""; lexical Char - = ![\"] - | "\\" [\"]; + = ![\\\"] + | "\\" [\\\"]; keyword Keyword = "begin" diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test index df5915e..8b33821 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test @@ -64,9 +64,19 @@ bar" # ^^^^ string.quoted.double + 
"foo\" +# ^^^^^^ string.quoted.double + bar" +# ^^^^ string.quoted.double + "foo # ^^^^ string.quoted.double \" # ^^ string.quoted.double bar" -# ^^^^ string.quoted.double \ No newline at end of file +# ^^^^ string.quoted.double + + "foo +# ^^^^ string.quoted.double + \"bar" +# ^^^^^^ string.quoted.double \ No newline at end of file From ed8b6b2952b23d7ad7f07636f8fe71b16d61a7e3 Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Fri, 6 Sep 2024 12:25:40 +0200 Subject: [PATCH 15/16] Update generated TextMate grammar for Rascal/Pico --- .../syntaxes/pico.tmLanguage.json | 6 ++--- .../syntaxes/rascal.tmLanguage.json | 26 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/vscode-extension/syntaxes/pico.tmLanguage.json b/vscode-extension/syntaxes/pico.tmLanguage.json index a489f46..527e06e 100644 --- a/vscode-extension/syntaxes/pico.tmLanguage.json +++ b/vscode-extension/syntaxes/pico.tmLanguage.json @@ -14,7 +14,7 @@ "end": "(\\\")", "patterns": [ { - "match": "((?:\\\\)(?:\\\"))", + "match": "((?:\\\\)[\\\"\\\\])", "captures": { "1": { "name": "string.quoted.double" @@ -22,7 +22,7 @@ } }, { - "match": "([\\x{01}-\\!\\#-\\x{10FFFF}])", + "match": "([\\x{01}-\\!\\#-\\[\\]-\\x{10FFFF}])", "captures": { "1": { "name": "string.quoted.double" @@ -197,7 +197,7 @@ } }, "/inner/single/expression.strcon": { - "match": "((?<=(?:[\\t-\\n\\r\\x{20}\\%]|(?:(?:^))))(?:(?:\\\")(?:(?:(?:\\\\)(?:\\\"))|[\\x{01}-\\!\\#-\\x{10FFFF}])*?(?:\\\")))", + "match": "((?<=(?:[\\t-\\n\\r\\x{20}\\%]|(?:(?:^))))(?:(?:\\\")(?:(?:(?:\\\\)[\\\"\\\\])|[\\x{01}-\\!\\#-\\[\\]-\\x{10FFFF}])*?(?:\\\")))", "name": "/inner/single/expression.strcon", "captures": { "1": { diff --git a/vscode-extension/syntaxes/rascal.tmLanguage.json b/vscode-extension/syntaxes/rascal.tmLanguage.json index 4095f9d..c462663 100644 --- a/vscode-extension/syntaxes/rascal.tmLanguage.json +++ b/vscode-extension/syntaxes/rascal.tmLanguage.json @@ -229,7 +229,7 @@ } }, { - "match": 
"([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])", + "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])", "captures": { "1": { "name": "string.quoted.double" @@ -237,7 +237,7 @@ } }, { - "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])", + "match": "((?:\\\\)a[0-7][0-9A-Fa-f])", "captures": { "1": { "name": "string.quoted.double" @@ -253,7 +253,7 @@ } }, { - "match": "((?:\\\\)a[0-7][0-9A-Fa-f])", + "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])", "captures": { "1": { "name": "string.quoted.double" @@ -261,7 +261,7 @@ } }, { - "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])", + "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])", "captures": { "1": { "name": "string.quoted.double" @@ -302,7 +302,7 @@ } }, { - "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])", + "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])", "captures": { "1": { "name": "string.quoted.double" @@ -310,7 +310,7 @@ } }, { - "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])", + "match": "((?:\\\\)a[0-7][0-9A-Fa-f])", "captures": { "1": { "name": "string.quoted.double" @@ -318,7 +318,7 @@ } }, { - "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))", + "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])", "captures": { "1": { "name": "string.quoted.double" @@ -326,7 +326,7 @@ } }, { - "match": "((?:\\\\)a[0-7][0-9A-Fa-f])", + "match": "((?:\\n)[\\t\\x{20}\\x{A0}\\x{1680}\\x{2000}-\\x{200A}\\x{202F}\\x{205F}\\x{3000}]*?(?:\\'))", "captures": { "1": { "name": "string.quoted.double" @@ -334,7 +334,7 @@ } }, { - "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])", + "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])", "captures": { "1": { "name": "string.quoted.double" @@ -557,7 +557,7 @@ } }, { - "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])", + "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])", "captures": { "1": { 
"name": "string.quoted.single" @@ -565,7 +565,7 @@ } }, { - "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])", + "match": "((?:\\\\)a[0-7][0-9A-Fa-f])", "captures": { "1": { "name": "string.quoted.single" @@ -581,7 +581,7 @@ } }, { - "match": "((?:\\\\)a[0-7][0-9A-Fa-f])", + "match": "((?:\\\\)[\\\"\\'\\<\\>\\\\bfnrt])", "captures": { "1": { "name": "string.quoted.single" @@ -589,7 +589,7 @@ } }, { - "match": "((?:\\\\)u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])", + "match": "([\\x{01}-\\!\\#-\\&\\(-\\;\\=\\?-\\[\\]-\\x{10FFFF}])", "captures": { "1": { "name": "string.quoted.single" From 27c26ef2e69fdd834cd1de85d49a99e642a64868 Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Fri, 6 Sep 2024 15:01:24 +0200 Subject: [PATCH 16/16] Improve documentation --- rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc b/rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc index bd34f33..d5244e9 100644 --- a/rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc +++ b/rascal-textmate-core/src/main/rascal/util/MaybeUtil.rsc @@ -17,8 +17,8 @@ set[&T] unmaybe(Maybe[set[&T]] _: just(set[&T] \set)) = \set; @synopsis{ - Returns just the union of the sets of two `Maybe` values when present. - Returns nothing if absent. + Returns just the union of the sets of two `Maybe` values when both are + present. Returns nothing if at least one is absent. } Maybe[set[&T]] union(just(set[&T] set1), just(set[&T] set2))