Skip to content

Commit

Permalink
Merge pull request #17 from SWAT-engineering/recursive-multiline-high…
Browse files Browse the repository at this point in the history
…lighting2

Recursive multiline highlighting
  • Loading branch information
sungshik authored Sep 9, 2024
2 parents 17df6e6 + e8a887c commit 8557e38
Show file tree
Hide file tree
Showing 19 changed files with 724 additions and 302 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,22 @@ bool tryParse(Grammar g, Symbol s, str input, bool allowAmbiguity = false) {
return false;
}

@synopsis{
Checks if symbol `s` is recursive in grammar `g`
}

bool isRecursive(Grammar g, Symbol s) {

    // Collects every symbol that occurs, at any depth, inside the right-hand
    // sides of the productions of `sym` (deep match over `p.symbols`)
    set[Symbol] getChildren(Symbol sym)
        = {child | p <- lookup(g, sym), /Symbol child := p.symbols};

    // Depth-first search; `seen` holds the symbols on the current path, so
    // reaching one of them again means a cycle (i.e., recursion) exists
    bool check(set[Symbol] seen, Symbol sym) {
        if (sym in seen) {
            return true;
        }
        return any(child <- getChildren(sym), check(seen + sym, child));
    }

    return check({}, s);
}
@synopsis{
Looks up a list of productions for symbol `s` in grammar `g`, replacing
formal parameters with actual parameters when needed
Expand Down Expand Up @@ -96,10 +112,20 @@ Symbol destar(\seq([symbol]))
Symbol destar(\alt({symbol}))
= \alt({destar(symbol)});
Symbol destar(\conditional(symbol, conditions))
= \conditional(destar(symbol), conditions);
default Symbol destar(Symbol s) = s;
@synopsis{
Retain from set `symbols` each symbol that is a strict prefix of any other
Removes the conditional from symbol `s`, if any
}
// Recursively strips `conditional` wrappers, so nested conditionals are
// removed as well
Symbol decond(\conditional(Symbol s, _)) = decond(s);
// Non-conditional symbols are returned unchanged
default Symbol decond(Symbol s) = s;
@synopsis{
Retains from set `symbols` each symbol that is a strict prefix of any other
symbol in `symbols`
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,28 @@ data Direction // Traverse lists of symbols (in productions)...
// Returns `l` unchanged for a forward traversal...
list[&T] reorder(list[&T] l, forward()) = l;
// ...and reversed for a backward traversal
list[&T] reorder(list[&T] l, backward()) = reverse(l);

@synopsis{
Gets the unique leftmost delimiter (`begin`) and the unique rightmost
delimiter (`end`), if any, that occur **inside** productions of symbol `s`
(when `s` is a non-terminal) or `s` itself (when `s` is a delimiter). If
`getOnlyFirst` is `true` (default: `false`), then only the first (resp.
last) symbol of the productions can be considered as leftmost (resp.
rightmost).
}

DelimiterPair getInnerDelimiterPair(Grammar g, Symbol s, bool getOnlyFirst = false) {
    // Strip any label first, so `s` matches the unlabeled keys/cases below
    s = delabel(s);
    if (isDelimiter(s)) {
        // A delimiter is trivially its own leftmost and rightmost delimiter
        return <just(s), just(s)>;
    } else if (isNonTerminalType(s)) {
        // Leftmost delimiter comes from a forward traversal of the productions
        // of `s`; rightmost from a backward traversal (cf. `reorder`)
        Maybe[Symbol] begin = getInnerDelimiterBySymbol(g, forward(), getOnlyFirst = getOnlyFirst)[s];
        Maybe[Symbol] end = getInnerDelimiterBySymbol(g, backward(), getOnlyFirst = getOnlyFirst)[s];
        return <begin, end>;
    } else {
        // Any other symbol (a terminal that isn't a delimiter) has neither
        return <nothing(), nothing()>;
    }
}

@synopsis{
Gets the unique leftmost delimiter (`begin`) and the unique rightmost
delimiter (`end`), if any, that occur **inside** production `p` in grammar
Expand Down Expand Up @@ -60,7 +82,7 @@ list[&T] reorder(list[&T] l, backward()) = reverse(l);
}

DelimiterPair getInnerDelimiterPair(Grammar g, Production p, bool getOnlyFirst = false) {
    // Fix: the diff scrape left both the old and the new version of the
    // `begin` assignment in place (duplicate statement); keep only one.
    // Leftmost delimiter comes from a forward traversal of `p.symbols`;
    // rightmost from a backward traversal.
    Maybe[Symbol] begin = getInnerDelimiterByProduction(g, forward(), getOnlyFirst = getOnlyFirst)[p];
    Maybe[Symbol] end = getInnerDelimiterByProduction(g, backward(), getOnlyFirst = getOnlyFirst)[p];
    return <begin, end>;
}
Expand All @@ -79,6 +101,7 @@ private map[Production, Maybe[Symbol]] getInnerDelimiterByProduction(Grammar g,
for (p <- ret, ret[p] == nothing()) {
for (s <- reorder(p.symbols, direction)) {
s = delabel(s);
s = decond(s);
if (isDelimiter(s)) {
ret[p] = just(s);
break;
Expand Down
150 changes: 122 additions & 28 deletions rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ RscGrammar preprocess(RscGrammar rsc) {
// Replace occurrences of singleton ranges with just the corresponding
// literal. This makes it easier to identify delimiters.
return visit (rsc) {
case s: \char-class([range(char, char)]) => d
case \char-class([range(char, char)]) => d
when d := \lit("<stringChar(char)>"), isDelimiter(d)
}
}
Expand Down Expand Up @@ -113,12 +113,10 @@ list[ConversionUnit] analyze(RscGrammar rsc) {
// Analyze dependencies among productions
println("[LOG] Analyzing dependencies among productions");
Dependencies dependencies = deps(toGraph(rsc));
list[Production] prods = dependencies
.removeProds(isCyclic, true) // `true` means "also remove ancestors"
.retainProds(isNonEmpty)
.retainProds(hasCategory)
.getProds();
Graph[Production] graph = toGraph(rsc);
list[Production] prods = deps(graph).retainProds(isNonEmpty).retainProds(hasCategory).getProds();
list[Production] prodsNonRecursive = prods & deps(graph).removeProds(isCyclic, true).getProds();
list[Production] prodsRecursive = prods - prodsNonRecursive;
// Analyze delimiters
println("[LOG] Analyzing delimiters");
Expand All @@ -134,13 +132,15 @@ list[ConversionUnit] analyze(RscGrammar rsc) {
list[Production] prodsKeywords = [prod(lex(KEYWORDS_PRODUCTION_NAME), [\alt(keywords)], {\tag("category"("keyword.control"))})];
// Return
bool isEmptyProd(prod(_, [\alt(alternatives)], _)) = alternatives == {};
list[ConversionUnit] units
= [unit(rsc, p, hasNewline(rsc, p), getOuterDelimiterPair(rsc, p), getInnerDelimiterPair(rsc, p, getOnlyFirst = true)) | p <- prods]
+ [unit(rsc, p, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsDelimiters, !isEmptyProd(p)]
+ [unit(rsc, p, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsKeywords, !isEmptyProd(p)];
return sort(units);
bool isRecursive(Production p)
= p in prodsRecursive;
bool isEmptyProd(prod(_, [\alt(alternatives)], _))
= alternatives == {};
set[ConversionUnit] units = {};
units += {unit(rsc, p, isRecursive(p), hasNewline(rsc, p), getOuterDelimiterPair(rsc, p), getInnerDelimiterPair(rsc, p, getOnlyFirst = true)) | p <- prods};
units += {unit(rsc, p, false, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsDelimiters + prodsKeywords, !isEmptyProd(p)};
return sort([*removeStrictPrefixes(units)]);
}
@synopsis{
Expand Down Expand Up @@ -196,7 +196,7 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
// Convert all units in the group to match patterns (including,
// optimistically, multi-line units as-if they are single-line)
for (u <- group) {
for (u <- group, !u.recursive) {
TmRule r = toTmRule(toRegExp(u.rsc, u.prod, guard = true))
[name = "/inner/single/<u.name>"];
Expand All @@ -216,32 +216,116 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
// Simple case: each unit does have an `end` inner delimiter
if (_ <- group && all(u <- group, just(_) := u.innerDelimiters.end)) {
// Compute a list of segments that need to be consumed between
// Compute a set of segments that need to be consumed between
// the `begin` delimiter and the `end` delimiters. Each of these
// segments will be converted to a match pattern.
set[Segment] segs = {*getSegments(rsc, u.prod) | u <- group};
segs = {removeBeginEnd(seg, begins, ends) | seg <- segs};
list[Symbol] terminals = [\seq(seg.symbols) | seg <- segs];
terminals = [s | s <- terminals, [] != s.symbols];
terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
terminals = dup(terminals);
terminals = sortByMinimumLength(terminals); // Small symbols first
terminals = reverse(terminals); // Large symbols first
terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)
TmRule r = toTmRule(
toRegExp(rsc, [begin], {t}),
toRegExp(rsc, [\alt(ends)], {t}),
[toTmRule(toRegExp(rsc, [s], {t})) | s <- terminals])
[toTmRule(toRegExp(rsc, [s], {t})) | s <- toTerminals(segs)])
[name = "/inner/multi/<intercalate(",", [u.name | u <- group])>"];
rules = insertIn(rules, (u: r | u <- group));
}
// Complex case: some unit doesn't have an `end` inner delimiter
// Complex case: some unit doesn't have an `end` inner delimiter.
// This requires (substantial) extra care, as there is no obvious
// marker to close the begin/end pattern with.
else {
; // TODO (part of future support for *recursive* multi-line units)
Decomposition decomposition = decompose([*group]);
// TODO: The following condition can be true (even though there
// has to be a `begin` delimiter) because `decompose` doesn't
// expand non-terminals. Consider if it should, to maybe improve
// accuracy.
if ([] == decomposition.prefix) {
continue;
}
RegExp reBegin = toRegExp(rsc, decomposition.prefix, {t});
RegExp reEnd = regExp("(?=.)", []);
patterns = for (suffix <- decomposition.suffixes) {
if (just(Symbol begin) := getInnerDelimiterPair(rsc, suffix[0], getOnlyFirst = true).begin) {
if (just(Symbol end) := getInnerDelimiterPair(rsc, suffix[-1], getOnlyFirst = true).end) {
// If the suffix has both a `begin` delimiter
// and an `end` delimiter, then generate a
// begin/end pattern to highlight these delimiters
// and all content in between.
set[Segment] segs = getSegments(rsc, suffix);
segs = {removeBeginEnd(seg, {begin}, {end}) | seg <- segs};
append toTmRule(
toRegExp(rsc, [begin], {t}),
toRegExp(rsc, [end], {t}),
[toTmRule(toRegExp(rsc, [s], {t})) | s <- toTerminals(segs)]);
}
else {
// If the suffix has a `begin` delimiter, but not
// an `end` delimiter, then generate a match pattern
// just to highlight that `begin` delimiter. Ignore
// the remainder of the suffix (because it's
// recursive, so no regular expression can be
// generated for it).
append toTmRule(toRegExp(rsc, [begin], {t}));
}
}
else {
// If the suffix doesn't have a `begin` delimiter, then
// ignore it (because it's recursive, so no regular
// expression can be generated for it).
;
}
}
TmRule r = toTmRule(reBegin, reEnd, patterns);
r = r[name = "/inner/multi/<intercalate(",", [u.name | u <- group])>"];
r = r[applyEndPatternLast = true];
rules = insertIn(rules, (u: r | u <- group));
// TODO: The current approach produces "partially"
// newline-sensitive rules, in the sense that newlines are
// accepted between the prefix and the suffixes, but not between
// symbols in the prefix. This approach could be improved to
// produce "totally" newline-sensitive rules (at the cost of
// much more complicated rule generation and generated rules) by
// adopting an approach in which the rules for each symbol in
// the prefix looks something like the following three:
//
// ```
// "foo": {
// "name": "foo",
// "begin": "(\\@)",
// "end": "(?!\\G)|(?:(?!$)(?![a-z]+))",
// "patterns": [{ "include": "#foo.$" }, { "match": "[a-z]+" }],
// "contentName": "comment",
// "beginCaptures": { "1": { "name": "comment" } }
// },
// "foo.$": {
// "begin": "$",
// "end": "(?<=^.+)|(?:(?!$)(?![a-z]+))",
// "name": "foo.$",
// "patterns": [ { "include": "#foo.^" }]
// },
// "foo.^": {
// "begin": "^",
// "end": "(?!\\G)|(?:(?!$)(?![a-z]+))",
// "name": "foo.^",
// "patterns": [{ "include": "#foo.$" }, { "match": "[a-z]+" }]
// }
// ```
//
// Note: This alternative approach would likely render the
// present distinction between the "simple case" and the
// "complex case" unneeded, so in that sense, rule generation
// would actually become simpler.
}
}
}
Expand Down Expand Up @@ -302,10 +386,20 @@ private Segment removeBeginEnd(Segment seg, set[Symbol] begins, set[Symbol] ends
if (seg.final, _ <- symbols, symbols[-1] in ends) {
symbols = symbols[..-1];
}
return seg[symbols = symbols];
}
private list[Symbol] toTerminals(set[Segment] segs) {
    // Wrap each non-empty segment in a single sequence symbol, stripping stars
    // because the tokenization engine always tries to apply rules repeatedly
    list[Symbol] terminals = [destar(\seq(seg.symbols)) | seg <- segs, [] != seg.symbols];
    // Deduplicate, then order large symbols first (minimum-length sort, reversed)
    terminals = reverse(sortByMinimumLength(dup(terminals)));
    // Append a catch-all character class as a fallback
    return terminals + \char-class([range(1,0x10FFFF)]);
}
// TODO: This function could be moved to a separate, generic module
// Removes duplicates from `l`, keeping the *last* occurrence of each element
// (`dup` keeps the first occurrence, hence the double reversal around it)
private list[&T] dupLast(list[&T] l)
    = reverse(dup(reverse(l))); // TODO: Optimize/avoid `reverse`-ing?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ bool doAnalyzeTest(RscGrammar rsc, list[ConversionUnit] expect, bool printActual
println();
for (i <- [0..size(actual)]) {
ConversionUnit u = actual[i];
println(" unit(rsc, <toStr(u.prod)>, <u.multiLine>, <u.outerDelimiters>, <u.innerDelimiters>)<i < size(actual) - 1 ? "," : "">");
println(" unit(rsc, <toStr(u.prod)>, <u.recursive>, <u.multiLine>, <u.outerDelimiters>, <u.innerDelimiters>)<i < size(actual) - 1 ? "," : "">");
}
println();
}
Expand Down
Loading

0 comments on commit 8557e38

Please sign in to comment.