Skip to content

Commit

Permalink
Merge pull request #15 from SWAT-engineering/identify-newline-separat…
Browse files Browse the repository at this point in the history
…ed-segments

Identify newline separated segments
  • Loading branch information
sungshik authored Sep 6, 2024
2 parents c7ed45e + 27c26ef commit 17df6e6
Show file tree
Hide file tree
Showing 9 changed files with 417 additions and 46 deletions.
20 changes: 8 additions & 12 deletions rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,6 @@ bool tryParse(Grammar g, Symbol s, str input, bool allowAmbiguity = false) {
return false;
}

@synopsis{
Gets the terminals that occur in production `p`, possibly recursively
(default: `true`)
}

set[Symbol] getTerminals(Grammar g, Production p, bool recur = true)
= {s | s <- p.symbols, !isNonTerminalType(s)}
+ {*getTerminals(g, child) | recur, s <- p.symbols, child <- lookup(g, s)};

@synopsis{
Looks up a list of productions for symbol `s` in grammar `g`, replacing
formal parameters with actual parameters when needed
Expand Down Expand Up @@ -84,21 +75,26 @@ Symbol expand(\iter-star-seps(symbol, separators))
Removes the label from symbol `s`, if any
}
Symbol delabel(label(_, Symbol s)) = s;
default Symbol delabel(Symbol s) = s;
Symbol delabel(\label(_, Symbol s)) = delabel(s);
default Symbol delabel(Symbol s) = s;
@synopsis{
Removes operators `?` and `*` from symbol `s`, if any
}
Symbol destar(label(name, symbol))
Symbol destar(\label(name, symbol))
= label(name, destar(symbol));
Symbol destar(\opt(symbol))
= destar(symbol);
Symbol destar(\iter-star(symbol))
= \iter(destar(symbol));
Symbol destar(\iter-star-seps(symbol, separators))
= \iter-seps(destar(symbol), separators);
Symbol destar(\seq([symbol]))
= \seq([destar(symbol)]);
Symbol destar(\alt({symbol}))
= \alt({destar(symbol)});
default Symbol destar(Symbol s) = s;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,153 @@ module lang::rascal::grammar::analyze::Newlines
import Grammar;
import ParseTree;
import String;
import util::Maybe;

import lang::rascal::grammar::Util;
import util::MaybeUtil;

@synopsis{
Representation of a *newline-free* segment of symbols. A segment is
*initial* when it occurs first in a production/list of symbols; it is
*final* when it occurs last.
}

// A newline-free run of symbols. `initial`/`final` record whether the run
// sits at the very start/end of the production (or list of symbols) it was
// taken from; both flags are `false` unless set explicitly.
data Segment
    = segment(list[Symbol] symbols, bool initial = false, bool final = false);

@synopsis{
Gets the (newline-free) segments of a production/list of symbols in grammar
`g`, separated by symbols that have a newline (not part of any segment),
recursively for non-terminals. For instance, the segments of
`[lit("foo"), lit("bar"), lit("\n"), lit("baz")]` are:
- `[lit("foo"), lit("bar")]`;
- `[lit("baz")]`.
}

// Variant for a single production: reads the segments of `p` from the
// (memoized) per-production table of grammar `g`.
set[Segment] getSegments(Grammar g, Production p) {
    map[Production, Maybe[set[Segment]]] byProduction = getSegmentsByProduction(g);
    return unmaybe(byProduction[p]);
}

// Variant for an arbitrary list of symbols: computes the segments of
// `symbols`, using the per-production table of grammar `g` as environment.
set[Segment] getSegments(Grammar g, list[Symbol] symbols)
    = unmaybe(getSegmentsWithEnvironment(g, symbols, getSegmentsByProduction(g)));

@memo
private map[Production, Maybe[set[Segment]]] getSegmentsByProduction(Grammar g) {
    // Every production in `g` starts out as "not computed yet"
    map[Production, Maybe[set[Segment]]] segmentsOf = (p : nothing() | /p: prod(_, _, _) := g);

    // Keep retrying the productions that are still unknown until a full pass
    // no longer changes the table
    solve (segmentsOf) {
        for (Production p <- segmentsOf) {
            if (segmentsOf[p] == nothing()) {
                segmentsOf[p] = getSegmentsWithEnvironment(g, p.symbols, segmentsOf);
            }
        }
    }

    return segmentsOf;
}
// Computes the newline-free segments of `symbols` in grammar `g`. Segments of
// non-terminals are not computed here; they are read from `env`, which maps
// each production to its segments, or to `nothing()` when no segments are
// available for it (yet). In that case, `nothing()` is added to the sets that
// are combined with `union`.
// NOTE(review): `union` comes from `util::MaybeUtil` (not visible here);
// presumably it yields `nothing()` as soon as one argument is `nothing()` --
// confirm.
private Maybe[set[Segment]] getSegmentsWithEnvironment(
Grammar g, list[Symbol] symbols,
map[Production, Maybe[set[Segment]]] env) {
// General idea: Recursively traverse `symbols` from left to right, while
// keeping track of a "running segment" (initially empty). Each time a
// symbol that has a newline is encountered, finish/collect the running
// segment, and start a new one for the remainder of `symbols`.
// Base case: No symbols remaining
Maybe[set[Segment]] get(Segment running, [], bool final = true) {
// A running segment without symbols yields no segment at all; otherwise,
// the running segment is returned with its `final` flag set as requested
return just(_ <- running.symbols ? {running[final = final]} : {});
}
// Recursive case: At least one symbol remaining
Maybe[set[Segment]] get(Segment running, [Symbol head, *Symbol tail]) {
// The symbols found by a deep match on `head`
set[Symbol] nested = {s | /Symbol s := head};
// The running segment as it would be finished right before `head`; it is
// final only if nothing comes after `head`. Used in the first two branches.
Maybe[set[Segment]] finished = get(running, [], final = tail == []);
// If the head contains a non-terminal, then: (1) finish the running
// segment; (2) lookup the segments of the non-terminals in the
// environment, if any; (3) compute the segments of the tail. Return the
// union of 1-3.
if (any(s <- nested, isNonTerminalType(s))) {
list[Maybe[set[Segment]]] sets = [];
// (1)
sets += finished;
// (2)
sets += for (s <- nested, isNonTerminalType(s), p <- lookup(g, s)) {
// A segment of `p` stays initial only if it is initial in `p` and the
// running segment is initial and still empty
bool isInitial(Segment seg)
= seg.initial && running.initial && running.symbols == [];
// A segment of `p` stays final only if it is final in `p` and no symbols
// remain after `head`
bool isFinal(Segment seg)
= seg.final && tail == [];
Segment update(Segment seg)
= seg[initial = isInitial(seg)][final = isFinal(seg)];
// No segments in the environment for `p`: contribute `nothing()`
append just(segs) := env[p] ? just({update(seg) | seg <- segs}) : nothing();
}
// (3)
sets += get(segment([]), tail);
// Return union
// (`sets` is never empty here: `finished` was added as its first element)
return (sets[0] | union(it, \set) | \set <- sets[1..]);
}
// If the head doesn't contain a non-terminal, but it has a newline,
// then: (1) finish the running segment; (2) compute the segments of the
// tail. Return the union of 1-2. Note: the head, as it has a newline,
// is ignored and won't be part of any segment.
else if (any(s <- nested, hasNewline(g, s))) {
return union(finished, get(segment([]), tail));
}
// If the head doesn't contain a non-terminal, and if it doesn't have a
// newline, then add the head to the running segment and proceed with
// the tail.
else {
Segment old = running;
Segment new = old[symbols = old.symbols + head];
return get(new, tail);
}
}
// Start with an empty running segment that is marked as initial
return get(segment([], initial = true), symbols);
}
@synopsis{
Checks if a symbol has a newline character
}
// True when at least one production that `lookup` finds for the (delabeled)
// symbol `s` in grammar `g` has a newline.
bool hasNewline(Grammar g, Symbol s) {
    Symbol unlabeled = delabel(s);
    for (Production candidate <- lookup(g, unlabeled)) {
        if (hasNewline(g, candidate)) {
            return true;
        }
    }
    return false;
}
@synopsis{
Checks if a production has a newline character
}
bool hasNewline(Grammar g, prod(_, symbols, _)) {
set[Symbol] nonTerminals = {s | /Symbol s := symbols, isNonTerminalType(s)};
return any(/r: range(_, _) := symbols, hasNewline(r)) ||
any(s <- nonTerminals, Production p <- lookup(g, s), hasNewline(g, p));
// Reads the answer for `p` from the (memoized) per-production table of `g`
bool hasNewline(Grammar g, Production p)
    = hasNewlineByProduction(g)[p];
@memo
private map[Production, bool] hasNewlineByProduction(Grammar g) {
    // Initially, no production in `g` is known to have a newline
    map[Production, bool] found = (p: false | /p: prod(_, _, _) := g);

    // Repeat until a full pass no longer changes the table
    solve (found) {
        for (Production p <- found) {
            // Entries only ever go from `false` to `true`, so productions that
            // are already `true` need no further work
            if (!found[p]) {
                set[Symbol] nonTerminals = {s | /Symbol s := p.symbols, isNonTerminalType(s)};
                // A production has a newline if one of its own character ranges
                // has one, or if a production of one of its non-terminals does
                found[p] = any(/r: range(_, _) := p.symbols, hasNewline(r))
                    || any(s <- nonTerminals, Production child <- lookup(g, s), found[child]);
            }
        }
    }

    return found;
}
@synopsis{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@ module lang::rascal::grammar::analyze::Symbols

import Grammar;
import ParseTree;
import String;
import util::Math;
import util::Maybe;

import lang::rascal::grammar::Util;
import util::MaybeUtil;

@synopsis{
Representation of a traversal direction along a list of symbols
Expand Down Expand Up @@ -55,9 +58,9 @@ private map[Symbol, Maybe[set[Symbol]]] firstBySymbol(Grammar g, bool(Symbol) pr
Maybe[set[Symbol]] firstOf([])
= just({});
Maybe[set[Symbol]] firstOf([h, *t])
Maybe[set[Symbol]] firstOf([Symbol h, *Symbol t])
= \set: just({\empty(), *_}) := ret[delabel(h)]
? union(\set, firstOf(t))
? util::MaybeUtil::union(\set, firstOf(t))
: ret[delabel(h)];
solve (ret) {
Expand Down Expand Up @@ -112,19 +115,61 @@ private map[Symbol, Maybe[set[Symbol]]] followBySymbol(Grammar g, bool(Symbol) p
return ret;
}
private set[Symbol] unmaybe(just(set[Symbol] \set))
= \set;
private set[Symbol] unmaybe(nothing())
= {};
private Maybe[set[Symbol]] union(just(set[Symbol] \set1), just(set[Symbol] \set2))
= just(\set1 + \set2);
private default Maybe[set[Symbol]] union(Maybe[set[Symbol]] _, Maybe[set[Symbol]] _)
= nothing();
@synopsis{
Checks if symbol `s` is a terminal
}
bool isTerminal(Symbol s)
= !isNonTerminalType(s);
= !isNonTerminalType(s);
@synopsis{
Sorts list of terminals `symbols` by minimum length (in ascending order)
}
list[Symbol] sortByMinimumLength(list[Symbol] symbols) {
// Strict "less than" on the `min` component of each symbol's length range
bool less(Symbol s1, Symbol s2) = length(s1).min < length(s2).min;
return sort(symbols, less);
}
@synopsis{
Representation of the minimum length and the maximum length of the text
produced by a symbol. If `max` is `nothing()`, then the text produced is
statically unbounded.
}
alias Range = tuple[int min, Maybe[int] max];

// Range of text that is always empty
private Range ZERO = <0, just(0)>;

// Range of `r1` followed by `r2`: minimums and maximums are added
private Range seq(Range r1, Range r2) {
    return <r1.min + r2.min, add(r1.max, r2.max)>;
}

// Range of a choice between `r1` and `r2`: smallest minimum, largest maximum
private Range alt(Range r1, Range r2) {
    return <min(r1.min, r2.min), max(r1.max, r2.max)>;
}

// Sum of two maximums; `nothing()` unless both are `just`
private Maybe[int] add(just(int lhs), just(int rhs)) = just(lhs + rhs);
private default Maybe[int] add(Maybe[int] _, Maybe[int] _) = nothing();

// Largest of two maximums; `nothing()` unless both are `just`
private Maybe[int] max(just(int lhs), just(int rhs)) = just(max(lhs, rhs));
private default Maybe[int] max(Maybe[int] _, Maybe[int] _) = nothing();
@synopsis{
Computes the length of a terminal symbol as a range
}
// A label produces no text of its own: a labeled symbol is as long as the
// symbol it labels. Without this clause, a labeled terminal (e.g., taken
// directly from the symbols of a production) has no matching alternative.
Range length(\label(_, symbol)) = length(symbol);

// Literals and single characters have a fixed length
Range length(\lit(string)) = <size(string), just(size(string))>;
Range length(\cilit(string)) = <size(string), just(size(string))>;
Range length(\char-class(_)) = <1, just(1)>;
Range length(\empty()) = ZERO;

// Optional: like the symbol itself, but possibly empty
Range length(\opt(symbol)) = length(symbol)[min = 0];

// One-or-more: at least one occurrence; the maximum is set to `nothing()`
Range length(\iter(symbol)) = length(symbol)[max = issue2007];
// Zero-or-more: possibly empty; the maximum is kept only if it is `just(0)`
Range length(\iter-star(symbol)) = <0, max: just(0) := length(symbol).max ? max : nothing()>;
Range length(\iter-seps(symbol, _)) = length(symbol)[max = issue2007];
Range length(\iter-star-seps(symbol, _)) = <0, max: just(0) := length(symbol).max ? max : nothing()>;

// Alternatives: combine the ranges of all alternatives with `alt`; `ZERO` if
// there are none
Range length(\alt(alternatives)) = {Symbol first, *Symbol rest} := alternatives
? (length(first) | alt(it, length(s)) | s <- rest)
: ZERO;
// Sequence: combine the ranges of all symbols with `seq`, starting from `ZERO`
Range length(\seq(symbols)) = (ZERO | seq(it, length(s)) | s <- symbols);
// Conditions are ignored: only the length of the underlying symbol counts
Range length(\conditional(symbol, _)) = length(symbol);
// TODO: Remove this workaround when issue #2007 is fixed:
// - https://github.com/usethesource/rascal/issues/2007
private Maybe[int] issue2007 = nothing();
26 changes: 22 additions & 4 deletions rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import lang::rascal::grammar::Util;
import lang::rascal::grammar::analyze::Delimiters;
import lang::rascal::grammar::analyze::Dependencies;
import lang::rascal::grammar::analyze::Newlines;
import lang::rascal::grammar::analyze::Symbols;
import lang::textmate::ConversionConstants;
import lang::textmate::ConversionUnit;
import lang::textmate::Grammar;
Expand Down Expand Up @@ -215,13 +216,18 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
// Simple case: each unit does have an `end` inner delimiter
if (_ <- group && all(u <- group, just(_) := u.innerDelimiters.end)) {
// Compute a list of terminals that need to be consumed between
// Compute a list of segments that need to be consumed between
// the `begin` delimiter and the `end` delimiters. Each of these
// terminals will be converted to a match pattern.
list[Symbol] terminals = [*getTerminals(rsc, u.prod) | u <- group];
terminals = [s | s <- terminals, s notin begins && s notin ends];
// segments will be converted to a match pattern.
set[Segment] segs = {*getSegments(rsc, u.prod) | u <- group};
segs = {removeBeginEnd(seg, begins, ends) | seg <- segs};
list[Symbol] terminals = [\seq(seg.symbols) | seg <- segs];
terminals = [s | s <- terminals, [] != s.symbols];
terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
terminals = dup(terminals);
terminals = sortByMinimumLength(terminals); // Small symbols first
terminals = reverse(terminals); // Large symbols first
terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)
TmRule r = toTmRule(
Expand Down Expand Up @@ -288,6 +294,18 @@ private list[ConversionUnit] addOuterRules(list[ConversionUnit] units) {
// precision than a unit-driven approach; I suspect it might.
}
// Strips a leading `begins` delimiter from an initial segment, and a trailing
// `ends` delimiter from a final segment. All other segments are returned with
// their symbols unchanged.
private Segment removeBeginEnd(Segment seg, set[Symbol] begins, set[Symbol] ends) {
    list[Symbol] remaining = seg.symbols;

    // Only the first symbol of an initial segment can be a begin delimiter
    if (seg.initial && remaining != [] && remaining[0] in begins) {
        remaining = remaining[1..];
    }

    // Only the last symbol of a final segment can be an end delimiter
    if (seg.final && remaining != [] && remaining[-1] in ends) {
        remaining = remaining[..-1];
    }

    return seg[symbols = remaining];
}
// TODO: This function could be moved to a separate, generic module
// Removes duplicates from `l`, keeping the *last* occurrence of each element
// (whereas `dup` keeps the first one).
private list[&T] dupLast(list[&T] l) {
    // TODO: Optimize/avoid `reverse`-ing?
    list[&T] backwards = reverse(l);
    list[&T] deduplicated = dup(backwards);
    return reverse(deduplicated);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,11 @@ syntax Expression

lexical Id = ([a-z][a-z0-9]*) !>> [a-z0-9] \ Keyword;
lexical Natural = [0-9]+ !>> [0-9];
lexical String = "\"" ![\"]* "\"";
lexical String = "\"" Char* "\"";

lexical Char
= ![\\\"]
| "\\" [\\\"];

keyword Keyword
= "begin"
Expand All @@ -70,7 +74,7 @@ lexical WhitespaceAndComment
Grammar rsc = preprocess(grammar(#Program));

list[ConversionUnit] units = [
unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":=")})],{}), false, <nothing(),nothing()>, <nothing(),nothing()>),
unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":="),lit("\\")})],{}), false, <nothing(),nothing()>, <nothing(),nothing()>),
unit(rsc, prod(label("natural",sort("Type")),[lit("natural")],{\tag("category"("storage.type"))}), false, <just(lit(":")),just(lit(";"))>, <nothing(),nothing()>),
unit(rsc, prod(label("nil",sort("Type")),[lit("nil-type")],{\tag("category"("storage.type"))}), false, <just(lit(":")),just(lit(";"))>, <just(lit("nil-type")),just(lit("nil-type"))>),
unit(rsc, prod(label("string",sort("Type")),[lit("string")],{\tag("category"("storage.type"))}), false, <just(lit(":")),just(lit(";"))>, <nothing(),nothing()>),
Expand Down
Loading

0 comments on commit 17df6e6

Please sign in to comment.