WordPress · adamziel · Aug 17, 2024 · Aug 17, 2024 · Aug 17, 2024 · Aug 18, 2024
diff --git a/custom-parser/grammar-factoring/1-ebnf-to-json.js b/custom-parser/grammar-factoring/1-ebnf-to-json.js
@@ -0,0 +1,9 @@
+import { Grammars, Parser } from 'ebnf';
+import fs from 'fs';
+
+const filePath = process.argv[2] || 'MySQLFull.ebnf';
+let grammar = fs.readFileSync(filePath, 'utf8');
+grammar = grammar.replaceAll(/\/\*[\s\S]*?\*\/$/gm, ''); // remove comments (the "ebnf" package fails on some)
+grammar = grammar.replaceAll('%', 'fragment__F')
+let RULES = Grammars.W3C.getRules(grammar);
+console.log(JSON.stringify(RULES, null, 2).replaceAll('fragment__F', '%'));
diff --git a/custom-parser/grammar-factoring/2-cli.py b/custom-parser/grammar-factoring/2-cli.py
@@ -0,0 +1,85 @@
+import json
+import sys
+import argparse
+from ebnfutils import eliminate_left_recursion, encode_as_ebnf, factor_common_prefixes, expand_grammar
+
+class CustomArgumentParser(argparse.ArgumentParser):
+    def error(self, message):
+        self.print_help(sys.stderr)
+        self.exit(2, f"{self.prog}: error: {message}\n")
+
+parser = CustomArgumentParser(description="Processes the parser grammar.")
+
+# Add the mode positional argument
+parser.add_argument(
+    'mode',
+    type=str,
+    choices=['lr', 'expand', 'cp', 'all'],
+    help=(
+        'Specify the mode. Options are:\n'
+        "* 'lr' for left recursion elimination\n"
+        "* 'cp' for factoring common prefixes\n"
+        "* 'all' for both\n"
+    )
+)
+
+# Add the filename positional argument
+parser.add_argument(
+    'filename',
+    type=str,
+    help='Specify the filename.'
+)
+
+# Add the format argument (optional flag)
+parser.add_argument(
+    '--format',
+    type=str,
+    choices=['json', 'ebnf'],
+    default='json',
+    required=False,
+    help='Specify the output format. Options are: json, ebnf.'
+)
+
+# Parse the arguments
+args = parser.parse_args()
+
+# Print the parsed values
+# print(f"Selected format: {args.format}")
+# print(f"Selected mode: {args.mode}")
+# print(f"Filename: {args.filename}")
+
+if args.filename is None or args.mode not in ["expand", "lr", "cp", "all"]:
+    print("Usage: python ebnf-to-right-recursive.py <mode> <filename> [--format json|ebnf]")
+    print("Mode can be one of:")
+    print("* 'expand' for expansion of * ? + symbols")
+    print("* 'lr' for left recursion elimination")
+    print("* 'cp' for factoring common prefixes")
+    print("* 'all' for both")
+    print("")
+    print("Filename is the path to the JSON file containing the parsed EBNF grammar")
+    print("")
+    sys.exit(1)
+
+try:
+    with open(args.filename) as fp:
+        input_grammar = json.load(fp)
+except Exception as e:
+    print(e, file=sys.stderr)
+    print(f"Failed to load grammar from {args.filename}", file=sys.stderr)
+    sys.exit(1)
+
+updated_grammar = input_grammar
+if args.mode == "expand" or args.mode == "all":
+    grammar, new_rules = expand_grammar(updated_grammar)
+    updated_grammar = grammar
+
+if args.mode == "lr" or args.mode == "all":
+    updated_grammar = eliminate_left_recursion(updated_grammar)
+
+# if args.mode == "cp" or args.mode == "all":
+#     updated_grammar = factor_common_prefixes(updated_grammar, passes=1)
+
+if args.format == "json":
+    print(json.dumps(updated_grammar, indent=2))
+else:
+    print(encode_as_ebnf(updated_grammar))
diff --git a/custom-parser/grammar-factoring/3-phpize-grammar.php b/custom-parser/grammar-factoring/3-phpize-grammar.php
@@ -0,0 +1,86 @@
+<?php
+
+if($argc <= 1) { // the grammar JSON path is a required argument
+    printf("Usage: php %s <grammar.json>\n", $argv[0]);
+    exit(1);
+}
+
+/**
+ * Renders a value as compact PHP source code.
+ *
+ * Arrays use the short [] syntax without whitespace and are rendered
+ * recursively; every other value is delegated to var_export().
+ * @param mixed $var Value to render.
+ * @return string PHP expression for $var.
+ */
+function export_as_php_var($var) {
+    if(!is_array($var)) {
+        return var_export($var, true);
+    }
+    // Keys are written out unless they are exactly 0, 1, ..., count - 1.
+    $is_list = json_encode(array_keys($var)) === json_encode(range(0, count($var) - 1));
+    $items = [];
+    foreach($var as $key => $value) {
+        $items[] = ($is_list ? "" : var_export($key, true) . "=>") . export_as_php_var($value);
+    }
+    return "[" . implode(",", $items) . "]";
+}
+
+// Load the grammar: a decoded JSON list of rules, each with a "name" and a "bnf" list of branches.
+$grammar = json_decode((string) file_get_contents($argv[1]), true);
+if(!is_array($grammar)) { // report on STDERR: STDOUT carries the generated PHP
+    fwrite(STDERR, "Failed to load grammar from {$argv[1]}\n");
+    exit(1);
+}
+require_once __DIR__ . '/../parser/MySQLLexer.php';
+
+// Lookup tables
+$rules_offset = 2000;
+$rules_ids = [];
+$rule_id_by_name = [];
+$rule_index_by_name = [];
+foreach ($grammar as $rule) {
+    $rules_ids[] = $rule["name"];
+    $rule_index_by_name[$rule["name"]] = (count($rules_ids) - 1);
+    $rule_id_by_name[$rule["name"]] = $rule_index_by_name[$rule["name"]] + $rules_offset;
+}
+
+// Convert rules ids and token ids to integers
+$compressed_grammar = [];
+foreach($grammar as $rule) {
+    $new_branches = [];
+    foreach($rule["bnf"] as $branch) {
+        $new_branch = [];
+        foreach($branch as $name) {
+            if(isset($rule_id_by_name[$name])) {
+                // Use rule id to avoid conflicts with token ids
+                $new_branch[] = $rule_id_by_name[$name];
+            } else {
+                // Not a rule name, so resolve it as a lexer token
+                $new_branch[] = MySQLLexer::getTokenId($name);
+            }
+        }
+        $new_branches[] = $new_branch;
+    }
+    // Use rule index
+    $compressed_grammar[$rule_index_by_name[$rule["name"]]] = $new_branches;
+}
+
+// Compress the fragment rules names – they take a lot of disk space and are
+// inlined in the final parse tree anyway.
+$last_fragment = 1;
+foreach($rules_ids as $id => $name) {
+    $is_fragment = str_starts_with($name, '%') || str_ends_with($name, '_zero_or_one')
+        || str_ends_with($name, '_zero_or_more') || str_ends_with($name, '_one_or_more');
+    if($is_fragment) {
+        $rules_ids[$id] = '%f' . $last_fragment;
+        ++$last_fragment;
+    }
+}
+
+// Emit a PHP file that returns the compressed grammar.
+echo "<?php\nreturn " . export_as_php_var([
+    "rules_offset" => $rules_offset,
+    "rules_names" => $rules_ids,
+    "grammar" => $compressed_grammar
+]) . ";";