Skip to content

Commit

Permalink
MySQL AST Parser
Browse files Browse the repository at this point in the history
  • Loading branch information
adamziel committed Aug 17, 2024
1 parent f4212a3 commit ccc341b
Show file tree
Hide file tree
Showing 22 changed files with 403,896 additions and 0 deletions.
8 changes: 8 additions & 0 deletions custom-parser/grammar-factoring/1-ebnf-to-json.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import { Grammars, Parser } from 'ebnf';
import fs from 'fs';

// Grammar source: first CLI argument, falling back to the bundled MySQL grammar file name.
const grammarPath = process.argv[2] || 'MySQLFull.ebnf';

// Read the EBNF text and let the `ebnf` package's W3C reader turn it into rule objects.
const ebnfSource = fs.readFileSync(grammarPath, 'utf8');
const parsedRules = Grammars.W3C.getRules(ebnfSource);

// Emit the rules as pretty-printed JSON on stdout so the next step can consume them.
const serializedRules = JSON.stringify(parsedRules, null, 2);
console.log(serializedRules);
80 changes: 80 additions & 0 deletions custom-parser/grammar-factoring/2-cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import json
import sys
import argparse
from ebnfutils import eliminate_left_recursion, encode_as_ebnf, factor_common_prefixes

class CustomArgumentParser(argparse.ArgumentParser):
    """ArgumentParser that shows the full help text whenever argument parsing fails."""

    def error(self, message):
        """Print the complete help to stderr, then exit with status 2.

        Args:
            message: Description of the parsing error, supplied by argparse.

        Raises:
            SystemExit: always, with exit status 2.
        """
        self.print_help(sys.stderr)
        self.exit(2, f"{self.prog}: error: {message}\n")

# Command-line entry point: load a JSON-encoded grammar, apply the requested
# transformation(s) and print the result to stdout as JSON or EBNF.
parser = CustomArgumentParser(
    description="Processes the parser grammar.",
    # Keep the explicit line breaks of the multi-line `mode` help below; the
    # default formatter would re-wrap them into a single paragraph.
    formatter_class=argparse.RawTextHelpFormatter,
)

# Positional: which transformation(s) to apply.
parser.add_argument(
    'mode',
    type=str,
    choices=['lr', 'cp', 'all'],
    help=(
        'Specify the mode. Options are:\n'
        "* 'lr' for left recursion elimination\n"
        "* 'cp' for factoring common prefixes\n"
        "* 'all' for both\n"
    )
)

# Positional: path to the JSON file containing the parsed EBNF grammar.
parser.add_argument(
    'filename',
    type=str,
    help='Specify the filename.'
)

# Optional flag: output encoding.
parser.add_argument(
    '--format',
    type=str,
    choices=['json', 'ebnf'],
    default='json',
    required=False,
    help='Specify the output format. Options are: json, ebnf.'
)

# argparse rejects a missing filename and any mode outside `choices` on its own
# (printing the help and exiting with status 2), so no manual re-validation of
# the arguments is needed after this call.
args = parser.parse_args()

try:
    with open(args.filename) as fp:
        input_grammar = json.load(fp)
except (OSError, ValueError) as e:
    # OSError: missing/unreadable file. ValueError: invalid JSON or undecodable bytes.
    print(e, file=sys.stderr)
    print(f"Failed to load grammar from {args.filename}", file=sys.stderr)
    sys.exit(1)

updated_grammar = input_grammar
if args.mode in ("lr", "all"):
    updated_grammar = eliminate_left_recursion(updated_grammar)

if args.mode in ("cp", "all"):
    # Common-prefix factoring is currently switched off (the call below is
    # intentionally left commented out). Report that on stderr rather than
    # silently returning an un-factored grammar; stdout stays clean for piping.
    # updated_grammar = factor_common_prefixes(updated_grammar, passes=1)
    print(
        "Warning: common prefix factoring is currently disabled; skipping.",
        file=sys.stderr,
    )

if args.format == "json":
    print(json.dumps(updated_grammar, indent=2))
else:
    print(encode_as_ebnf(updated_grammar))
71 changes: 71 additions & 0 deletions custom-parser/grammar-factoring/3-phpize-grammar.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
<?php

// The grammar JSON path must be passed as the first CLI argument.
if($argc < 2) {
    // Report on stderr: stdout carries the generated PHP code (see the final
    // echo of this script), so a usage message printed there could end up
    // inside a redirected output file.
    fwrite(STDERR, "Usage: php $argv[0] <grammar.json>\n");
    exit(1);
}

/**
 * Serializes a value as compact PHP source code using short array syntax.
 *
 * Arrays whose keys are exactly 0..n-1 are written without keys; any other
 * array is written as key=>value pairs. Non-array values are delegated to
 * var_export().
 *
 * @param mixed $var Value to serialize.
 * @return string PHP expression that evaluates to $var.
 */
function export_as_php_var($var) {
    // Scalars and null: var_export() already yields valid PHP source.
    if(!is_array($var)) {
        return var_export($var, true);
    }

    // Keys are omitted only when they form the sequence 0..n-1.
    $sequential_keys = range(0, count($var) - 1);
    $is_list = json_encode(array_keys($var)) === json_encode($sequential_keys);

    $items = [];
    foreach($var as $key => $value) {
        $exported_value = export_as_php_var($value);
        $items[] = $is_list
            ? $exported_value
            : var_export($key, true) . "=>" . $exported_value;
    }

    return "[" . implode(",", $items) . "]";
}

// Load the JSON grammar: a list of rules, each with a "name" and a "bnf" list
// of branches, where every branch is a list of rule/token names.
$grammar_json = file_get_contents($argv[1]);
if($grammar_json === false) {
    fwrite(STDERR, "Failed to read grammar file {$argv[1]}\n");
    exit(1);
}
$grammar = json_decode($grammar_json, true);
if(!is_array($grammar)) {
    fwrite(STDERR, "Grammar file {$argv[1]} does not contain a valid JSON array: " . json_last_error_msg() . "\n");
    exit(1);
}
require_once __DIR__ . '/../parser/MySQLLexer.php';

// Lookup tables
// Rule ids are the rule index shifted by this offset so they do not collide
// with token ids (presumably all token ids are below 2000 — verify against MySQLLexer).
$rules_offset = 2000;
$rules_ids = [];          // rule index => rule name
$rule_id_by_name = [];    // rule name  => rule index + $rules_offset
$rule_index_by_name = []; // rule name  => rule index
foreach ($grammar as $rule) {
    $rules_ids[] = $rule["name"];
    $rule_index_by_name[$rule["name"]] = (count($rules_ids) - 1);
    $rule_id_by_name[$rule["name"]] = $rule_index_by_name[$rule["name"]] + $rules_offset;
}

// Convert rules ids and token ids to integers
$compressed_grammar = [];
foreach($grammar as $rule) {
    $new_branches = [];
    foreach($rule["bnf"] as $branch) {
        $new_branch = [];
        foreach($branch as $name) {
            // Any name that is not a known rule is treated as a lexer token.
            $is_terminal = !isset($rule_id_by_name[$name]);
            if($is_terminal) {
                $new_branch[] = MySQLLexer::getTokenId($name);
            } else {
                // Use rule id to avoid conflicts with token ids
                $new_branch[] = $rule_id_by_name[$name];
            }
        }
        $new_branches[] = $new_branch;
    }
    // Use rule index
    $compressed_grammar[$rule_index_by_name[$rule["name"]]] = $new_branches;
}

$full_grammar = [
    "rules_offset" => $rules_offset,
    "rules_names" => $rules_ids,
    "grammar" => $compressed_grammar
];

// Emit a PHP file that returns the compressed grammar array.
$php_array = export_as_php_var($full_grammar);
echo "<?php\nreturn " . $php_array . ";";
Loading

0 comments on commit ccc341b

Please sign in to comment.