Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Exhaustive MySQL Parser #157

Open
wants to merge 28 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ccc341b
MySQL AST Parser
adamziel Aug 17, 2024
78fdf69
Fix parser overriding parts of the parse tree as it constructs them.
adamziel Aug 17, 2024
c8652d5
Output ParseTree using a class, not an array for much simpler processing
adamziel Aug 17, 2024
137d6ca
Manually factor left recursion into right recursion in the grammar fi…
adamziel Aug 18, 2024
0a2440c
Explore support for SQL_CALC_FOUND_ROWS
adamziel Aug 20, 2024
0406d71
Support VALUES() call
adamziel Aug 20, 2024
87573f2
Extract queries from MySQL test suite and test the parser against them
JanJakes Sep 26, 2024
9629702
Implement handling for manually added lexer symbols
JanJakes Sep 26, 2024
d63bc6e
Fix passing nulls to "ctype_" functions
JanJakes Sep 26, 2024
8e7e2e8
Add support for hex format x'ab12', X'ab12', and bin format x'01' and…
JanJakes Sep 26, 2024
ebcc17e
Fix wrong MySQL version conditions (AI hallucinations)
JanJakes Sep 26, 2024
1551b0e
Implement the checkCharset() placeholder function
JanJakes Sep 26, 2024
cdd84b4
Document manual grammar factoring
JanJakes Sep 26, 2024
f50b515
Fix "alterOrderList" that has a wrong definition in the original grammar
JanJakes Sep 26, 2024
e267f67
Fix "createUser" that was incorrectly converted from ANTLR to EBNF
JanJakes Sep 26, 2024
cd543af
Fix "castType" that was incomplete in the original grammar
JanJakes Sep 26, 2024
135f29f
Fix "SELECT ... WHERE ... INTO @var" using a negative lookahead
JanJakes Sep 26, 2024
27524dd
Fix "EXPLAIN FORMAT=..." by reordering grammar rules
JanJakes Sep 27, 2024
069342f
Fix special "WINDOW" and "OVER" cases by adjusting grammar rules
JanJakes Sep 27, 2024
9bfc977
Fix "GRANT" and "REVOKE" by adjusting grammar rules to solve conflicts
JanJakes Sep 27, 2024
ca4de77
Use ebnfutils to dump grammar conflicts
JanJakes Sep 27, 2024
cd3504d
Implement the determineFunction() placeholder function, unify SQL modes
JanJakes Sep 30, 2024
81bbde0
Fix processing NOW() synonyms in lexer
JanJakes Sep 30, 2024
71292fb
Match mysqltest commands case-insensitively
JanJakes Sep 30, 2024
1ab3723
Add a script to test lexer on all the testing queries
JanJakes Oct 1, 2024
42ffc1b
Replace lexer switch/case and function calls with lookup tables
JanJakes Oct 1, 2024
01241b8
Fix unicode handling when extracting test queries
JanJakes Oct 2, 2024
3d9671b
Fix identifier matching, improve lexer performance by ~25%
JanJakes Oct 2, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions custom-parser/grammar-factoring/1-ebnf-to-json.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import { Grammars, Parser } from 'ebnf';
import fs from 'fs';

// Converts an EBNF grammar file into the JSON rule list produced by the
// "ebnf" package and prints it to stdout.
//
// Usage: node 1-ebnf-to-json.js [path/to/grammar.ebnf]

// Placeholder that stands in for "%" while the grammar goes through the
// "ebnf" package; it is swapped back to "%" in the printed JSON.
const FRAGMENT_PLACEHOLDER = 'fragment__F';

// The grammar path is the first CLI argument; falls back to MySQLFull.ebnf.
const grammarPath = process.argv[2] || 'MySQLFull.ebnf';

const preparedGrammar = fs
  .readFileSync(grammarPath, 'utf8')
  // remove comments (the "ebnf" package fails on some)
  .replaceAll(/\/\*[\s\S]*?\*\/$/gm, '')
  .replaceAll('%', FRAGMENT_PLACEHOLDER);

const parsedRules = Grammars.W3C.getRules(preparedGrammar);
const serializedRules = JSON.stringify(parsedRules, null, 2);
console.log(serializedRules.replaceAll(FRAGMENT_PLACEHOLDER, '%'));
85 changes: 85 additions & 0 deletions custom-parser/grammar-factoring/2-cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import json
import sys
import argparse
from ebnfutils import eliminate_left_recursion, encode_as_ebnf, factor_common_prefixes, expand_grammar

class CustomArgumentParser(argparse.ArgumentParser):
    """ArgumentParser that prints the full help text when parsing fails."""

    def error(self, message):
        """Write the help and an error line to stderr, then exit with status 2.

        message: text describing what was wrong with the arguments.
        """
        stderr = sys.stderr
        self.print_help(stderr)
        failure_line = "{}: error: {}\n".format(self.prog, message)
        self.exit(2, failure_line)

# Command-line entry point: loads a JSON-encoded grammar, applies the selected
# transformation(s) from ebnfutils, and prints the result as JSON or EBNF.
parser = CustomArgumentParser(
    description="Processes the parser grammar.",
    # Preserve the explicit line breaks in the "mode" help text; the default
    # formatter would re-wrap them into a single paragraph.
    formatter_class=argparse.RawTextHelpFormatter,
)

# Add the mode positional argument
parser.add_argument(
    'mode',
    type=str,
    choices=['lr', 'expand', 'cp', 'all'],
    help=(
        'Specify the mode. Options are:\n'
        "* 'expand' for expansion of * ? + symbols\n"
        "* 'lr' for left recursion elimination\n"
        "* 'cp' for factoring common prefixes (currently disabled)\n"
        "* 'all' for every enabled step\n"
    )
)

# Add the filename positional argument
parser.add_argument(
    'filename',
    type=str,
    help='Path to the JSON file containing the parsed EBNF grammar.'
)

# Add the format argument (optional flag)
parser.add_argument(
    '--format',
    type=str,
    choices=['json', 'ebnf'],
    default='json',
    required=False,
    help='Specify the output format. Options are: json, ebnf.'
)

# Parse the arguments. No manual validation is needed afterwards: both
# positionals are required and "mode" is restricted by `choices`, so invalid
# input ends up in CustomArgumentParser.error(), which prints the help text
# and exits with status 2.
args = parser.parse_args()

try:
    with open(args.filename) as fp:
        input_grammar = json.load(fp)
except Exception as e:
    print(e, file=sys.stderr)
    print(f"Failed to load grammar from {args.filename}", file=sys.stderr)
    sys.exit(1)

updated_grammar = input_grammar
if args.mode in ("expand", "all"):
    # expand_grammar returns a pair; only its first element (the grammar)
    # is used here.
    updated_grammar, _ = expand_grammar(updated_grammar)

if args.mode in ("lr", "all"):
    updated_grammar = eliminate_left_recursion(updated_grammar)

# NOTE: common prefix factoring is disabled. The intended call was
# factor_common_prefixes(updated_grammar, passes=1) for modes "cp" and "all".
if args.mode == "cp":
    # Without this notice, "cp" would silently print the input unchanged.
    print(
        "Warning: common prefix factoring is currently disabled; "
        "the grammar is printed unchanged.",
        file=sys.stderr,
    )

if args.format == "json":
    print(json.dumps(updated_grammar, indent=2))
else:
    print(encode_as_ebnf(updated_grammar))
86 changes: 86 additions & 0 deletions custom-parser/grammar-factoring/3-phpize-grammar.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
<?php

// The grammar JSON path is a required first argument; print usage and stop
// with a non-zero status when it is missing.
$has_grammar_argument = $argc >= 2;
if (!$has_grammar_argument) {
	echo "Usage: php {$argv[0]} <grammar.json>\n";
	exit(1);
}

/**
 * Serializes a value as compact PHP source code.
 *
 * Arrays are written in short `[...]` notation without whitespace. Keys are
 * emitted only when the array is not a plain 0..n-1 list. Every non-array
 * value is rendered with var_export().
 *
 * @param mixed $var The value to serialize.
 * @return string PHP source code representing the value.
 */
function export_as_php_var($var) {
	if (!is_array($var)) {
		return var_export($var, true);
	}

	// A list has exactly the keys 0..n-1 in order; its keys can be omitted.
	$sequential_keys = range(0, count($var) - 1);
	$is_list = json_encode(array_keys($var)) === json_encode($sequential_keys);

	$items = [];
	foreach ($var as $index => $item) {
		$prefix = $is_list ? "" : var_export($index, true) . "=>";
		$items[] = $prefix . export_as_php_var($item);
	}

	return "[" . implode(",", $items) . "]";
}

// Load the JSON grammar named on the command line. Fail with a clear message
// instead of iterating over null when the file is unreadable or malformed.
$grammar_json = file_get_contents($argv[1]);
if ($grammar_json === false) {
	fwrite(STDERR, "Failed to read grammar file: {$argv[1]}\n");
	exit(1);
}
$grammar = json_decode($grammar_json, true);
if (!is_array($grammar)) {
	fwrite(STDERR, "Failed to parse grammar JSON from {$argv[1]}: " . json_last_error_msg() . "\n");
	exit(1);
}
require_once __DIR__ . '/../parser/MySQLLexer.php';

// Lookup tables
// Rule ids are the rule's index shifted by this offset so they do not
// conflict with token ids (presumably all token ids are below 2000 - confirm
// against MySQLLexer).
$rules_offset = 2000;
$rules_ids = [];
$rule_id_by_name = [];
$rule_index_by_name = [];
foreach ($grammar as $rule) {
	$rule_index = count($rules_ids);
	$rules_ids[] = $rule["name"];
	$rule_index_by_name[$rule["name"]] = $rule_index;
	$rule_id_by_name[$rule["name"]] = $rule_index + $rules_offset;
}

// Convert rules ids and token ids to integers
$compressed_grammar = [];
foreach ($grammar as $rule) {
	$new_branches = [];
	foreach ($rule["bnf"] as $branch) {
		$new_branch = [];
		foreach ($branch as $name) {
			// Any symbol that is not a known rule name is treated as a token.
			$is_terminal = !isset($rule_id_by_name[$name]);
			if ($is_terminal) {
				$new_branch[] = MySQLLexer::getTokenId($name);
			} else {
				// Use rule id to avoid conflicts with token ids
				$new_branch[] = $rule_id_by_name[$name];
			}
		}
		$new_branches[] = $new_branch;
	}
	// Use rule index
	$compressed_grammar[$rule_index_by_name[$rule["name"]]] = $new_branches;
}

// Compress the fragment rules names – they take a lot of disk space and are
// inlined in the final parse tree anyway.
$last_fragment = 1;
foreach ($rules_ids as $id => $name) {
	if (
		// str_starts_with() is safe for an empty name, unlike $name[0].
		str_starts_with($name, '%') ||
		str_ends_with($name, '_zero_or_one') ||
		str_ends_with($name, '_zero_or_more') ||
		str_ends_with($name, '_one_or_more')
	) {
		$rules_ids[$id] = '%f' . $last_fragment;
		++$last_fragment;
	}
}

$full_grammar = [
	"rules_offset" => $rules_offset,
	"rules_names" => $rules_ids,
	"grammar" => $compressed_grammar
];

// Emit a PHP file that returns the compressed grammar array.
$php_array = export_as_php_var($full_grammar);
echo "<?php\nreturn " . $php_array . ";";
Loading
Loading