diff --git a/bootstrap/bin/dune b/bootstrap/bin/hmc/dune similarity index 100% rename from bootstrap/bin/dune rename to bootstrap/bin/hmc/dune diff --git a/bootstrap/bin/hmc.ml b/bootstrap/bin/hmc/hmc.ml similarity index 89% rename from bootstrap/bin/hmc.ml rename to bootstrap/bin/hmc/hmc.ml index 0b38ac213..b8c062f3f 100644 --- a/bootstrap/bin/hmc.ml +++ b/bootstrap/bin/hmc/hmc.ml @@ -36,10 +36,9 @@ let scan_file path = () let _ = - match Array.length Sys.argv with + match Array.length Os.argv with | 0L | 1L -> halt "hmc usage: hmc " | _ -> begin - let path_str = Array.get 1L Sys.argv in - let path = Path.of_string path_str in + let path = Path.of_bytes (Bytes.Slice.init (Array.get 1L Os.argv)) in scan_file path end diff --git a/bootstrap/bin/hocc/conf.ml b/bootstrap/bin/hocc/conf.ml new file mode 100644 index 000000000..79fdcc144 --- /dev/null +++ b/bootstrap/bin/hocc/conf.ml @@ -0,0 +1,187 @@ +open! Basis +include Basis.Rudiments + +type algorithm = + | LR1Compact + | LR1Canonical + +let pp_algorithm algorithm formatter = + formatter |> Fmt.fmt (match algorithm with + | LR1Compact -> "LR1Compact" + | LR1Canonical -> "LR1Canonical" + ) + +type language = + | Hemlock + | OCaml + +let pp_language language formatter = + formatter |> Fmt.fmt (match language with + | Hemlock -> "Hemlock" + | OCaml -> "OCaml" + ) + +type t = { + verbose: bool; + report: bool; + graph: bool; + algorithm: algorithm; + language: language; + srcdir_opt: Path.t option; + module_opt: Path.Segment.t option; + dstdir_opt: Path.t option; +} + +let pp {verbose; report; graph; algorithm; language; srcdir_opt; module_opt; dstdir_opt} formatter = + formatter + |> Fmt.fmt "{verbose=" |> Bool.pp verbose + |> Fmt.fmt "; report=" |> Bool.pp report + |> Fmt.fmt "; graph=" |> Bool.pp graph + |> Fmt.fmt "; algorithm=" |> pp_algorithm algorithm + |> Fmt.fmt "; language=" |> pp_language language + |> Fmt.fmt "; srcdir_opt=" |> (Option.pp Path.pp) srcdir_opt + |> Fmt.fmt "; module_opt=" |> (Option.pp 
Path.Segment.pp) module_opt + |> Fmt.fmt "; dstdir_opt=" |> (Option.pp Path.pp) dstdir_opt + |> Fmt.fmt "}" + +let default = { + verbose=false; + report=false; + graph=false; + algorithm=LR1Compact; + language=Hemlock; + srcdir_opt=None; + module_opt=None; + dstdir_opt=None; +} + +let usage error = + let exit_code, formatter = match error with + | false -> 0, File.Fmt.stdout + | true -> 1, File.Fmt.stderr + in + formatter + |> Fmt.fmt "\ +hocc usage: hocc <options> + +Options: + -h[elp] : Print command usage and exit. + -v[erbose] : Print verbose progress information during parser + generation. + -r[eport] : Write a detailed automaton description to + <dstdir>/<module>.report . + -g[raph] : Write a graph of the precedence relationships in Graphviz + dot format to <dstdir>/<module>.dot . + -c[anonical] : Generate a canonical LR(1) parser rather than a compact + LR(1) parser. + -ocaml : Generate OCaml output rather than Hemlock output. This is + brittle functionality intended only for Hemlock + bootstrapping. + -s[rc] <src> : Path and module name of input source, where inputs match + <src>.hmh[i] and <src> comprises the source directory and + module name, [<srcdir>/]<module> . + -d[stdir] <dstdir> : Path to directory in which to place generated output, such + that output file paths match + <dstdir>/<module>.{hm,hmi,report,dot,ml,mli} . Defaults to + <srcdir> . 
+" + |> ignore; + Stdlib.exit exit_code + +let of_argv argv = + let arg_arg argv i = begin + let i' = succ i in + match i' < Array.length argv with + | false -> begin + let arg = Bytes.to_string_replace (Array.get i argv) in + File.Fmt.stderr |> Fmt.fmt "hocc: " |> Fmt.fmt arg |> Fmt.fmt " argument missing\n" + |> ignore; + usage true + end + | true -> Array.get i' argv + end in + let rec f t argv i = begin + match i < Array.length argv with + | false -> t + | true -> begin + let arg_bytes = Array.get i argv in + let arg_string = Bytes.to_string_replace arg_bytes in + match arg_string with + | "-help" | "-h" -> usage false + | "-verbose" | "-v" -> f {t with verbose=true} argv (succ i) + | "-report" | "-r" -> f {t with report=true} argv (succ i) + | "-graph" | "-g" -> f {t with graph=true} argv (succ i) + | "-canonical" | "-c" -> f {t with algorithm=LR1Canonical} argv (succ i) + | "-ocaml" -> f {t with language=OCaml} argv (succ i) + | "-src" | "-s" -> begin + let path = Path.of_bytes (Bytes.Slice.init (arg_arg argv i)) in + let dirname, basename_opt = Path.split path in + let srcdir_opt = match Path.is_empty dirname with + | true -> None + | false -> Some dirname + in + let module_opt = match basename_opt with + | None -> begin + File.Fmt.stderr + |> Fmt.fmt "hocc: Invalid source: " + |> Path.pp path + |> Fmt.fmt "\n" + |> ignore; + usage true + end + | Some m -> Some m + in + f {t with srcdir_opt; module_opt} argv (i + 2L) + end + | "-dstdir" | "-d" -> begin + let dstdir = Path.of_bytes (Bytes.Slice.init (arg_arg argv i)) in + f {t with dstdir_opt=Some dstdir} argv (i + 2L) + end + | _ -> begin + File.Fmt.stderr + |> Fmt.fmt "hocc: Invalid command line parameter: " + |> String.pp arg_string + |> Fmt.fmt "\n" + |> ignore; + usage true + end + end + end in + let t = f default argv 1L in + (* XXX Verify that module name is cident. 
*) + match t.module_opt with + | None -> begin + File.Fmt.stderr |> Fmt.fmt "hocc: Source not specified\n" |> ignore; + usage true + end + | Some _ -> t + +let verbose {verbose; _} = + verbose + +let report {report; _} = + report + +let graph {graph; _} = + graph + +let algorithm {algorithm; _} = + algorithm + +let language {language; _} = + language + +let srcdir {srcdir_opt; _} = + match srcdir_opt with + | None -> Path.of_string "." + | Some srcdir -> srcdir + +let module_ {module_opt; _} = + match module_opt with + | None -> not_reached () + | Some m -> m + +let dstdir {dstdir_opt; _} = + match dstdir_opt with + | None -> Path.of_string "." + | Some dstdir -> dstdir diff --git a/bootstrap/bin/hocc/conf.mli b/bootstrap/bin/hocc/conf.mli new file mode 100644 index 000000000..45262fc49 --- /dev/null +++ b/bootstrap/bin/hocc/conf.mli @@ -0,0 +1,28 @@ +open Basis + +type algorithm = + | LR1Compact + | LR1Canonical + +val pp_algorithm: algorithm -> (module Fmt.Formatter) -> (module Fmt.Formatter) + +type language = + | Hemlock + | OCaml + +val pp_language : language -> (module Fmt.Formatter) -> (module Fmt.Formatter) + +type t + +include FormattableIntf.SMono with type t := t + +val of_argv: Bytes.t array -> t + +val verbose: t -> bool +val report: t -> bool +val graph: t -> bool +val algorithm: t -> algorithm +val language: t -> language +val srcdir: t -> Path.t +val module_: t -> Path.Segment.t +val dstdir: t -> Path.t diff --git a/bootstrap/bin/hocc/dune b/bootstrap/bin/hocc/dune new file mode 100644 index 000000000..15f70f33e --- /dev/null +++ b/bootstrap/bin/hocc/dune @@ -0,0 +1,7 @@ +(executables + (names hocc) + (libraries Basis Hmc)) + +(install + (section bin) + (files (hocc.exe as hocc))) diff --git a/bootstrap/bin/hocc/hocc.ml b/bootstrap/bin/hocc/hocc.ml new file mode 100644 index 000000000..67cc01941 --- /dev/null +++ b/bootstrap/bin/hocc/hocc.ml @@ -0,0 +1,49 @@ +open Basis +include Basis.Rudiments +open Hmc + +let scan_file path = + let rec fn scanner 
= begin + let scanner', ctok = Scan.next scanner in + let atok = Scan.ConcreteToken.atok ctok in + let source = Scan.ConcreteToken.source ctok in + File.Fmt.stdout + |> Fmt.fmt " " + |> Source.Slice.pp source + |> Fmt.fmt " : " + |> Scan.AbstractToken.pp atok + |> Fmt.fmt "\n" + |> ignore; + match atok with + | Scan.AbstractToken.Tok_end_of_input -> () + | _ -> fn scanner' + end in + let () = match File.of_path path with + | Ok f -> begin + let stream = File.Stream.of_file f in + let text = Text.of_bytes_stream ~path stream in + let scanner = Scan.init text in + fn scanner + end + | Error err -> halt ( + String.Fmt.empty + |> Fmt.fmt "File.of_path error: " + |> Fmt.fmt (File.Error.to_string err) + |> Fmt.fmt "\n" + |> Fmt.to_string + ) + in + () + +let _ = + let conf = Conf.of_argv Os.argv in + File.Fmt.stdout |> Fmt.fmt "XXX hocc: conf=" |> Conf.pp conf |> Fmt.fmt "\n" |> ignore; + let path = Path.join [ + (Conf.srcdir conf); + Path.of_string "/"; + Path.of_segment (Path.Segment.join [ + (Conf.module_ conf); + Option.value_hlt Path.(basename (of_string ".hmh")) + ]); + ] in + scan_file path diff --git a/bootstrap/src/basis/entropy.ml b/bootstrap/src/basis/entropy.ml index c566095a3..6fd59262e 100644 --- a/bootstrap/src/basis/entropy.ml +++ b/bootstrap/src/basis/entropy.ml @@ -11,6 +11,6 @@ let get () = | _ -> halt "Entropy.get error: Entropy acquisition failure" let seed = - match Sys.getenv_opt "HEMLOCK_ENTROPY" with + match Stdlib.Sys.getenv_opt "HEMLOCK_ENTROPY" with | None -> get () | Some hemlock_entropy -> u128_of_string hemlock_entropy diff --git a/bootstrap/src/basis/os.ml b/bootstrap/src/basis/os.ml new file mode 100644 index 000000000..bdbf35b3b --- /dev/null +++ b/bootstrap/src/basis/os.ml @@ -0,0 +1,2 @@ +let argv = Array.map Stdlib.Sys.argv ~f:(fun arg -> + Bytes.of_string_slice (String.C.Slice.of_string arg)) diff --git a/bootstrap/src/basis/os.mli b/bootstrap/src/basis/os.mli new file mode 100644 index 000000000..9f52ce162 --- /dev/null +++ 
b/bootstrap/src/basis/os.mli @@ -0,0 +1,5 @@ +(** Operating system interfaces. *) + +val argv: Bytes.t array +(** [argv] comprises the command line arguments, where the first element is the path to the program + being executed. *) diff --git a/bootstrap/test/basis/seed/test_seed0.ml b/bootstrap/test/basis/seed/test_seed0.ml index f8e1b1e91..ad4899332 100644 --- a/bootstrap/test/basis/seed/test_seed0.ml +++ b/bootstrap/test/basis/seed/test_seed0.ml @@ -3,7 +3,7 @@ open Basis let () = File.Fmt.stdout |> Fmt.fmt "HEMLOCK_ENTROPY=" - |> String.pp (Sys.getenv "HEMLOCK_ENTROPY") + |> String.pp (Stdlib.Sys.getenv "HEMLOCK_ENTROPY") |> Fmt.fmt " -> seed=" |> Hash.State.pp Hash.State.seed |> Fmt.fmt "\n" diff --git a/bootstrap/test/basis/seed/test_seed42.ml b/bootstrap/test/basis/seed/test_seed42.ml index f8e1b1e91..ad4899332 100644 --- a/bootstrap/test/basis/seed/test_seed42.ml +++ b/bootstrap/test/basis/seed/test_seed42.ml @@ -3,7 +3,7 @@ open Basis let () = File.Fmt.stdout |> Fmt.fmt "HEMLOCK_ENTROPY=" - |> String.pp (Sys.getenv "HEMLOCK_ENTROPY") + |> String.pp (Stdlib.Sys.getenv "HEMLOCK_ENTROPY") |> Fmt.fmt " -> seed=" |> Hash.State.pp Hash.State.seed |> Fmt.fmt "\n" diff --git a/doc/design/index.md b/doc/design/index.md index 53cb56aeb..bad36e84a 100644 --- a/doc/design/index.md +++ b/doc/design/index.md @@ -1,4 +1,4 @@ -# Hemlock +# Hemlock Design [Hemlock](https://github.com/BranchTaken/Hemlock) is a systems programming language, but not all systems are alike. 
Hemlock is intentionally constrained to excel for a (large) subset of possible diff --git a/doc/index.md b/doc/index.md new file mode 100644 index 000000000..0e970b794 --- /dev/null +++ b/doc/index.md @@ -0,0 +1,5 @@ +# Hemlock Documentation + +- [Design](design/index.md) +- Tools + + [`hocc`](tools/hocc.md) diff --git a/doc/tools/hocc.md b/doc/tools/hocc.md new file mode 100644 index 000000000..21df5fb09 --- /dev/null +++ b/doc/tools/hocc.md @@ -0,0 +1,722 @@ +# hocc + +`hocc` is an [LR(1) parser generator](https://en.wikipedia.org/wiki/Canonical_LR_parser). Its name +carries on a long tradition, to wit: + +- [`yacc`](https://en.wikipedia.org/wiki/Yacc) stands for "Yet Another Compiler Compiler". Clearly + the name derives from "yack", as in, "Chuck's dinner didn't sit well and he yacked it." +- `hocc` stands for "Hardly Original Compiler Compiler". The name derives from "hock", as in, "Hank + hocked a loogie." + +Both programs interpret high-level human-written parser descriptions and produce output unfit for +human consumption. However `hocc` has several distinguishing features relative to `yacc`, aside from +interoperating with [Hemlock](https://github.com/BranchTaken/Hemlock) rather than +[C](https://en.wikipedia.org/wiki/The_C_Programming_Language). + +- `hocc` generates LR(1) rather than [LALR(1)](https://en.wikipedia.org/wiki/LALR_parser) parsers + using a fast behavior-preserving compaction algorithm[^pager1977] that reduces the state machine + size relative to the canonical LR(1) algorithm[^knuth1965]. +- `hocc`'s precedence facilities are more precise and easier to use without inadvertently + masking grammar ambiguities. Whereas `yacc` supports only a single linear precedence ordering, + `hocc` supports arbitrarily many disjoint directed acyclic precedence graphs. +- `hocc` supports an automated error recovery algorithm[^diekmann2020] based on minimum-cost repair + sequences. 
+ +## Command usage + +`hocc <options>` + +Options: + +- `-h[elp]`: Print command usage and exit. +- `-v[erbose]`: Print verbose progress information during parser generation. +- `-r[eport]`: Write a detailed automaton description to `<dstdir>/<module>.report`. +- `-g[raph]`: Write a graph of the precedence relationships in [Graphviz + dot](https://graphviz.org/doc/info/lang.html) format to `<dstdir>/<module>.dot`. +- `-c[anonical]`: Generate a canonical LR(1) parser rather than a compact LR(1) parser. +- `-ocaml`: Generate OCaml output rather than Hemlock output. This is brittle functionality intended + only for Hemlock bootstrapping. +- `-s[rc] <src>`: Path and module name of input source, where inputs match `<src>.hmh[i]` and + `<src>` comprises the source directory and module name, `[<srcdir>/]<module>`. +- `-d[stdir] <dstdir>`: Path to directory in which to place generated output, such that output file + paths match `<dstdir>/<module>.{hm,hmi,report,dot,ml,mli}`. Defaults to `<srcdir>`. + +Syntax errors in the input file may prevent file generation. Specification errors do not prevent +report and graph file generation, but all specification errors must be resolved for parser +generation to succeed. Some syntax errors in the embedded Hemlock code may pass through `hocc` +unnoticed. + +Example invocations: + +- `hocc -src Parser`: Read `Parser.hmh[i]` and generate `Parser.hm[i]`. +- `hocc -verbose -report -graph -src src/Parser -d obj`: Verbosely read `src/Parser.hmh[i]` and + generate `obj/Parser.{hm,hmi,report,dot}`. 
+ +## Grammar + +The `hocc` language grammar is layered onto Hemlock's grammar via the addition of several keywords: + +- Parser: `hocc` +- Symbols: + + [Tokens](#tokens): `token` + + [Non-terminals](#non-terminals): `nonterm`, `start` + + [Productions](#productions): `epsilon` +- [Precedence](#precedence): `prec`, `left`, `right` + +Additionally there are several reserved identifiers that appear in [generated +code](#generated-code-api): + +- The `EPSILON` token identifier is used by the generated parser as the token associated with the + start state at the base of the parser stack. The token remains on the stack until parsing accepts, + so it will be visible to introspection at any intermediate parse state. +- The `Token`, `Nonterm`, and `Symbol` modules comprise the generated parser. The parser + specification may need to refer to `Token.t` when defining non-terminal types, but the other + modules are only of use in application code which utilizes the generated parser. + +### [BNF](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) + +[uident](#uident) ::= \[uncapitalized identifier] + +[cident](#cident) ::= \[capitalized identifier] + +[ident](#ident) ::= [uident](#uident) | [cident](#cident) + +[prec_type](#prec_type) ::= `prec` | `left` | `right` + +[precs_tl](#precs_tl) ::= +
    | [precs_tl](#precs_tl) `;` [uident](#uident) +
    | ε + +[precs](#precs) ::= +
    | `[` [uident](#uident) [precs_tl](#precs_tl) `]` +
    | [uident](#uident) +
    | ε + +[rel](#rel) ::= `<` | `=` | `>` + +[prec_rel](#prec_rel) ::= +
    | [rel](#rel) [precs](#precs) +
    | ε + +[prec_rels](#prec_rels) ::= +
    | [prec_rels](#prec_rels) [prec_rel](#prec_rel) +
    | ε + +[prec](#prec) ::= +[prec_type](#prec_type) [uident](#uident) [prec_rels](#prec_rels) + +[type_module](#type_module) ::= \[payload module] + +[type_type](#type_type) ::= \[type within payload module] + +[type](#type) ::= [type_module](#type_module) `.` [type_type](#type_type) + +[of_type](#of_type) ::= +
    | `of` [type](#type) +
    | ε + +[prec_ref](#prec_ref) ::= +
    | `prec` [uident](#uident) +
    | ε + +[token](#token) ::= +`token` [cident](#cident) [of_type](#of_type) [prec_ref](#prec_ref) + +[rparam](#rparam) ::= [ident](#ident) `:` \[symbol type] + +[rparams](#rparams) ::= +
    | [rparam](#rparam) [rparams](#rparams) +
    | ε + +[prod_pattern](#prod_pattern) ::= +
    | [rparams](#rparams) [prec_ref](#prec_ref) + +[prod_patterns_tl](#prod_patterns_tl) ::= +
    | [prod_patterns](#prod_patterns) `|` [prod_pattern](#prod_pattern) +
    | ε + +[prod_patterns](#prod_patterns) ::= +
    | `|` [prod_pattern](#prod_pattern) [prod_patterns_tl](#prod_patterns_tl) +
    | [prod_pattern](#prod_pattern) [prod_patterns_tl](#prod_patterns_tl) + +[prod_body](#prod_body) ::= \[reduce action body] + +[prod](#prod) ::= +
    | [prod_patterns](#prod_patterns) `->` [prod_body](#prod_body) +
    | `epsilon` + +[prods_tl](#prods_tl) ::= +
    | [prods](#prods) `|` [prod](#prod) +
    | ε + +[prods](#prods) ::= +
    | `|` [prod](#prod) [prods_tl](#prods_tl) +
    | [prod](#prod) [prods_tl](#prods_tl) + +[nonterm_type](#nonterm_type) ::= `nonterm` | `start` + +[nonterm](#nonterm) ::= [nonterm_type](#nonterm_type) [cident](#cident) +[of_type](#of_type) [prec_ref](#prec_ref) `::=` [prods](#prods) + +[stmt](#stmt) ::= +
    | [prec](#prec) +
    | [token](#token) +
    | [nonterm](#nonterm) +
    | \[hemlock stmt] + +[stmts](#stmts) ::= +
    | [stmts](#stmts) [stmt](#stmt) +
    | ε + +[hocc](#hocc) ::= `hocc` [stmts](#stmts) + +### Tokens + +XXX + +### Non-terminals + +XXX + +#### Productions + +XXX + +### Precedence + +A parser specification may contain conflicts wherein a parser state encodes multiple valid actions +for one or more inputs. `hocc` refuses to generate parsers which contain unresolved conflicts. +Parser specifications can typically be manually restructured to eliminate conflicts, but often at +the expense of clarity. Precedences provide a mechanism for conflict resolution, i.e. explicit +choice of actions. `hocc` attempts to resolve conflicts based on the precedences assigned to tokens +and productions. + +Each production can specify its precedence, or if all of a non-terminal's productions are to have +the same precedence, the precedence can be more succinctly specified for the non-terminal as a +whole. It is an error to explicitly specify both a non-terminal's precedence and the precedence of +any of its productions. + +Precedences may be defined with any of the following associativities: + +- `prec`: Do not resolve conflicts via associativity. Non-associativity is useful for specifying + precedence-based resolutions without inadvertently masking conflicts. +- `left`: Resolve shift/reduce conflicts by reducing. This induces left associativity, e.g. + `2 + 3 + 4` is parsed as `(2 + 3) + 4`. +- `right`: Resolve shift/reduce conflicts by shifting. This induces right associativity, e.g. + `2 + 3 + 4` is parsed as `2 + (3 + 4)`. All else being equal, prefer left associativity to + minimize intermediate parser state. + +Precedences can be defined via the `prec`, `left`, and `right` statements, and they may optionally +be ordered via `<`, `=`, and/or `>` relationships, irrespective of associativity. The ordering in +e.g. `prec a < b` is equivalent to e.g. `left b > a`, but precedences must be defined before use, so +the choice of appropriate relationship operator depends on declaration order. 
`prec a = b` declares +precedence ordering equivalence. These precedence relationships are used to compute the transitive +closure of precedence orderings. Precedences with disjoint relationships are uncomparable, i.e. they +have no relative ordering. By default, all tokens and productions have a *lack* of precedence, which +is equivalent to each such token/production being assigned a unique disjoint `prec` precedence. + +Conflicts may occur between two or more actions, of which at least one is a reduce action. Such an +action set induces shift/reduce and/or reduce/reduce conflicts; by construction shift actions never +conflict with each other. Given conflicting actions A and B, A "dominates" B if A is preferred over +B. For conflict resolution to succeed, one action must dominate all other conflicting actions. The +rules for conflict resolution are as follows. If none of the rules apply, conflict resolution fails. + +- If one action has higher precedence than all other actions, that action dominates. +- If a subset of actions has higher precedence than all other actions, and the actions in the + highest-precedence subset have equal associativity, associativity resolves the conflict under any + of the following circumstances: + + `left`: A single reduce action dominates shift actions. + + `right`: Shift actions dominate one or more reduce actions. + +Associativity suffices for resolving simple conflicts as in e.g. `2 + 3 + 4`, so that it is +deterministically parsed as `(2 + 3) + 4` (as in the following example specification) or +`2 + (3 + 4)`. + +```hocc +hocc + left add + token PLUS prec add + token INT of Int.t + nonterm Expr of Int.t ::= + | x:INT -> x + | e0:Expr _:PLUS e1:Expr prec add -> Int.(e0 + e1) +``` + +Precedence is needed to resolve conflicts between productions, e.g. in `2 + 3 * 4`. In the following +parser specification, `MUL` has precedence over `PLUS` due to the precedence relationship +`mul > add`, so `2 + 3 * 4` is parsed as `2 + (3 * 4)`. 
+ +```hocc +hocc + left add + token PLUS prec add + left mul > add + token MUL prec mul + token INT of Int.t + nonterm Expr of Int.t ::= + | x:INT -> x + | e0:Expr _:PLUS e1:Expr prec add -> Int.(e0 + e1) + | e0:Expr _:MUL e1:Expr prec mul -> Int.(e0 * e1) +``` + +Precedence relationships are optional in precedence declarations. Examples follow. + +```hocc +hocc + left a + left b < a + left d < a + left e < [b; d] + left c < a = b > e # {`<`,`>`} and `=` are mutually redundant here. + # a + # ^^^ + # / | \ + # / | \ + # b===c d + # ^ ^ ^ + # \ | / + # \ | / + # \|/ + # e + + # Precedence and associativity are separate concerns. + right f + right g < [d; f] +``` + +Precedences are bound to tokens, non-terminals, and productions using the optional `prec` reference +clause. Omitting the `prec` reference clause is equivalent to referring to a unique disjoint `prec` +precedence. The following example demonstrates the `prec` clause syntax. + +```hocc +hocc + prec p1 + left p2 + + token FOO prec p1 + + nonterm Bar prec p2 + | FOO + | epsilon + + start Biz + | Bar FOO prec p1 +``` + +## Diagnostics + +XXX + +## Generated code API + +XXX + +## Example + +The following example implements a simple mathematical expression calculator. + +`Example.hmhi`: + +```hocc +open Basis + +# Export the parser API so that alternatives to `calculate` can be implemented. `hocc` expands to a +# module signature. +include hocc + +calculate: string -> zint + [@@doc "Calculate the result of a simple arithmetic expression comprising non-negative integers + and `+`, `-`, `*`, and `/` operators. Tokens must be separated by one or more spaces."] +``` + +`Example.hmh`: + +```hocc +open Basis + +# Specify the parser. `hocc ...` expands to a module implementation, `{ ... }`. 
+include hocc + left add + token PLUS prec add + token MINUS prec add + nonterm AddOp of Token.t ::= + | _:PLUS -> PLUS + | _:MINUS -> MINUS + + left mul > add + token STAR prec mul + token SLASH prec mul + nonterm MulOp of Token.t ::= + | _:STAR -> STAR + | _:SLASH -> SLASH + + token INT of Zint.t + nonterm Expr of Zint.t ::= + | x:INT -> x + | e0:Expr op:AddOp e1:Expr prec add -> + match op with + | AddOp PLUS -> Zint.(e0 + e1) + | AddOp MINUS -> Zint.(e0 - e1) + | e0:Expr op:MulOp e1:Expr -> + match op with + | MulOp STAR -> Zint.(e0 * e1) + | MulOp SLASH -> Zint.(e0 / e1) + + token EOI + start Answer of Zint.t ::= + | e:Expr _:EOI -> e + +# Tokenize `s`, e.g. "2 + 3 * 4", and append an `EOI` token. +tokenize s = + s |> String.split_rev ~f:(fn cp -> Codepoint.O.(cp = ' ')) + |> List.rev_filter ~f:(fn s -> String.length s <> 0) + |> List.rev_map ~f:fn s -> + let open Token + match s with + | "+" -> PLUS + | "-" -> MINUS + | "*" -> STAR + | "/" -> SLASH + | _ -> INT (Zint.of_string s) + |> List.push Token.EOI + |> List.rev + +# Calculate the result of the arithmetic expression expressed in `s`, e.g. "2 + 3 * 4". +calculate s = + List.fold_until (tokenize s) ~init:PARSER.boi ~f:fn parser tok -> + let parser' = PARSER.Answer.next tok parser + let done = match PARSER.status parser' with + | PARSER.Prefix -> false + | PARSER.Accept _ + | PARSER.Error _ -> true + parser', done + |> function + | PARSER.Accept answer -> answer + | PARSER.Prefix _ -> halt "Partial input" + | PARSER.Error _ -> halt "Parse error" +``` + +To generate Hemlock code from the above inputs, run `hocc -s Example`. + +`Example.hmi`: + +```hemlock +open Basis + +# Export the parser API so that alternatives to `calculate` can be implemented. `hocc` expands to a +# module signature. 
+include { + Token = { + type t: t = + | PLUS of unit + | MINUS of unit + | STAR of unit + | SLASH of unit + | INT of Zint.t + | EOI of unit + | EPSILON of unit + + pp >e: t -> Fmt.Formatter e >e-> Fmt.Formatter e + } + + Nonterm = { + type t: t = + | AddOp of Token.t + | MulOp of Token.t + | Expr of Zint.t + | Answer of Zint.t + + pp >e: t -> Fmt.Formatter e >e-> Fmt.Formatter e + } + + Symbol = { + type t: t = + | Token of Token.t + | Nonterm of Nonterm.t + + pp >e: t -> Fmt.Formatter e >e-> Fmt.Formatter e + } + + type t: t + type status: status = + | Prefix + | Accept of Symbol.t + | Error of XXX + + status: t -> status + stack: t -> list Symbol.t + + Answer = { + boi: t + next: ?trace:bool -> Token.t -> t -> t + } + } + +calculate: string -> zint + [@@doc "Calculate the result of a simple arithmetic expression comprising non-negative integers + and `+`, `-`, `*`, and `/` operators. Tokens must be separated by one or more spaces."] +``` + +`Example.hm`: + +```hemlock +open Basis + +# Specify the parser. `hocc ...` expands to a module implementation, `{ ... }`. 
+include { + Token = { + type t: t = + | PLUS of unit + | MINUS of unit + | STAR of unit + | SLASH of unit + | INT of Zint.t + | EOI of unit + | EPSILON of unit + + type_id t = + Type.uns_of_tag t + + hash_fold t state = + state + |> Uns.hash_fold (type_id t) + |> + match t with + | INT v -> |> Zint.hash_fold v + | _ -> Unit.hash_fold () + + cmp t0 t1 = + let open Cmp + match Uns.cmp (type_id t0) (type_id t1) with + | Lt -> Lt + | Eq -> + match t0, t1 with + | INT v0, INT v1 -> Unit.cmp v0 v1 + | _ -> Eq + | Gt -> Gt + + pp t formatter = + formatter |> match t with + | PLUS _ -> "PLUS" + | MINUS _ -> "MINUS" + | STAR _ -> "STAR" + | SLASH _ -> "SLASH" + | INT v -> "INT (%f(^Zint.pp^)(^v^))" + | EOI _ -> "EOI" + | EPSILON _ -> "EPSILON" + } + + Nonterm = { + type t: t = + | AddOp of Token.t + | MulOp of Token.t + | Expr of Zint.t + | Answer of Zint.t + + type_id t = + Type.uns_of_tag t + + hash_fold t state = + state + |> Uns.hash_fold (type_id t) + |> + match t with + | AddOp v -> |> Token.hash_fold v + | MulOp v -> |> Token.hash_fold v + | Expr v -> |> Zint.hash_fold v + | Answer v -> |> Zint.hash_fold v + + cmp t0 t1 = + let open Cmp + match Uns.cmp (type_id t0) (type_id t1) with + | Lt -> Lt + | Eq -> + match t0, t1 with + | AddOp v0, AddOp v1 -> Token.cmp v0 v1 + | MulOp v0, MulOp v1 -> Token.cmp v0 v1 + | Expr v0, Expr v1 -> Zint.cmp v0 v1 + | Answer v0, Answer v1 -> Zint.cmp v0 v1 + | _ -> not_reached () + | Gt -> Gt + + pp t formatter = + formatter |> match t with + | AddOp v -> "AddOp (%f(^Token.pp^)(^v^))" + | MulOp v -> "MulOp (%f(^Token.pp^)(^v^))" + | Expr v -> "Expr (%f(^Zint.pp^)(^v^))" + | Answer v -> "Answer (%f(^Zint.pp^)(^v^))" + } + + Symbol = { + type t: t = + | Token of Token.t + | Nonterm of Nonterm.t + + pp >e: t -> Fmt.Formatter e >e-> Fmt.Formatter e + pp t formatter = + formatter |> match t with + | Token token -> "Token (%f(^Token.pp^)(^token^))" + | Nonterm nonterm -> "Nonterm (%f(^Nonterm.pp^)(^nonterm^))" + } + + Prec = { + type assoc 
= + | Prec + | Left + | Right + type t: t = { + name: string option + assoc: assoc + uid: uns + } + + hash_fold {uid; _} state = + Uns.hash_fold uid + + cmp t0 t1 = + Uns.cmp t0.uid t1.uid + + pp_assoc assoc formatter = + formatter |> Fmt.fmt (match assoc with + | Prec -> "Prec" + | Left -> "Left" + | Right -> "Right" + + pp {name; assoc; uid} formatter = + formatter |> Fmt.fmt + "{%f(^Option.pp String.pp^)=(^name + ^); %f(^pp_assoc^)=(^assoc + ^); %u=(^uid^)}" + } + + # XXX Input is any of <=>, but final form is map of key -> {equiv,doms}. + PrecRel = { + type t: t = { + prec: Prec.t + equiv: Ordset.t Prec.t Prec + doms: Ordset.t Prec.t Prec + } + + hash_fold {prec; _} state = + Prec.hash_fold prec + + cmp t0 t1 = + Prec.cmp t0.prec t1.prec + + pp {prec; equiv; doms} formatter = + formatter |> Fmt.fmt + "{%f(^Prec.pp^)=(^prec + ^); %f(^Ordset.pp^)=(^equiv + ^); %f(^Ordset.pp^)=(^doms^)}" + + init XXX + } + + Precs = { + type t: t = { + rels: Ordset.t PrecRel.t PrecRel + } + + empty = {rels=Ordset.empty PrecRel} + + init ?name assoc = + let rel = {name; assoc; uid=Ordset.length rels} + let rels' = Ordset.insert rel rels + rel, {rels=rels'} + + default t = + init Prec t + + close t = + # XXX Compute transitive closure of relationships. + not_implemented "XXX" + } + + SymbolSpec = { + type t: t = { + sym: Symbol.t + prec: Prec.t + first: Ordset.t Symbol.t Symbol + follow: Ordset.t Symbol.t Symbol + } + } + +Symbol.t + State = { +XXX + } + + itemsets = [| (* XXX *) |] + action = [| (* XXX *) |] + goto = [| (* XXX *) |] + + type stack_elm: stack_elm = { + sym: Symbol.t + state: uns + } + type t: t = { + stack: list stack_elm + } + type status: status = + | Prefix + | Accept of Symbol.t + | Error of XXX + + status: t -> status + stack: t -> list stack_elm + + Answer = { + boi = {sym=EPSILON; state=0} + next: ?trace:bool -> Token.t -> t -> t + } + + } + +# Tokenize `s`, e.g. "2 + 3 * 4", and append an `EOI` token. 
+tokenize s = + s |> String.split_rev ~f:(fn cp -> Codepoint.O.(cp = ' ')) + |> List.rev_filter ~f:(fn s -> String.length s <> 0) + |> List.rev_map ~f:fn s -> + let open Token + match s with + | "+" -> PLUS + | "-" -> MINUS + | "*" -> STAR + | "/" -> SLASH + | _ -> INT (Zint.of_string s) + |> List.push Token.EOI + |> List.rev + +# Calculate the result of the arithmetic expression expressed in `s`, e.g. "2 + 3 * 4". +calculate s = + List.fold_until (tokenize s) ~init:PARSER.boi ~f:fn parser tok -> + let parser' = PARSER.Answer.next tok parser + let done = match PARSER.status parser' with + | PARSER.Prefix -> false + | PARSER.Accept _ + | PARSER.Error _ -> true + parser', done + |> function + | PARSER.Accept answer -> answer + | PARSER.Prefix _ -> halt "Partial input" + | PARSER.Error _ -> halt "Parse error" +``` + +XXX integration guidance + +## Citations + +[^knuth1965]: + Donald Knuth, + “On the translation of languages from left to right,” + Information and Control 8 (6), 607–639, July 1965. + +[^pager1977]: + David Pager, + “A Practical General Method for Constructing LR(k) Parsers,” + Acta Informatica 7, 249-268, 1977. + +[^diekmann2020]: + Lukas Diekmann and Laurence Tratt, + “Don't Panic! Better, Fewer, Syntax Errors for LR Parsers,” + 34th European Conference on Object-Oriented Programming (ECOOP 2020), Article No. 6, pages 6:1–6:32. 
diff --git a/ide/kakoune/hocc.kak b/ide/kakoune/hocc.kak new file mode 100644 index 000000000..8faab7e58 --- /dev/null +++ b/ide/kakoune/hocc.kak @@ -0,0 +1,122 @@ +# https://github.com/BranchTaken/Hemlock +# ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ + +# Detection +# ‾‾‾‾‾‾‾‾‾ + +hook global BufCreate .*\.hmh %{ + set-option buffer filetype hocc +} + +# Initialization +# ‾‾‾‾‾‾‾‾‾‾‾‾‾‾ + +hook global WinSetOption filetype=hocc %{ + require-module hocc + set-option window static_words %opt{hocc_static_words} +} + +hook -group hocc-highlight global WinSetOption filetype=hocc %{ + add-highlighter window/hocc ref hocc + hook -once -always window WinSetOption filetype=.* %{ remove-highlighter window/hocc } +} + +provide-module hocc %{ + +# Highlighters +# ‾‾‾‾‾‾‾‾‾‾‾‾ + + +add-highlighter shared/hocc regions +add-highlighter shared/hocc/code default-region group + +add-highlighter shared/hocc/code/cident regex \b[_]*[A-Z][A-Za-z0-9_']*\b 0:module +add-highlighter shared/hocc/code/uident regex \b[_]*[a-z][A-Za-z0-9_']*\b 0:Default +add-highlighter shared/hocc/code/tab regex \t 0:Error +add-highlighter shared/hocc/code/unaligned regex ^(\ \ )*\ (?![\ ]) 0:Error +add-highlighter shared/hocc/code/unaligned_continue_keyword regex ^(\ \ \ \ )*(and|also|as|else|external|of|or|then|when|with)\b 0:Error +add-highlighter shared/hocc/code/unaligned_continue_punctuation regex ^(\ \ \ \ )*([\x7D\]),!'\\\-+*/%@$<=>\|:.]) 0:Error +add-highlighter shared/hocc/code/unaligned_continue_caret regex ^(\ \ \ \ )*([\^](?![&A-Za-z_])) 0:Error +add-highlighter shared/hocc/code/trailing regex (\ )+$ 0:ExcessWhitespace +add-highlighter shared/hocc/code/interior_multispace regex (?<=\S)(\ ){2,}(?=\S) 0:ExcessWhitespace + +add-highlighter shared/hocc/comment region -recurse \Q(* \Q(* \Q*) fill comment +add-highlighter shared/hocc/line_comment region '#' '\n' fill comment + +add-highlighter shared/hocc/string region ((?]?(\+|_)?#?0?\*\(\^ () fill meta +add-highlighter shared/hocc/string/precision 
region \%('.')?[<^>]?(\+|_)?#?0?([1-9][0-9]*)?\.=?\*\(\^ () fill meta +add-highlighter shared/hocc/string/fmt region \%('.')?[<^>]?(\+|_)?#?0?([1-9][0-9]*)?(\.=?[1-9][0-9]*)?[bodx]?[mac]?p?f\(\^ () fill meta +add-highlighter shared/hocc/string/value region \%('.')?[<^>]?(\+|_)?#?0?([1-9][0-9]*)?(\.=?[1-9][0-9]*)?[bodx]?[mac]?p?([bnzcs]|([ui](8|16|32|64|128|256|512)?)|(r(32|64)?))([\ ]*[-+*/%@^$<=>|:.][-+*/%@$<=>|:.~?]*[\ ]*)?\(\^ () fill meta + +add-highlighter shared/hocc/string/width_precision region \^\)\.=?\*\(\^ () fill meta +add-highlighter shared/hocc/string/width_fmt region \^\)(\.=?[1-9][0-9]*)?[bodx]?[mac]?p?f\(\^ () fill meta +add-highlighter shared/hocc/string/width_value region \^\)(\.=?[1-9][0-9]*)?[bodx]?[mac]?p?([bnzcs]|([ui](8|16|32|64|128|256|512)?)|(r(32|64)?))([\ ]*[-+*/%@^$<=>|:.][-+*/%@$<=>|:.~?]*[\ ]*)?\(\^ () fill meta +add-highlighter shared/hocc/string/precision_fmt region \^\)[bodx]?[mac]?p?f\(\^ () fill meta +add-highlighter shared/hocc/string/precision_value region \^\)[bodx]?[mac]?p?([bnzcs]|([ui](8|16|32|64|128|256|512)?)|(r(32|64)?))([\ ]*[-+*/%@^$<=>|:.][-+*/%@$<=>|:.~?]*[\ ]*)?\(\^ () fill meta +add-highlighter shared/hocc/string/fmt_value region \^\)([\ ]*[-+*/%@^$<=>|:.][-+*/%@$<=>|:.~?]*[\ ]*)?\(\^ () fill meta + +add-highlighter shared/hocc/string/unprotected region (?|:.~?]*} 0:operator +add-highlighter shared/hocc/code/infix_operator regex %{[-+*/%@^$<=>|:.][-+*/%@^$<=>|:.~?]*} 0:operator + +add-highlighter shared/hocc/code/boolean regex \b(true|false)\b 0:value + +add-highlighter shared/hocc/code/bin_integer regex \b(0b)([_]*[01][01_]*)(([ui](8|16|32|64|128|256|512)?)|[zn])?\b 1:attribute 2:value 3:attribute +add-highlighter shared/hocc/code/oct_integer regex \b(0o)([_]*[0-7][0-7_]*)(([ui](8|16|32|64|128|256|512)?)|[zn])?\b 1:attribute 2:value 3:attribute +add-highlighter shared/hocc/code/hex_integer regex \b(0x)([_]*[0-9a-f][0-9a-f_]*)(([ui](8|16|32|64|128|256|512)?)|[zn])?\b 1:attribute 2:value 3:attribute +add-highlighter 
shared/hocc/code/integer regex \b(([1-9][0-9_]*)|0[_]*)(([ui](8|16|32|64|128|256|512)?)|[zn])?\b 1:value 3:attribute + +add-highlighter shared/hocc/code/bin_real_dot regex \b(0b)([01][01_]*\.(?!\.)[01_]*(p_*[+\-]?_*[0-9][0-9_]*)?)(r(32|64)?)? 1:attribute 2:value 3:attribute +add-highlighter shared/hocc/code/bin_real_p regex \b(0b)([01][01_]*p_*[+\-]?_*[0-9][0-9_]*)(r(32|64)?)?\b 1:attribute 2:value 3:attribute +add-highlighter shared/hocc/code/bin_real_r regex \b(0b)([01][01_]*)(r(32|64)?)\b 1:attribute 2:value 3:attribute + +add-highlighter shared/hocc/code/oct_real_dot regex \b(0o)([0-7][0-7_]*\.(?!\.)[0-7_]*(p_*[+\-]?_*[0-9][0-9_]*)?)(r(32|64)?)? 1:attribute 2:value 3:attribute +add-highlighter shared/hocc/code/oct_real_p regex \b(0o)([0-7][0-7_]*p_*[+\-]?_*[0-9][0-9_]*)(r(32|64)?)?\b 1:attribute 2:value 3:attribute +add-highlighter shared/hocc/code/oct_real_r regex \b(0o)([0-7][0-7_]*)(r(32|64)?)\b 1:attribute 2:value 3:attribute + +add-highlighter shared/hocc/code/hex_real_dot regex \b(0x)([0-9a-f][0-9a-f_]*\.(?!\.)[0-9a-f_]*(p_*[+\-]?_*[0-9][0-9_]*)?)(r(32|64)?)? 1:attribute 2:value 3:attribute +add-highlighter shared/hocc/code/hex_real_p regex \b(0x)([0-9a-f][0-9a-f_]*p_*[+\-]?_*[0-9][0-9_]*)(r(32|64)?)?\b 1:attribute 2:value 3:attribute +add-highlighter shared/hocc/code/hex_real_r regex \b(0x)([0-9a-f][0-9a-f_]*)(r(32|64)?)\b 1:attribute 2:value 3:attribute + +add-highlighter shared/hocc/code/real_dot regex \b([0-9][0-9_]*\.(?!\.)[0-9_]*(e_*[+\-]?_*[0-9][0-9_]*)?)(r(32|64)?)? 
1:value 2:attribute +add-highlighter shared/hocc/code/real_e regex \b([0-9][0-9_]*e_*[+\-]?_*[0-9][0-9_]*)(r(32|64)?)?\b 1:value 2:attribute +add-highlighter shared/hocc/code/real_r regex \b([0-9][0-9_]*)(r(32|64)?)\b 1:value 2:attribute + +# Macro +# ‾‾‾‾‾ + +evaluate-commands %sh{ + keywords="and|also|as|conceal|effect|else|expose|external|fn|function|if|import|include|lazy|let" + keywords="${keywords}|match|mutability|of|open|or|rec|then|type|when|with" + keywords="${keywords}|hocc|token|nonterm|start|epsilon|prec|left|right" + + printf %s\\n "declare-option str-list hocc_static_words ${keywords}" | tr '|' ' ' + + printf %s " + add-highlighter shared/hocc/code/ regex \b(${keywords})\b 0:keyword + " +} + +}