Skip to content

Commit

Permalink
Update the grammar
Browse files Browse the repository at this point in the history
  • Loading branch information
eilvelia committed Oct 25, 2023
1 parent efa0303 commit d238285
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 40 deletions.
2 changes: 1 addition & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@

## 0.1.0 (2022-10-01)

Initial release.
Initial experimental release.
40 changes: 19 additions & 21 deletions GRAMMAR.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,21 @@ name ::= ident | string | raw-string
## Tokens

```bnf
spacechar ::= <see the KDL whitespace table>
newline ::= CRLF | CR | LF | NEL | FF | LS | PS
single-line-comment ::= "//" [^ newline]* (newline | EOF)
multi-line-comment ::= "/*" ([^ "*/"]* | multi-line-comment) "*/"
line-cont ::= "\\" single-line-comment
| "\\" newline
line-cont ::= '\' single-line-comment
| '\' newline
identchar ::= [^ "\\" "/" "(" ")" "{" "}" "<" ">" ";" "[" "]" "=" "," '"' 0x0..0x20 ]
identchar ::= [^ '\' "/" "(" ")" "{" "}" "<" ">" ";" "[" "]" "=" "," '"' 0x0..0x20 newline spacechar]
identstart ::= identchar - "0".."9"
ident ::= "-" identstart identchar*
| "-"
| (identstart - "-") identchar*
ident ::= ( sign identstart identchar* | sign
| "r" (identchar - "#") identchar* | "r"
| (identstart - ["r" sign]) identchar*
) - ["true" "false" "null"]
hex-digit ::= "0".."9" | "a".."f" | "A".."F"
dec-digit ::= "0".."9"
Expand All @@ -46,24 +48,21 @@ bin-int ::= sign? "0b" bin-digit (bin-digit | "_")*
integer ::= dec-int | hex-int | oct-int | bin-int
float ::= dec-float
raw-string ::= "r" <any number of #> '"' <any-char>* '"' <the same number of #>
raw-string ::= "r" <any number of #> '"' <any char>* '"' <the same number of #>
string ::= '"' string-character* '"'
string-character ::= <any-regular-char>
| "\n" | "\r" | "\t" | "\\" | '\"' | "\b" | "\f"
| "\u{" hex-digit{1,6} "}"
escape ::= "n" | "r" | "t" | '\' | '"' | "b" | "f" | "u{" hex-digit{1,6} "}"
| (spacechar | newline)+
string-character ::= '\' escape | [^ '"']
```

`ws` (whitespace) is used as a token delimiter, defined as the union of the unicode
characters described [here][], `single-line-comment`, `multi-line-comment`,
and `line-cont`.
Whitespace (defined as the union of `spacechar`, `single-line-comment`,
`multi-line-comment`, and `line-cont`) is skipped.

[here]: https://github.com/kdl-org/kdl/blob/1.0.0/SPEC.md#whitespace
Note that `EOF` is not allowed after the `\` line continuation without a
single-line comment.

A keyword (`true`, `false`, `null`) cannot be an `ident`.

Note that `EOF` is not allowed after the `\` line continuation without a single-line comment.

Whitespace around `=` and after the `)` type annotation is forbidden by a semantic check.
Whitespace around `=` and after the `)` type annotation is forbidden by
semantic checks.

---

Expand All @@ -75,5 +74,4 @@ whitespace-insensitive. For example, the KDL spec forbids `node"str1""str2"`,
while `ocaml-kdl` allows it. Everything valid by the KDL spec should also be
valid in `ocaml-kdl`, but the opposite is not always true.

The original grammar is made mostly for recursive descent parsers without a
separate lexical analyzer.
The original grammar seems to be made mostly for top-down parsers.
34 changes: 16 additions & 18 deletions src/lexer.ml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ open Printf
let error msg = raise @@ Err.CustomLexingError msg

(* Note: [Compl] doesn't seem to work *)
let space_chars = [%sedlex.regexp?
let space_char = [%sedlex.regexp?
'\t' (* Character Tabulation U+0009 *)
| 0x000B (* Line Tabulation U+000B *)
| ' ' (* Space U+0020 *)
Expand All @@ -27,9 +27,9 @@ let space_chars = [%sedlex.regexp?
| 0xFEFF (* BOM U+FEFF *)
]

let ws = [%sedlex.regexp? Plus space_chars]
let ws = [%sedlex.regexp? Plus space_char]

let newline_chars = [%sedlex.regexp?
let newline_char = [%sedlex.regexp?
'\r' (* CR Carriage Return U+000D *)
| '\n' (* LF Line Feed U+000A *)
| 0x0085 (* NEL Next Line U+0085 *)
Expand All @@ -38,7 +38,7 @@ let newline_chars = [%sedlex.regexp?
| 0x2029 (* PS Paragraph Separator U+2029 *)
]

let newline = [%sedlex.regexp? "\r\n" | newline_chars]
let newline = [%sedlex.regexp? "\r\n" | newline_char]

(* With this defined as [%sedlex.regexp? ascii_hex_digit], the
[(integer | float) identchar+] case surprisingly doesn't work correctly *)
Expand All @@ -60,15 +60,15 @@ let binary = [%sedlex.regexp? Opt sign, "0b", Chars "01", Star (Chars "01_")]
(* An integer literal in any of the supported bases: decimal, hexadecimal,
   octal or binary *)
let integer = [%sedlex.regexp? decimal_int | hex | octal | binary]
(* A float literal; only the decimal form is accepted *)
let float = [%sedlex.regexp? decimal_float]

(* Disallowed:
(* Disallowed identifier characters as per the spec: {|
Any codepoint with hexadecimal value 0x20 or below
Any codepoint with hexadecimal value higher than 0x10FFFF
Any of {| \/(){}<>;[]=," |} *)
let disallowed_chars = [%sedlex.regexp? Chars "\\/(){}<>;[]=,\""
| 0 .. 0x20
| space_chars
| newline_chars]
let identchar = [%sedlex.regexp? Sub (any, disallowed_chars)]
Any of \/(){}<>;[]=," |} *)
let nonident_char = [%sedlex.regexp? Chars "\\/(){}<>;[]=,\""
| 0 .. 0x20
| space_char
| newline_char]
let identchar = [%sedlex.regexp? Sub (any, nonident_char)]
(* Any [identchar] except the ASCII digits '0'..'9' *)
let startident = [%sedlex.regexp? Sub (identchar, '0'..'9')]

let[@inline] new_line lexbuf =
Expand Down Expand Up @@ -118,6 +118,7 @@ let rec raw_string hashlen lexbuf =
Buffer.add_string string_buffer (Sedlexing.Utf8.lexeme lexbuf);
raw_string hashlen lexbuf
| '"', Star '#' ->
(* TODO: Do not consume more hashes than needed? *)
let hashes =
Sedlexing.Utf8.sub_lexeme lexbuf 1 (Sedlexing.lexeme_length lexbuf - 1) in
let hashlen' = String.length hashes in
Expand Down Expand Up @@ -213,9 +214,8 @@ let rec main lexbuf =
set_string_start lexbuf;
string lexbuf
| "r#", Star identchar -> error "An identifier cannot start with r#"
| '-', startident, Star identchar -> IDENT (Sedlexing.Utf8.lexeme lexbuf)
| '-' -> IDENT "-"
| Sub (startident, '-'), Star identchar -> IDENT (Sedlexing.Utf8.lexeme lexbuf)
| sign, Opt (startident, Star identchar)
| Sub (startident, sign), Star identchar -> IDENT (Sedlexing.Utf8.lexeme lexbuf)
| eof -> EOF
| any -> error @@ sprintf "Illegal character '%s'" (Sedlexing.Utf8.lexeme lexbuf)
| _ -> assert false
Expand Down Expand Up @@ -253,10 +253,8 @@ let rec query lexbuf =
| "^=" -> CARET_EQ
| "$=" -> DOLLAR_EQ
| "*=" -> STAR_EQ
| '-', query_startident, Star query_identchar ->
IDENT (Sedlexing.Utf8.lexeme lexbuf)
| '-' -> IDENT "-"
| Sub (query_startident, '-'), Star query_identchar ->
| sign, Opt (query_startident, Star query_identchar)
| Sub (query_startident, sign), Star query_identchar ->
IDENT (Sedlexing.Utf8.lexeme lexbuf)
| any ->
error @@ sprintf "Illegal character '%s'" (Sedlexing.Utf8.lexeme lexbuf)
Expand Down

0 comments on commit d238285

Please sign in to comment.