Update the grammar

eilvelia · Oct 25, 2023 · d238285 · d238285
1 parent efa0303
commit d238285
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 40 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -18,4 +18,4 @@
 
 ## 0.1.0 (2022-10-01)
 
-Initial release.
+Initial experimental release.
diff --git a/GRAMMAR.md b/GRAMMAR.md
@@ -17,19 +17,21 @@ name ::= ident | string | raw-string
 ## Tokens
 
 ```bnf
+spacechar ::= <see the KDL whitespace table>
 newline ::= CRLF | CR | LF | NEL | FF | LS | PS
 
 single-line-comment ::= "//" [^ newline]* (newline | EOF)
 multi-line-comment ::= "/*" ([^ "*/"]* | multi-line-comment) "*/"
 
-line-cont ::= "\\" single-line-comment
-            | "\\" newline
+line-cont ::= '\' single-line-comment
+            | '\' newline
 
-identchar ::= [^ "\\" "/" "(" ")" "{" "}" "<" ">" ";" "[" "]" "=" "," '"' 0x0..0x20 ]
+identchar ::= [^ '\' "/" "(" ")" "{" "}" "<" ">" ";" "[" "]" "=" "," '"' 0x0..0x20 newline spacechar]
 identstart ::= identchar - "0".."9"
-ident ::= "-" identstart identchar*
-        | "-"
-        | (identstart - "-") identchar*
+ident ::= ( sign identstart identchar* | sign
+          | "r" (identchar - "#") identchar* | "r"
+          | (identstart - ["r" sign]) identchar*
+          ) - ["true" "false" "null"]
 
 hex-digit ::= "0".."9" | "a".."f" | "A".."F"
 dec-digit ::= "0".."9"
@@ -46,24 +48,21 @@ bin-int ::= sign? "0b" bin-digit (bin-digit | "_")*
 integer ::= dec-int | hex-int | oct-int | bin-int
 float ::= dec-float
 
-raw-string ::= "r" <any number of #> '"' <any-char>* '"' <the same number of #>
+raw-string ::= "r" <any number of #> '"' <any char>* '"' <the same number of #>
 string ::= '"' string-character* '"'
-string-character ::= <any-regular-char>
-                   | "\n" | "\r" | "\t" | "\\" | '\"' | "\b" | "\f"
-                   | "\u{" hex-digit{1,6} "}"
+escape ::= "n" | "r" | "t" | '\' | '"' | "b" | "f" | "u{" hex-digit{1,6} "}"
+         | (spacechar | newline)+
+string-character ::= '\' escape | [^ '"']
 ```
 
-`ws` (whitespace) is used as a token delimiter, defined as the union of the unicode
-characters described [here][], `single-line-comment`, `multi-line-comment`,
-and `line-cont`.
+Whitespace (defined as the union of `spacechar`, `single-line-comment`,
+`multi-line-comment`, and `line-cont`) is skipped.
 
-[here]: https://github.com/kdl-org/kdl/blob/1.0.0/SPEC.md#whitespace
+Note that `EOF` is not allowed after the `\` line continuation without a
+single-line comment.
 
-A keyword (`true`, `false`, `null`) cannot be an `ident`.
-
-Note that `EOF` is not allowed after the `\` line continuation without a single-line comment.
-
-Whitespace around `=` and after the `)` type annotation is forbiddened by a semantic check.
+Whitespace around `=` and after the `)` type annotation is forbidden by
+semantic checks.
 
 ---
 
@@ -75,5 +74,4 @@ whitespace-insensitive. For example, the KDL spec forbids `node"str1""str2"`,
 while `ocaml-kdl` allows it. Everything valid by the KDL spec should also be
 valid in `ocaml-kdl`, but the opposite is not always true.
 
-The original grammar is made mostly for recursive descent parsers without a
-separate lexical analyzer.
+The original grammar seems to be made mostly for top-down parsers.
diff --git a/src/lexer.ml b/src/lexer.ml
@@ -4,7 +4,7 @@ open Printf
 let error msg = raise @@ Err.CustomLexingError msg
 
 (* Note: [Compl] doesn't seem to work *)
-let space_chars = [%sedlex.regexp?
+let space_char = [%sedlex.regexp?
     '\t'   (* Character Tabulation U+0009 *)
   | 0x000B (* Line Tabulation U+000B *)
   | ' '    (* Space U+0020 *)
@@ -27,9 +27,9 @@ let space_chars = [%sedlex.regexp?
   | 0xFEFF (* BOM U+FEFF *)
 ]
 
-let ws = [%sedlex.regexp? Plus space_chars]
+let ws = [%sedlex.regexp? Plus space_char]
 
-let newline_chars = [%sedlex.regexp?
+let newline_char = [%sedlex.regexp?
     '\r'   (* CR   Carriage Return U+000D *)
   | '\n'   (* LF   Line Feed U+000A *)
   | 0x0085 (* NEL  Next Line U+0085 *)
@@ -38,7 +38,7 @@ let newline_chars = [%sedlex.regexp?
   | 0x2029 (* PS   Paragraph Separator U+2029 *)
 ]
 
-let newline = [%sedlex.regexp? "\r\n" | newline_chars]
+let newline = [%sedlex.regexp? "\r\n" | newline_char]
 
 (* With this defined as [%sedlex.regexp? ascii_hex_digit], the
    [(integer | float) identchar+] case surprisingly doesn't work correctly *)
@@ -60,15 +60,15 @@ let binary = [%sedlex.regexp? Opt sign, "0b", Chars "01", Star (Chars "01_")]
 let integer = [%sedlex.regexp? decimal_int | hex | octal | binary]
 let float = [%sedlex.regexp? decimal_float]
 
-(* Disallowed:
+(* Disallowed identifier characters as per the spec: {|
    Any codepoint with hexadecimal value 0x20 or below
    Any codepoint with hexadecimal value higher than 0x10FFFF
-   Any of {| \/(){}<>;[]=," |} *)
-let disallowed_chars = [%sedlex.regexp? Chars "\\/(){}<>;[]=,\""
-                                      | 0 .. 0x20
-                                      | space_chars
-                                      | newline_chars]
-let identchar = [%sedlex.regexp? Sub (any, disallowed_chars)]
+   Any of \/(){}<>;[]=,"    |} *)
+let nonident_char = [%sedlex.regexp? Chars "\\/(){}<>;[]=,\""
+                                     | 0 .. 0x20
+                                     | space_char
+                                     | newline_char]
+let identchar = [%sedlex.regexp? Sub (any, nonident_char)]
 let startident = [%sedlex.regexp? Sub (identchar, '0'..'9')]
 
 let[@inline] new_line lexbuf =
@@ -118,6 +118,7 @@ let rec raw_string hashlen lexbuf =
     Buffer.add_string string_buffer (Sedlexing.Utf8.lexeme lexbuf);
     raw_string hashlen lexbuf
   | '"', Star '#' ->
+    (* TODO: Do not consume more hashes than needed? *)
     let hashes =
       Sedlexing.Utf8.sub_lexeme lexbuf 1 (Sedlexing.lexeme_length lexbuf - 1) in
     let hashlen' = String.length hashes in
@@ -213,9 +214,8 @@ let rec main lexbuf =
     set_string_start lexbuf;
     string lexbuf
   | "r#", Star identchar -> error "An identifier cannot start with r#"
-  | '-', startident, Star identchar -> IDENT (Sedlexing.Utf8.lexeme lexbuf)
-  | '-' -> IDENT "-"
-  | Sub (startident, '-'), Star identchar -> IDENT (Sedlexing.Utf8.lexeme lexbuf)
+  | sign, Opt (startident, Star identchar)
+  | Sub (startident, sign), Star identchar -> IDENT (Sedlexing.Utf8.lexeme lexbuf)
   | eof -> EOF
   | any -> error @@ sprintf "Illegal character '%s'" (Sedlexing.Utf8.lexeme lexbuf)
   | _ -> assert false
@@ -253,10 +253,8 @@ let rec query lexbuf =
   | "^=" -> CARET_EQ
   | "$=" -> DOLLAR_EQ
   | "*=" -> STAR_EQ
-  | '-', query_startident, Star query_identchar ->
-    IDENT (Sedlexing.Utf8.lexeme lexbuf)
-  | '-' -> IDENT "-"
-  | Sub (query_startident, '-'), Star query_identchar ->
+  | sign, Opt (query_startident, Star query_identchar)
+  | Sub (query_startident, sign), Star query_identchar ->
     IDENT (Sedlexing.Utf8.lexeme lexbuf)
   | any ->
     error @@ sprintf "Illegal character '%s'" (Sedlexing.Utf8.lexeme lexbuf)