Skip to content

Commit

Permalink
[spec/interpreter/test] Align definition of newline with Unicode recommendation
Browse files Browse the repository at this point in the history
  • Loading branch information
rossberg committed Sep 25, 2023
1 parent 404c81c commit 090425e
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 20 deletions.
12 changes: 7 additions & 5 deletions document/core/text/lexical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,16 @@ White Space
~~~~~~~~~~~

*White space* is any sequence of literal space characters, formatting characters, or :ref:`comments <text-comment>`.
The allowed formatting characters correspond to a subset of the |ASCII|_ *format effectors*, namely, *horizontal tabulation* (:math:`\unicode{09}`), *line feed* (:math:`\unicode{0A}`), and *carriage return* (:math:`\unicode{0D}`).
The allowed formatting characters correspond to a subset of the |ASCII|_ *format effectors*, namely, *horizontal tabulation* (:math:`\unicode{09}`), *line feed* (:math:`\unicode{0A}`), and *carriage return* (:math:`\unicode{0D}`), extended with the |Unicode|_ *next line* character (:math:`\unicode{85}`).

.. math::
\begin{array}{llclll@{\qquad\qquad}l}
\production{white space} & \Tspace &::=&
(\text{~~} ~|~ \Tformat ~|~ \Tcomment)^\ast \\
\production{format} & \Tformat &::=&
\unicode{09} ~|~ \unicode{0A} ~|~ \unicode{0D} \\
\Tnewline ~|~ \unicode{09} \\
\production{newline} & \Tnewline &::=&
\unicode{0A} ~|~ \unicode{0D} ~|~ \unicode{0D}~\unicode{0A} ~|~ \unicode{85} \\
\end{array}
The only relevance of white space is to separate :ref:`tokens <text-token>`. It is otherwise ignored.
Expand All @@ -107,13 +109,13 @@ Block comments can be nested.
\production{comment} & \Tcomment &::=&
\Tlinecomment ~|~ \Tblockcomment \\
\production{line comment} & \Tlinecomment &::=&
\Tcommentd~~\Tlinechar^\ast~~(\unicode{0A} ~|~ \T{eof}) \\
\Tcommentd~~\Tlinechar^\ast~~(\Tnewline ~|~ \T{eof}) \\
\production{line character} & \Tlinechar &::=&
c{:}\Tchar & (\iff c \neq \unicode{0A}) \\
c{:}\Tchar & (\iff c \neq \unicode{0A} \land c \neq \unicode{0D} \land c \neq \unicode{85}) \\
\production{block comment} & \Tblockcomment &::=&
\Tcommentl~~\Tblockchar^\ast~~\Tcommentr \\
\production{block character} & \Tblockchar &::=&
c{:}\Tchar & (\iff c \neq \text{;} \wedge c \neq \text{(}) \\ &&|&
c{:}\Tchar & (\iff c \neq \text{;} \land c \neq \text{(}) \\ &&|&
\text{;} & (\iff~\mbox{the next character is not}~\text{)}) \\ &&|&
\text{(} & (\iff~\mbox{the next character is not}~\text{;}) \\ &&|&
\Tblockcomment \\
Expand Down
1 change: 1 addition & 0 deletions document/core/util/macros.def
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,7 @@
.. |Tchar| mathdef:: \xref{text/lexical}{text-char}{\T{char}}
.. |Tspace| mathdef:: \xref{text/lexical}{text-space}{\T{space}}
.. |Tformat| mathdef:: \xref{text/lexical}{text-format}{\T{format}}
.. |Tnewline| mathdef:: \xref{text/lexical}{text-newline}{\T{newline}}

.. |Ttoken| mathdef:: \xref{text/lexical}{text-token}{\T{token}}
.. |Tkeyword| mathdef:: \xref{text/lexical}{text-keyword}{\T{keyword}}
Expand Down
2 changes: 1 addition & 1 deletion interpreter/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ zip: $(ZIP)

# Building

.PHONY: $(NAME) $(JSLIB)
.PHONY: $(NAME) $(JSLIB)

$(NAME):
rm -f $@
Expand Down
32 changes: 18 additions & 14 deletions interpreter/text/lexer.mll
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ let string s =
while !i < String.length s - 1 do
let c = if s.[!i] <> '\\' then s.[!i] else
match (incr i; s.[!i]) with
| 'n' -> '\n'
| 'r' -> '\r'
| 't' -> '\t'
| 'n' -> '\x0a'
| 'r' -> '\x0d'
| 't' -> '\x09'
| '\\' -> '\\'
| '\'' -> '\''
| '\"' -> '\"'
Expand Down Expand Up @@ -61,21 +61,25 @@ let letter = ['a'-'z''A'-'Z']
let symbol =
['+''-''*''/''\\''^''~''=''<''>''!''?''@''#''$''%''&''|'':''`''.''\'']

let space = [' ''\t''\n''\r']
let ascii_newline = ['\x0a''\x0d']
(* A newline is a single LF (0A) or CR (0D), the two-byte sequence CR LF
   (0D 0A), or U+0085 NEXT LINE, whose UTF-8 encoding is C2 85.
   The CR LF alternative must be spelled in that byte order so that, under
   ocamllex's longest-match rule, a Windows line ending is consumed as ONE
   newline (one Lexing.new_line call) rather than as CR followed by LF. *)
let newline = ascii_newline | "\x0d\x0a" | "\xc2\x85"
let space = [' ''\x09''\x0a''\x0d']
let control = ['\x00'-'\x1f'] # space
let ascii = ['\x00'-'\x7f']
let ascii_no_nl = ascii # '\x0a'
let ascii_no_nl = ascii # ascii_newline
let utf8cont = ['\x80'-'\xbf']
let utf8enc =
['\xc2'-'\xdf'] utf8cont
let utf8enc_no_nl =
['\xc2'] utf8cont#['\x85']
| ['\xc3'-'\xdf'] utf8cont
| ['\xe0'] ['\xa0'-'\xbf'] utf8cont
| ['\xed'] ['\x80'-'\x9f'] utf8cont
| ['\xe1'-'\xec''\xee'-'\xef'] utf8cont utf8cont
| ['\xf0'] ['\x90'-'\xbf'] utf8cont utf8cont
| ['\xf4'] ['\x80'-'\x8f'] utf8cont utf8cont
| ['\xf1'-'\xf3'] utf8cont utf8cont utf8cont
let utf8enc = utf8enc_no_nl | "\xc2\x85"
let utf8 = ascii | utf8enc
let utf8_no_nl = ascii_no_nl | utf8enc
let utf8_no_nl = ascii_no_nl | utf8enc_no_nl

let escape = ['n''r''t''\\''\'''\"']
let character =
Expand Down Expand Up @@ -127,8 +131,8 @@ rule token = parse
| float as s { FLOAT s }

| string as s { STRING (string s) }
| '"'character*('\n'|eof) { error lexbuf "unclosed string literal" }
| '"'character*['\x00'-'\x09''\x0b'-'\x1f''\x7f']
| '"'character*(newline|eof) { error lexbuf "unclosed string literal" }
| '"'character*(control#ascii_newline)
{ error lexbuf "illegal control character in string literal" }
| '"'character*'\\'_
{ error_nest (Lexing.lexeme_end_p lexbuf) lexbuf "illegal escape" }
Expand Down Expand Up @@ -698,11 +702,11 @@ rule token = parse
| id as s { VAR s }

| ";;"utf8_no_nl*eof { EOF }
| ";;"utf8_no_nl*'\n' { Lexing.new_line lexbuf; token lexbuf }
| ";;"utf8_no_nl*newline { Lexing.new_line lexbuf; token lexbuf }
| ";;"utf8_no_nl* { token lexbuf (* causes error on following position *) }
| "(;" { comment (Lexing.lexeme_start_p lexbuf) lexbuf; token lexbuf }
| space#'\n' { token lexbuf }
| '\n' { Lexing.new_line lexbuf; token lexbuf }
| space#ascii_newline { token lexbuf }
| newline { Lexing.new_line lexbuf; token lexbuf }
| eof { EOF }

| reserved { unknown lexbuf }
Expand All @@ -713,7 +717,7 @@ rule token = parse
and comment start = parse
| ";)" { () }
| "(;" { comment (Lexing.lexeme_start_p lexbuf) lexbuf; comment start lexbuf }
| '\n' { Lexing.new_line lexbuf; comment start lexbuf }
| newline { Lexing.new_line lexbuf; comment start lexbuf }
| utf8_no_nl { comment start lexbuf }
| eof { error_nest start lexbuf "unclosed comment" }
| _ { error lexbuf "malformed UTF-8 encoding" }
Binary file modified test/core/comments.wast
Binary file not shown.

0 comments on commit 090425e

Please sign in to comment.