From 090425e234665887df3c9ac0b0ddebf304098e3a Mon Sep 17 00:00:00 2001 From: Andreas Rossberg Date: Mon, 25 Sep 2023 15:03:18 +0200 Subject: [PATCH] [spec/interpreter/test] Align definition of newline with Unicode recommendation --- document/core/text/lexical.rst | 12 +++++++----- document/core/util/macros.def | 1 + interpreter/Makefile | 2 +- interpreter/text/lexer.mll | 32 ++++++++++++++++++-------------- test/core/comments.wast | Bin 772 -> 1485 bytes 5 files changed, 27 insertions(+), 20 deletions(-) diff --git a/document/core/text/lexical.rst b/document/core/text/lexical.rst index 1dd34c8635..71f0e453e2 100644 --- a/document/core/text/lexical.rst +++ b/document/core/text/lexical.rst @@ -78,14 +78,16 @@ White Space ~~~~~~~~~~~ *White space* is any sequence of literal space characters, formatting characters, or :ref:`comments `. -The allowed formatting characters correspond to a subset of the |ASCII|_ *format effectors*, namely, *horizontal tabulation* (:math:`\unicode{09}`), *line feed* (:math:`\unicode{0A}`), and *carriage return* (:math:`\unicode{0D}`). +The allowed formatting characters correspond to a subset of the |ASCII|_ *format effectors*, namely, *horizontal tabulation* (:math:`\unicode{09}`), *line feed* (:math:`\unicode{0A}`), and *carriage return* (:math:`\unicode{0D}`), extended with the |Unicode|_ *next line* character (:math:`\unicode{85}`). .. math:: \begin{array}{llclll@{\qquad\qquad}l} \production{white space} & \Tspace &::=& (\text{~~} ~|~ \Tformat ~|~ \Tcomment)^\ast \\ \production{format} & \Tformat &::=& - \unicode{09} ~|~ \unicode{0A} ~|~ \unicode{0D} \\ + \Tnewline ~|~ \unicode{09} \\ + \production{newline} & \Tnewline &::=& + \unicode{0A} ~|~ \unicode{0D} ~|~ \unicode{0D}~\unicode{0A} ~|~ \unicode{85} \\ \end{array} The only relevance of white space is to separate :ref:`tokens `. It is otherwise ignored. @@ -107,13 +109,13 @@ Block comments can be nested. 
\production{comment} & \Tcomment &::=& \Tlinecomment ~|~ \Tblockcomment \\ \production{line comment} & \Tlinecomment &::=& - \Tcommentd~~\Tlinechar^\ast~~(\unicode{0A} ~|~ \T{eof}) \\ + \Tcommentd~~\Tlinechar^\ast~~(\Tnewline ~|~ \T{eof}) \\ \production{line character} & \Tlinechar &::=& - c{:}\Tchar & (\iff c \neq \unicode{0A}) \\ + c{:}\Tchar & (\iff c \neq \unicode{0A} \land c \neq \unicode{0D} \land c \neq \unicode{85}) \\ \production{block comment} & \Tblockcomment &::=& \Tcommentl~~\Tblockchar^\ast~~\Tcommentr \\ \production{block character} & \Tblockchar &::=& - c{:}\Tchar & (\iff c \neq \text{;} \wedge c \neq \text{(}) \\ &&|& + c{:}\Tchar & (\iff c \neq \text{;} \land c \neq \text{(}) \\ &&|& \text{;} & (\iff~\mbox{the next character is not}~\text{)}) \\ &&|& \text{(} & (\iff~\mbox{the next character is not}~\text{;}) \\ &&|& \Tblockcomment \\ diff --git a/document/core/util/macros.def b/document/core/util/macros.def index df4af62f83..5e7a4889b2 100644 --- a/document/core/util/macros.def +++ b/document/core/util/macros.def @@ -700,6 +700,7 @@ .. |Tchar| mathdef:: \xref{text/lexical}{text-char}{\T{char}} .. |Tspace| mathdef:: \xref{text/lexical}{text-space}{\T{space}} .. |Tformat| mathdef:: \xref{text/lexical}{text-format}{\T{format}} +.. |Tnewline| mathdef:: \xref{text/lexical}{text-newline}{\T{newline}} .. |Ttoken| mathdef:: \xref{text/lexical}{text-token}{\T{token}} .. 
|Tkeyword| mathdef:: \xref{text/lexical}{text-keyword}{\T{keyword}} diff --git a/interpreter/Makefile b/interpreter/Makefile index 5939f838ec..dfd9064bbf 100644 --- a/interpreter/Makefile +++ b/interpreter/Makefile @@ -33,7 +33,7 @@ zip: $(ZIP) # Building -.PHONY: $(NAME) $(JSLIB) +.PHONY: $(NAME) $(JSLIB) $(NAME): rm -f $@ diff --git a/interpreter/text/lexer.mll b/interpreter/text/lexer.mll index d9a12b5d21..4df23f1aad 100644 --- a/interpreter/text/lexer.mll +++ b/interpreter/text/lexer.mll @@ -27,9 +27,9 @@ let string s = while !i < String.length s - 1 do let c = if s.[!i] <> '\\' then s.[!i] else match (incr i; s.[!i]) with - | 'n' -> '\n' - | 'r' -> '\r' - | 't' -> '\t' + | 'n' -> '\x0a' + | 'r' -> '\x0d' + | 't' -> '\x09' | '\\' -> '\\' | '\'' -> '\'' | '\"' -> '\"' @@ -61,21 +61,25 @@ let letter = ['a'-'z''A'-'Z'] let symbol = ['+''-''*''/''\\''^''~''=''<''>''!''?''@''#''$''%''&''|'':''`''.''\''] -let space = [' ''\t''\n''\r'] +let ascii_newline = ['\x0a''\x0d'] +let newline = ascii_newline | "\x0d\x0a" | "\xc2\x85" +let space = [' ''\x09''\x0a''\x0d'] let control = ['\x00'-'\x1f'] # space let ascii = ['\x00'-'\x7f'] -let ascii_no_nl = ascii # '\x0a' +let ascii_no_nl = ascii # ascii_newline let utf8cont = ['\x80'-'\xbf'] -let utf8enc = - ['\xc2'-'\xdf'] utf8cont +let utf8enc_no_nl = + ['\xc2'] utf8cont#['\x85'] + | ['\xc3'-'\xdf'] utf8cont | ['\xe0'] ['\xa0'-'\xbf'] utf8cont | ['\xed'] ['\x80'-'\x9f'] utf8cont | ['\xe1'-'\xec''\xee'-'\xef'] utf8cont utf8cont | ['\xf0'] ['\x90'-'\xbf'] utf8cont utf8cont | ['\xf4'] ['\x80'-'\x8f'] utf8cont utf8cont | ['\xf1'-'\xf3'] utf8cont utf8cont utf8cont +let utf8enc = utf8enc_no_nl | "\xc2\x85" let utf8 = ascii | utf8enc -let utf8_no_nl = ascii_no_nl | utf8enc +let utf8_no_nl = ascii_no_nl | utf8enc_no_nl let escape = ['n''r''t''\\''\'''\"'] let character = @@ -127,8 +131,8 @@ rule token = parse | float as s { FLOAT s } | string as s { STRING (string s) } - | '"'character*('\n'|eof) { error lexbuf "unclosed string 
literal" } - | '"'character*['\x00'-'\x09''\x0b'-'\x1f''\x7f'] + | '"'character*(newline|eof) { error lexbuf "unclosed string literal" } + | '"'character*(control#ascii_newline) { error lexbuf "illegal control character in string literal" } | '"'character*'\\'_ { error_nest (Lexing.lexeme_end_p lexbuf) lexbuf "illegal escape" } @@ -698,11 +702,11 @@ rule token = parse | id as s { VAR s } | ";;"utf8_no_nl*eof { EOF } - | ";;"utf8_no_nl*'\n' { Lexing.new_line lexbuf; token lexbuf } + | ";;"utf8_no_nl*newline { Lexing.new_line lexbuf; token lexbuf } | ";;"utf8_no_nl* { token lexbuf (* causes error on following position *) } | "(;" { comment (Lexing.lexeme_start_p lexbuf) lexbuf; token lexbuf } - | space#'\n' { token lexbuf } - | '\n' { Lexing.new_line lexbuf; token lexbuf } + | space#ascii_newline { token lexbuf } + | newline { Lexing.new_line lexbuf; token lexbuf } | eof { EOF } | reserved { unknown lexbuf } @@ -713,7 +717,7 @@ rule token = parse and comment start = parse | ";)" { () } | "(;" { comment (Lexing.lexeme_start_p lexbuf) lexbuf; comment start lexbuf } - | '\n' { Lexing.new_line lexbuf; comment start lexbuf } + | newline { Lexing.new_line lexbuf; comment start lexbuf } | utf8_no_nl { comment start lexbuf } | eof { error_nest start lexbuf "unclosed comment" } | _ { error lexbuf "malformed UTF-8 encoding" } diff --git a/test/core/comments.wast b/test/core/comments.wast index c291370fa72141c02e627c6edf0442bd409129cf..c111c0e07f2839b46aa5e6150d2c49697ab4c914 100644 GIT binary patch literal 1485 zcmbtT$&M2-5M_rkh<#s6eR4|#Wf=sLPY7wmT)2S5B@)VXS2Grl4Q@9x5kmX}C&Yy_ z|G;fNXnuimI_-86%^|@Dca`h;y;|Htz*{UGWM(`@?Vy}!H`-@Rh}nrZ!(t3*MKK;Z zW0O}3+hE$7#F2U)zd^lo!5H@^w9lk!XTe)^LoTzsJSGNb9&>NRJi zGOJ_Q9W^+MuChAD_tW}91FF3x<^Gz=`F|@9)>UpQqV}v^_U3xx)z{}wzpYfce|Mv@ hMC;y%Wc3&G5rL-ehu%(@CU!KozXEo!0(Ns5`w#b+pcMcB delta 14 VcmX@h-NLrPfRT}RvmxVUMgSon1PcHF