From 090425e234665887df3c9ac0b0ddebf304098e3a Mon Sep 17 00:00:00 2001
From: Andreas Rossberg <rossberg@mpi-sws.org>
Date: Mon, 25 Sep 2023 15:03:18 +0200
Subject: [PATCH] [spec/interpreter/test] Align definition of newline with
 Unicode recommendation

---
 document/core/text/lexical.rst |  12 +++++++-----
 document/core/util/macros.def  |   1 +
 interpreter/Makefile           |   2 +-
 interpreter/text/lexer.mll     |  32 ++++++++++++++++++--------------
 test/core/comments.wast        | Bin 772 -> 1485 bytes
 5 files changed, 27 insertions(+), 20 deletions(-)
diff --git a/document/core/text/lexical.rst b/document/core/text/lexical.rst
index 1dd34c8635..71f0e453e2 100644
--- a/document/core/text/lexical.rst
+++ b/document/core/text/lexical.rst
@@ -78,14 +78,16 @@ White Space
 ~~~~~~~~~~~
 
 *White space* is any sequence of literal space characters, formatting characters, or :ref:`comments <text-comment>`.
-The allowed formatting characters correspond to a subset of the |ASCII|_ *format effectors*, namely, *horizontal tabulation* (:math:`\unicode{09}`), *line feed* (:math:`\unicode{0A}`), and *carriage return* (:math:`\unicode{0D}`).
+The allowed formatting characters correspond to a subset of the |ASCII|_ *format effectors*, namely, *horizontal tabulation* (:math:`\unicode{09}`), *line feed* (:math:`\unicode{0A}`), and *carriage return* (:math:`\unicode{0D}`), extended with the |Unicode|_ *next line* character (:math:`\unicode{85}`).
 
 .. math::
    \begin{array}{llclll@{\qquad\qquad}l}
    \production{white space} & \Tspace &::=&
      (\text{~~} ~|~ \Tformat ~|~ \Tcomment)^\ast \\
    \production{format} & \Tformat &::=&
-     \unicode{09} ~|~ \unicode{0A} ~|~ \unicode{0D} \\
+     \Tnewline ~|~ \unicode{09} \\
+   \production{newline} & \Tnewline &::=&
+     \unicode{0A} ~|~ \unicode{0D} ~|~ \unicode{0D}~\unicode{0A} ~|~ \unicode{85} \\
    \end{array}
 
 The only relevance of white space is to separate :ref:`tokens <text-token>`. It is otherwise ignored.
@@ -107,13 +109,13 @@ Block comments can be nested.
    \production{comment} & \Tcomment &::=&
      \Tlinecomment ~|~ \Tblockcomment \\
    \production{line comment} & \Tlinecomment &::=&
-     \Tcommentd~~\Tlinechar^\ast~~(\unicode{0A} ~|~ \T{eof}) \\
+     \Tcommentd~~\Tlinechar^\ast~~(\Tnewline ~|~ \T{eof}) \\
    \production{line character} & \Tlinechar &::=&
-     c{:}\Tchar & (\iff c \neq \unicode{0A}) \\
+     c{:}\Tchar & (\iff c \neq \unicode{0A} \land c \neq \unicode{0D} \land c \neq \unicode{85}) \\
    \production{block comment} & \Tblockcomment &::=&
      \Tcommentl~~\Tblockchar^\ast~~\Tcommentr \\
    \production{block character} & \Tblockchar &::=&
-     c{:}\Tchar & (\iff c \neq \text{;} \wedge c \neq \text{(}) \\ &&|&
+     c{:}\Tchar & (\iff c \neq \text{;} \land c \neq \text{(}) \\ &&|&
      \text{;} & (\iff~\mbox{the next character is not}~\text{)}) \\ &&|&
      \text{(} & (\iff~\mbox{the next character is not}~\text{;}) \\ &&|&
      \Tblockcomment \\
diff --git a/document/core/util/macros.def b/document/core/util/macros.def
index df4af62f83..5e7a4889b2 100644
--- a/document/core/util/macros.def
+++ b/document/core/util/macros.def
@@ -700,6 +700,7 @@
 .. |Tchar| mathdef:: \xref{text/lexical}{text-char}{\T{char}}
 .. |Tspace| mathdef:: \xref{text/lexical}{text-space}{\T{space}}
 .. |Tformat| mathdef:: \xref{text/lexical}{text-format}{\T{format}}
+.. |Tnewline| mathdef:: \xref{text/lexical}{text-newline}{\T{newline}}
 
 .. |Ttoken| mathdef:: \xref{text/lexical}{text-token}{\T{token}}
 .. |Tkeyword| mathdef:: \xref{text/lexical}{text-keyword}{\T{keyword}}
diff --git a/interpreter/Makefile b/interpreter/Makefile
index 5939f838ec..dfd9064bbf 100644
--- a/interpreter/Makefile
+++ b/interpreter/Makefile
@@ -33,7 +33,7 @@ zip:		$(ZIP)
 
 # Building
 
-.PHONY:                $(NAME) $(JSLIB)
+.PHONY:		$(NAME) $(JSLIB)
 
 $(NAME):
 	rm -f $@
diff --git a/interpreter/text/lexer.mll b/interpreter/text/lexer.mll
index d9a12b5d21..4df23f1aad 100644
--- a/interpreter/text/lexer.mll
+++ b/interpreter/text/lexer.mll
@@ -27,9 +27,9 @@ let string s =
   while !i < String.length s - 1 do
     let c = if s.[!i] <> '\\' then s.[!i] else
       match (incr i; s.[!i]) with
-      | 'n' -> '\n'
-      | 'r' -> '\r'
-      | 't' -> '\t'
+      | 'n' -> '\x0a'
+      | 'r' -> '\x0d'
+      | 't' -> '\x09'
       | '\\' -> '\\'
       | '\'' -> '\''
       | '\"' -> '\"'
@@ -61,21 +61,25 @@ let letter = ['a'-'z''A'-'Z']
 let symbol =
   ['+''-''*''/''\\''^''~''=''<''>''!''?''@''#''$''%''&''|'':''`''.''\'']
 
-let space = [' ''\t''\n''\r']
+let ascii_newline = ['\x0a''\x0d']
+let newline = ascii_newline | "\x0d\x0a" | "\xc2\x85"  (* LF | CR | CR LF | NEL (U+0085 as UTF-8) *)
+let space = [' ''\x09''\x0a''\x0d']
 let control = ['\x00'-'\x1f'] # space
 let ascii = ['\x00'-'\x7f']
-let ascii_no_nl = ascii # '\x0a'
+let ascii_no_nl = ascii # ascii_newline
 let utf8cont = ['\x80'-'\xbf']
-let utf8enc =
-    ['\xc2'-'\xdf'] utf8cont
+let utf8enc_no_nl =
+    ['\xc2'] utf8cont#['\x85']
+  | ['\xc3'-'\xdf'] utf8cont
   | ['\xe0'] ['\xa0'-'\xbf'] utf8cont
   | ['\xed'] ['\x80'-'\x9f'] utf8cont
   | ['\xe1'-'\xec''\xee'-'\xef'] utf8cont utf8cont
   | ['\xf0'] ['\x90'-'\xbf'] utf8cont utf8cont
   | ['\xf4'] ['\x80'-'\x8f'] utf8cont utf8cont
   | ['\xf1'-'\xf3'] utf8cont utf8cont utf8cont
+let utf8enc = utf8enc_no_nl | "\xc2\x85"
 let utf8 = ascii | utf8enc
-let utf8_no_nl = ascii_no_nl | utf8enc
+let utf8_no_nl = ascii_no_nl | utf8enc_no_nl
 
 let escape = ['n''r''t''\\''\'''\"']
 let character =
@@ -127,8 +131,8 @@ rule token = parse
   | float as s { FLOAT s }
 
   | string as s { STRING (string s) }
-  | '"'character*('\n'|eof) { error lexbuf "unclosed string literal" }
-  | '"'character*['\x00'-'\x09''\x0b'-'\x1f''\x7f']
+  | '"'character*(newline|eof) { error lexbuf "unclosed string literal" }
+  | '"'character*(['\x00'-'\x1f''\x7f']#ascii_newline)  (* incl. tab and DEL *)
     { error lexbuf "illegal control character in string literal" }
   | '"'character*'\\'_
     { error_nest (Lexing.lexeme_end_p lexbuf) lexbuf "illegal escape" }
@@ -698,11 +702,11 @@ rule token = parse
   | id as s { VAR s }
 
   | ";;"utf8_no_nl*eof { EOF }
-  | ";;"utf8_no_nl*'\n' { Lexing.new_line lexbuf; token lexbuf }
+  | ";;"utf8_no_nl*newline { Lexing.new_line lexbuf; token lexbuf }
   | ";;"utf8_no_nl* { token lexbuf (* causes error on following position *) }
   | "(;" { comment (Lexing.lexeme_start_p lexbuf) lexbuf; token lexbuf }
-  | space#'\n' { token lexbuf }
-  | '\n' { Lexing.new_line lexbuf; token lexbuf }
+  | space#ascii_newline { token lexbuf }
+  | newline { Lexing.new_line lexbuf; token lexbuf }
   | eof { EOF }
 
   | reserved { unknown lexbuf }
@@ -713,7 +717,7 @@ rule token = parse
 and comment start = parse
   | ";)" { () }
   | "(;" { comment (Lexing.lexeme_start_p lexbuf) lexbuf; comment start lexbuf }
-  | '\n' { Lexing.new_line lexbuf; comment start lexbuf }
+  | newline { Lexing.new_line lexbuf; comment start lexbuf }
   | utf8_no_nl { comment start lexbuf }
   | eof { error_nest start lexbuf "unclosed comment" }
   | _ { error lexbuf "malformed UTF-8 encoding" }
diff --git a/test/core/comments.wast b/test/core/comments.wast
index c291370fa72141c02e627c6edf0442bd409129cf..c111c0e07f2839b46aa5e6150d2c49697ab4c914 100644
GIT binary patch
literal 1485
zcmbtT$&M2-5M_rkh<#s6eR4|#Wf=sLPY7wmT)2S5B@)VXS2Grl4Q@9x5kmX}C&Yy_
z|G;fNXnuimI_-86%^|@Dca`h;y;|Htz*{UGWM(`@?Vy}!H`-@Rh}nrZ!(t3*MKK;Z
zW0O}3+hE$7#F2U)zd^lo!5H@^w9lk!XTe)^LoTzsJS<O4WNEwR*pqp0(VhB~-HSIb
zx3{6&?ew-D!?P_SJa+uV$y2A#oISU}&R<{`FI~QJ_1g6tH*ej(bNAl;2M-^8_r!El
zS4wquCJv!6N|`;(K~ht5IdZ5burmSFdB+r;rq?+tIhGEs7x5(YeTDMiH^RGvzqWMQ
zF6uh#!=JwnfBc!NfJI&NQ<<)fg=Pi}j{}ii4=p;N9KxkEvZFb1q3OwBF4BOEqC`-K
zS%{NGfLe-PP+)D4TojdcyJ$ta%sd@xBh=;=DYozI2%P|E1Vd(Y!S|h}_Zs(<)Cd+c
z^Fd1|jSe}(=gK&w%oAQzIs=aTpNw@dOp4wxNr77|E9F2A`f1`}R3q6>GNb9&>NRJi
zGOJ_Q9W^+MuChAD_tW}91FF3x<^Gz=`F|@9)>UpQqV}v^_U3xx)z{}wzpYfce|Mv@
hMC;y%Wc3&G5rL-ehu%(@CU!KozXEo!0(Ns5`w#b+pcMcB

delta 14
VcmX@h-NLrPfRT}RvmxVUMgSon1PcHF