WIP Step 3

JsonLex.lex: on retourne la chaine convertie au mieux dans le cas STRINGERROR

TextService
- GetValidUTF8CharLengthAt, GetValidUTF8SubStringLength: prennent desormais un char* en operande
- JsonToCString: ameliorations
KhiopsML · Nov 5, 2024 · 3a10d63 · 3a10d63
1 parent 0f72bc7
commit 3a10d63
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 40 deletions.
diff --git a/src/Norm/base/JsonLex.inc b/src/Norm/base/JsonLex.inc
@@ -1116,17 +1116,14 @@ YY_RULE_SETUP
                     // On retourne la chaine convertie en cas de succes
                     if (bOk)
                         return STRINGVALUE;
-                    // Sinon, on retourne la chaine originelle
+                    // Sinon, on retourne la chaine convertie au mieux, mais en tant que chaine erronnee
                     else
-                    {
-                        *sValue = (char*)&yytext[1];
                         return STRINGERROR;
-                    }
                 }
 	YY_BREAK
 case 11:
 YY_RULE_SETUP
-#line 70 "JsonLex.lex"
+#line 67 "JsonLex.lex"
 {
                     char* endptr;
                     double dValue;
@@ -1139,12 +1136,12 @@ YY_RULE_SETUP
 case 12:
 /* rule 12 can match eol */
 YY_RULE_SETUP
-#line 79 "JsonLex.lex"
+#line 76 "JsonLex.lex"
 {/*IGNORE*/}
 	YY_BREAK
 case 13:
 YY_RULE_SETUP
-#line 81 "JsonLex.lex"
+#line 78 "JsonLex.lex"
 {
                     ALString *sValue;
 
@@ -1155,10 +1152,10 @@ YY_RULE_SETUP
 	YY_BREAK
 case 14:
 YY_RULE_SETUP
-#line 89 "JsonLex.lex"
+#line 86 "JsonLex.lex"
 ECHO;
 	YY_BREAK
-#line 1161 "C:/Applications/boullema/DevGit/khiops/src/Norm/base/JsonLex.inc"
+#line 1158 "C:/Applications/boullema/DevGit/khiops/src/Norm/base/JsonLex.inc"
 case YY_STATE_EOF(INITIAL):
 	yyterminate();
 
@@ -2175,5 +2172,5 @@ void yyfree (void * ptr )
 
 #define YYTABLES_NAME "yytables"
 
-#line 89 "JsonLex.lex"
+#line 86 "JsonLex.lex"
 
diff --git a/src/Norm/base/JsonLex.lex b/src/Norm/base/JsonLex.lex
@@ -59,12 +59,9 @@ null            {return NULLVALUE;}
                     // On retourne la chaine convertie en cas de succes
                     if (bOk)
                         return STRINGVALUE;
-                    // Sinon, on retourne la chaine originelle
+                    // Sinon, on retourne la chaine convertie au mieux, mais en tant que chaine erronnee
                     else
-                    {
-                        *sValue = (char*)&yytext[1];
                         return STRINGERROR;
-                    }
                 }
 
 {NUMBER}        {

diff --git a/src/Norm/base/TextService.cpp b/src/Norm/base/TextService.cpp
@@ -400,6 +400,7 @@ boolean TextService::JsonToCString(const char* sJsonString, ALString& sCString)
 	const char* sCharsToAdd;
 	ALString sUnicodeChars;
 	int nCharNumber;
+	int nUTF8CharLength;
 
 	require(sJsonString != NULL);
 
@@ -422,6 +423,7 @@ boolean TextService::JsonToCString(const char* sJsonString, ALString& sCString)
 	{
 		if (sInputString[nEnd] == '\\')
 		{
+			// On concatene ce qui precede
 			AppendSubString(sCString, sJsonString, nBegin, nEnd - nBegin);
 			nEnd++;
 			assert(nEnd < nLength);
@@ -462,6 +464,7 @@ boolean TextService::JsonToCString(const char* sJsonString, ALString& sCString)
 				{
 					bOk = false;
 					sCharsToAdd = "?";
+					assert(nCharNumber == 1);
 					break;
 				}
 				assert(nEnd < nLength);
@@ -517,6 +520,7 @@ boolean TextService::JsonToCString(const char* sJsonString, ALString& sCString)
 						{
 							bOk = false;
 							sCharsToAdd = "?";
+							assert(nCharNumber == 1);
 							break;
 						}
 					}
@@ -540,9 +544,9 @@ boolean TextService::JsonToCString(const char* sJsonString, ALString& sCString)
 				// En principe, impossible avec une chaine json correctement formee
 				// Dans ce cas, on avance d'un caractere, avec une erreur
 				bOk = false;
-				assert(nCharNumber == 1);
 				sCharsToAdd = "?";
-				nEnd++;
+				assert(nCharNumber == 1);
+				break;
 			}
 			if (nCharNumber == 1)
 				sCString += sCharsToAdd[0];
@@ -553,14 +557,18 @@ boolean TextService::JsonToCString(const char* sJsonString, ALString& sCString)
 		}
 		else
 		{
-			nEnd++;
+			nUTF8CharLength = GetValidUTF8CharLengthAt(sJsonString, nEnd);
+			if (nUTF8CharLength > 0)
+				nEnd += nUTF8CharLength;
+			else
+			{
+				// Caractere Utf8 invalide: on avance de 1 avec une erreur
+				bOk = false;
+				nEnd++;
+			}
 		}
 	}
 	AppendSubString(sCString, sJsonString, nBegin, nEnd - nBegin);
-
-	// Verification de l'encodage utf8, sauf si on a incorpore des caracteres ansi en mode ForceAnsi
-	if (bOk and not bContainsAnsiChars)
-		bOk = GetValidUTF8SubStringLength(sCString) == sCString.GetLength();
 	return bOk;
 }
 
@@ -1016,69 +1024,72 @@ const ALString TextService::ToPrintable(const ALString& sBytes)
 	return sPrintableBytes;
 }
 
-int TextService::GetValidUTF8CharLengthAt(const ALString& sValue, int nStart)
+int TextService::GetValidUTF8CharLengthAt(const char* sValue, int nStart)
 {
 	int nUtf8CharLength;
-	int c;
-	int nLength;
+	unsigned char c;
 
-	require(0 <= nStart and nStart < sValue.GetLength());
+	require(sValue != NULL);
+	require(0 <= nStart and sValue[nStart] != '\0');
 
 	// Initialisations
 	nUtf8CharLength = 0;
-	nLength = sValue.GetLength();
-	c = (unsigned char)sValue.GetAt(nStart);
+	c = (unsigned char)sValue[nStart];
 
 	// Cas d'un caractere ascii 0bbbbbbb
 	if (0x00 <= c and c <= 0x7f)
 		nUtf8CharLength = 1;
 	// Debut d'un caractere UTF8 sur deux octets 110bbbbb
 	else if ((c & 0xE0) == 0xC0)
 	{
-		if (nStart + 1 < nLength and ((unsigned char)sValue.GetAt(nStart + 1) & 0xC0) == 0x80)
+		if (((unsigned char)sValue[nStart + 1] & 0xC0) == 0x80)
 			nUtf8CharLength = 2;
 		else
 			nUtf8CharLength = 0;
 	}
 	// Debut d'un caractere UTF8 sur trois octets 1110bbbb
 	else if ((c & 0xF0) == 0xE0)
 	{
-		if (nStart + 2 < nLength and ((unsigned char)sValue.GetAt(nStart + 1) & 0xC0) == 0x80 and
-		    ((unsigned char)sValue.GetAt(nStart + 2) & 0xC0) == 0x80)
+		// Test sans risque, puisque le second caractere n'est pas teste si le premier vaut '\0'
+		if (((unsigned char)sValue[nStart + 1] & 0xC0) == 0x80 and
+		    ((unsigned char)sValue[nStart + 2] & 0xC0) == 0x80)
 			nUtf8CharLength = 3;
 		else
 			nUtf8CharLength = 0;
 	}
 	// Debut d'un caractere UTF8 sur quatre octets 11110bbb
 	else if ((c & 0xF8) == 0xF0)
 	{
-		if (nStart + 3 < nLength and ((unsigned char)sValue.GetAt(nStart + 1) & 0xC0) == 0x80 and
-		    ((unsigned char)sValue.GetAt(nStart + 2) & 0xC0) == 0x80 and
-		    ((unsigned char)sValue.GetAt(nStart + 3) & 0xC0) == 0x80)
+		// Test sans risque, puisque le troisieme caractere n'est pas teste si un des premiers vaut '\0'
+		if (((unsigned char)sValue[nStart + 1] & 0xC0) == 0x80 and
+		    ((unsigned char)sValue[nStart + 2] & 0xC0) == 0x80 and
+		    ((unsigned char)sValue[nStart + 3] & 0xC0) == 0x80)
 			nUtf8CharLength = 4;
 		else
 			nUtf8CharLength = 0;
 	}
 	return nUtf8CharLength;
 }
 
-int TextService::GetValidUTF8SubStringLength(const ALString& sValue)
+int TextService::GetValidUTF8SubStringLength(const char* sValue)
 {
-	int nLength;
 	int nUTF8CharLength;
+	int nLength;
+
+	require(sValue != NULL);
 
 	// Parcours de la chaine jusqu'au premiere catactere non UTF8
 	nLength = 0;
-	while (nLength < sValue.GetLength())
+	while (sValue[nLength] != '\0')
 	{
 		nUTF8CharLength = GetValidUTF8CharLengthAt(sValue, nLength);
 		if (nUTF8CharLength > 0)
 			nLength += nUTF8CharLength;
 		else
 			break;
 	}
-	assert(nLength <= sValue.GetLength());
-	assert(nLength == sValue.GetLength() or GetValidUTF8CharLengthAt(sValue, nLength) == 0);
+	assert(nLength <= (int)strlen(sValue));
+	assert(nLength == (int)strlen(sValue) or GetValidUTF8CharLengthAt(sValue, nLength) == 0);
 	return nLength;
 }
 

diff --git a/src/Norm/base/TextService.h b/src/Norm/base/TextService.h
@@ -138,10 +138,10 @@ class TextService : public Object
 
 	// Longueur en bytes d'un caractere UTF8 valide a partir d'une position donnee
 	// Retourne 1 a 4 dans le cas d'un caractere valide, 0 sinon pour un caractere ANSI non encodable directement
-	static int GetValidUTF8CharLengthAt(const ALString& sValue, int nStart);
+	static int GetValidUTF8CharLengthAt(const char* sValue, int nStart);
 
 	// Longueur en bytes de la sous-partie d'une chaine encodee avec des caracteres UTF8 valide
-	static int GetValidUTF8SubStringLength(const ALString& sValue);
+	static int GetValidUTF8SubStringLength(const char* sValue);
 
 	// Construction d'un echantillon de textes basiques pour des tests
 	static void BuildTextSample(StringVector* svTextValues);