Skip to content

Commit

Permalink
WIP Step 3
Browse files Browse the repository at this point in the history
JsonLex.lex: on retourne la chaine convertie au mieux dans le cas STRINGERROR

TextService
- GetValidUTF8CharLengthAt, GetValidUTF8SubStringLength: prend desormais un char* en operande
- JsonToCString: ameliorations
  • Loading branch information
marcboulle committed Nov 5, 2024
1 parent 0f72bc7 commit 3a10d63
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 40 deletions.
17 changes: 7 additions & 10 deletions src/Norm/base/JsonLex.inc
Original file line number Diff line number Diff line change
Expand Up @@ -1116,17 +1116,14 @@ YY_RULE_SETUP
// On retourne la chaine convertie en cas de succes
if (bOk)
return STRINGVALUE;
// Sinon, on retourne la chaine originelle
// Sinon, on retourne la chaine convertie au mieux, mais en tant que chaine erronee
else
{
*sValue = (char*)&yytext[1];
return STRINGERROR;
}
}
YY_BREAK
case 11:
YY_RULE_SETUP
#line 70 "JsonLex.lex"
#line 67 "JsonLex.lex"
{
char* endptr;
double dValue;
Expand All @@ -1139,12 +1136,12 @@ YY_RULE_SETUP
case 12:
/* rule 12 can match eol */
YY_RULE_SETUP
#line 79 "JsonLex.lex"
#line 76 "JsonLex.lex"
{/*IGNORE*/}
YY_BREAK
case 13:
YY_RULE_SETUP
#line 81 "JsonLex.lex"
#line 78 "JsonLex.lex"
{
ALString *sValue;

Expand All @@ -1155,10 +1152,10 @@ YY_RULE_SETUP
YY_BREAK
case 14:
YY_RULE_SETUP
#line 89 "JsonLex.lex"
#line 86 "JsonLex.lex"
ECHO;
YY_BREAK
#line 1161 "C:/Applications/boullema/DevGit/khiops/src/Norm/base/JsonLex.inc"
#line 1158 "C:/Applications/boullema/DevGit/khiops/src/Norm/base/JsonLex.inc"
case YY_STATE_EOF(INITIAL):
yyterminate();

Expand Down Expand Up @@ -2175,5 +2172,5 @@ void yyfree (void * ptr )

#define YYTABLES_NAME "yytables"

#line 89 "JsonLex.lex"
#line 86 "JsonLex.lex"

5 changes: 1 addition & 4 deletions src/Norm/base/JsonLex.lex
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,9 @@ null {return NULLVALUE;}
// On retourne la chaine convertie en cas de succes
if (bOk)
return STRINGVALUE;
// Sinon, on retourne la chaine originelle
// Sinon, on retourne la chaine convertie au mieux, mais en tant que chaine erronee
else
{
*sValue = (char*)&yytext[1];
return STRINGERROR;
}
}

{NUMBER} {
Expand Down
59 changes: 35 additions & 24 deletions src/Norm/base/TextService.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,7 @@ boolean TextService::JsonToCString(const char* sJsonString, ALString& sCString)
const char* sCharsToAdd;
ALString sUnicodeChars;
int nCharNumber;
int nUTF8CharLength;

require(sJsonString != NULL);

Expand All @@ -422,6 +423,7 @@ boolean TextService::JsonToCString(const char* sJsonString, ALString& sCString)
{
if (sInputString[nEnd] == '\\')
{
// On concatene ce qui precede
AppendSubString(sCString, sJsonString, nBegin, nEnd - nBegin);
nEnd++;
assert(nEnd < nLength);
Expand Down Expand Up @@ -462,6 +464,7 @@ boolean TextService::JsonToCString(const char* sJsonString, ALString& sCString)
{
bOk = false;
sCharsToAdd = "?";
assert(nCharNumber == 1);
break;
}
assert(nEnd < nLength);
Expand Down Expand Up @@ -517,6 +520,7 @@ boolean TextService::JsonToCString(const char* sJsonString, ALString& sCString)
{
bOk = false;
sCharsToAdd = "?";
assert(nCharNumber == 1);
break;
}
}
Expand All @@ -540,9 +544,9 @@ boolean TextService::JsonToCString(const char* sJsonString, ALString& sCString)
// En principe, impossible avec une chaine json correctement formee
// Dans ce cas, on avance d'un caractere, avec une erreur
bOk = false;
assert(nCharNumber == 1);
sCharsToAdd = "?";
nEnd++;
assert(nCharNumber == 1);
break;
}
if (nCharNumber == 1)
sCString += sCharsToAdd[0];
Expand All @@ -553,14 +557,18 @@ boolean TextService::JsonToCString(const char* sJsonString, ALString& sCString)
}
else
{
nEnd++;
nUTF8CharLength = GetValidUTF8CharLengthAt(sJsonString, nEnd);
if (nUTF8CharLength > 0)
nEnd += nUTF8CharLength;
else
{
// Caractere Utf8 invalide: on avance de 1 avec une erreur
bOk = false;
nEnd++;
}
}
}
AppendSubString(sCString, sJsonString, nBegin, nEnd - nBegin);

// Verification de l'encodage utf8, sauf si on a incorpore des caracteres ansi en mode ForceAnsi
if (bOk and not bContainsAnsiChars)
bOk = GetValidUTF8SubStringLength(sCString) == sCString.GetLength();
return bOk;
}

Expand Down Expand Up @@ -1016,69 +1024,72 @@ const ALString TextService::ToPrintable(const ALString& sBytes)
return sPrintableBytes;
}

// Returns the length in bytes (1 to 4) of the UTF-8 character starting at index nStart of sValue,
// or 0 when the lead byte, or one of the continuation bytes it requires, does not match a UTF-8 bit pattern
// Only the bit patterns of the bytes are checked (lead byte prefix, then 10bbbbbb continuation bytes)
int TextService::GetValidUTF8CharLengthAt(const ALString& sValue, int nStart)
int TextService::GetValidUTF8CharLengthAt(const char* sValue, int nStart)
{
int nUtf8CharLength;
int c;
int nLength;
unsigned char c;

require(0 <= nStart and nStart < sValue.GetLength());
require(sValue != NULL);
require(0 <= nStart and sValue[nStart] != '\0');

// Initialization: the result stays 0 when the lead byte matches none of the patterns below
nUtf8CharLength = 0;
nLength = sValue.GetLength();
c = (unsigned char)sValue.GetAt(nStart);
c = (unsigned char)sValue[nStart];

// ASCII character: 0bbbbbbb
if (0x00 <= c and c <= 0x7f)
nUtf8CharLength = 1;
// Lead byte of a two-byte UTF-8 character: 110bbbbb
else if ((c & 0xE0) == 0xC0)
{
// The following byte must be a continuation byte 10bbbbbb
// With a char* operand, the byte at nStart + 1 is at worst the terminating '\0' (assuming sValue is
// null-terminated), which fails the test
if (nStart + 1 < nLength and ((unsigned char)sValue.GetAt(nStart + 1) & 0xC0) == 0x80)
if (((unsigned char)sValue[nStart + 1] & 0xC0) == 0x80)
nUtf8CharLength = 2;
else
nUtf8CharLength = 0;
}
// Lead byte of a three-byte UTF-8 character: 1110bbbb
else if ((c & 0xF0) == 0xE0)
{
if (nStart + 2 < nLength and ((unsigned char)sValue.GetAt(nStart + 1) & 0xC0) == 0x80 and
((unsigned char)sValue.GetAt(nStart + 2) & 0xC0) == 0x80)
// Safe test: the second continuation byte is not read when the first one is '\0',
// because the evaluation of the condition stops at the first failing operand
if (((unsigned char)sValue[nStart + 1] & 0xC0) == 0x80 and
((unsigned char)sValue[nStart + 2] & 0xC0) == 0x80)
nUtf8CharLength = 3;
else
nUtf8CharLength = 0;
}
// Lead byte of a four-byte UTF-8 character: 11110bbb
else if ((c & 0xF8) == 0xF0)
{
if (nStart + 3 < nLength and ((unsigned char)sValue.GetAt(nStart + 1) & 0xC0) == 0x80 and
((unsigned char)sValue.GetAt(nStart + 2) & 0xC0) == 0x80 and
((unsigned char)sValue.GetAt(nStart + 3) & 0xC0) == 0x80)
// Safe test: the third continuation byte is not read when one of the preceding ones is '\0',
// because the evaluation of the condition stops at the first failing operand
if (((unsigned char)sValue[nStart + 1] & 0xC0) == 0x80 and
((unsigned char)sValue[nStart + 2] & 0xC0) == 0x80 and
((unsigned char)sValue[nStart + 3] & 0xC0) == 0x80)
nUtf8CharLength = 4;
else
nUtf8CharLength = 0;
}
return nUtf8CharLength;
}

// Returns the length in bytes of the leading part of sValue that is made only of characters
// accepted by GetValidUTF8CharLengthAt
// The result equals the length of the string when every character is accepted
int TextService::GetValidUTF8SubStringLength(const ALString& sValue)
int TextService::GetValidUTF8SubStringLength(const char* sValue)
{
int nLength;
int nUTF8CharLength;
int nLength;

require(sValue != NULL);

// Scan the string up to the first character that is not valid UTF-8
nLength = 0;
while (nLength < sValue.GetLength())
while (sValue[nLength] != '\0')
{
// Advance by the whole character when it is valid, stop at the first invalid one
nUTF8CharLength = GetValidUTF8CharLengthAt(sValue, nLength);
if (nUTF8CharLength > 0)
nLength += nUTF8CharLength;
else
break;
}
// Either the whole string was scanned, or the scan stopped on an invalid character
assert(nLength <= sValue.GetLength());
assert(nLength == sValue.GetLength() or GetValidUTF8CharLengthAt(sValue, nLength) == 0);
assert(nLength <= (int)strlen(sValue));
assert(nLength == (int)strlen(sValue) or GetValidUTF8CharLengthAt(sValue, nLength) == 0);
return nLength;
}

Expand Down
4 changes: 2 additions & 2 deletions src/Norm/base/TextService.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,10 +138,10 @@ class TextService : public Object

// Length in bytes of a valid UTF-8 character starting at a given position
// Returns 1 to 4 for a valid character, 0 otherwise, i.e. for an ANSI character that cannot be encoded directly
static int GetValidUTF8CharLengthAt(const ALString& sValue, int nStart);
static int GetValidUTF8CharLengthAt(const char* sValue, int nStart);

// Length in bytes of the sub-part of a string that is encoded with valid UTF-8 characters
static int GetValidUTF8SubStringLength(const ALString& sValue);
static int GetValidUTF8SubStringLength(const char* sValue);

// Construction d'un echantillon de textes basiques pour des tests
static void BuildTextSample(StringVector* svTextValues);
Expand Down

0 comments on commit 3a10d63

Please sign in to comment.