-
Notifications
You must be signed in to change notification settings - Fork 63
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ksmbd: add support for surrogate pair conversion
ksmbd does not support converting filenames that contain surrogate pair characters. This triggers a "file or folder does not exist" error in the Windows client. [Steps to reproduce the bug] # touch $(echo -e '\xf0\x9d\x9f\xa3') # touch $(echo -e '\xf0\x9d\x9f\xa4') Try to open these files in a ksmbd share through a Windows client. This patch updates the unicode functions, which previously did not take surrogate pairs (and IVS) into account, so that they handle them. Signed-off-by: Namjae Jeon <[email protected]>
- Loading branch information
1 parent
b7e1fb4
commit f389804
Showing
1 changed file
with
138 additions
and
49 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,46 +32,10 @@ int smb1_utf16_name_length(const __le16 *from, int maxbytes) | |
} | ||
#endif | ||
|
||
/* | ||
* smb_utf16_bytes() - compute the byte length of a UTF-16LE string after conversion | ||
* @from: pointer to the UTF-16LE input string, read via get_unaligned_le16() | ||
* @maxbytes: input string length in bytes; the walk never goes past it | ||
* @codepage: destination codepage | ||
* | ||
* Walk a utf16le string and return the number of bytes that the string will | ||
* be after being converted to the given charset, not including any null | ||
* termination required. The walk stops at the first zero code unit or after | ||
* maxbytes / 2 code units, whichever comes first. | ||
* | ||
* Every 16-bit code unit is converted on its own with codepage->uni2char(). | ||
* A unit for which uni2char() returns a non-positive value is counted as | ||
* one byte. | ||
* | ||
* Return: string length after conversion | ||
*/ | ||
static int smb_utf16_bytes(const __le16 *from, int maxbytes, | ||
const struct nls_table *codepage) | ||
{ | ||
int i; | ||
int charlen, outlen = 0; | ||
int maxwords = maxbytes / 2; | ||
char tmp[NLS_MAX_CHARSET_SIZE]; | ||
__u16 ftmp; | ||
|
||
for (i = 0; i < maxwords; i++) { | ||
ftmp = get_unaligned_le16(&from[i]); | ||
if (ftmp == 0) | ||
break; | ||
|
||
charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE); | ||
if (charlen > 0) | ||
outlen += charlen; | ||
else | ||
outlen++; | ||
} | ||
|
||
return outlen; | ||
} | ||
|
||
/* | ||
* cifs_mapchar() - convert a host-endian char to proper char in codepage | ||
* @target: where converted character should be copied | ||
* @src_char: 2 byte host-endian source character | ||
* @from: host-endian source string | ||
* @cp: codepage to which character should be converted | ||
* @mapchar: should character be mapped according to mapchars mount option? | ||
* | ||
|
@@ -82,10 +46,13 @@ static int smb_utf16_bytes(const __le16 *from, int maxbytes, | |
* Return: string length after conversion | ||
*/ | ||
static int | ||
cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, | ||
cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp, | ||
bool mapchar) | ||
{ | ||
int len = 1; | ||
__u16 src_char; | ||
|
||
src_char = *from; | ||
|
||
if (!mapchar) | ||
goto cp_convert; | ||
|
@@ -123,12 +90,66 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, | |
|
||
cp_convert: | ||
len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); | ||
if (len <= 0) { | ||
*target = '?'; | ||
len = 1; | ||
} | ||
if (len <= 0) | ||
goto surrogate_pair; | ||
|
||
goto out; | ||
|
||
surrogate_pair: | ||
/* convert SURROGATE_PAIR and IVS */ | ||
if (strcmp(cp->charset, "utf8")) | ||
goto unknown; | ||
len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6); | ||
if (len <= 0) | ||
goto unknown; | ||
return len; | ||
|
||
unknown: | ||
*target = '?'; | ||
len = 1; | ||
goto out; | ||
} | ||
|
||
/* | ||
* smb_utf16_bytes() - compute the byte length of a UTF-16LE string after conversion | ||
* @from: pointer to the UTF-16LE input string, read via get_unaligned_le16() | ||
* @maxbytes: input string length in bytes; the walk never goes past it | ||
* @codepage: destination codepage | ||
* | ||
* Walk a utf16le string and return the number of bytes that the string will | ||
* be after being converted to the given charset, not including any null | ||
* termination required. The walk stops at the first zero code unit or after | ||
* maxbytes / 2 code units, whichever comes first. | ||
* | ||
* Each code unit is handed to cifs_mapchar() (with mapchar set to 0) together | ||
* with up to two following code units, zero-padded past the end of the input, | ||
* so that cifs_mapchar() can see a whole multi-unit sequence such as a | ||
* surrogate pair. A non-positive result is counted as one byte. | ||
* | ||
* NOTE(review): the index advances by one code unit per iteration whatever | ||
* charlen is, whereas smb_from_utf16() skips extra units when charlen >= 4. | ||
* The result therefore looks like an upper bound rather than an exact length | ||
* for such sequences - confirm that callers only need an upper bound. | ||
* | ||
* Return: string length after conversion | ||
*/ | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong. |
||
static int smb_utf16_bytes(const __le16 *from, int maxbytes, | ||
const struct nls_table *codepage) | ||
{ | ||
int i, j; | ||
int charlen, outlen = 0; | ||
int maxwords = maxbytes / 2; | ||
char tmp[NLS_MAX_CHARSET_SIZE]; | ||
__u16 ftmp[3]; | ||
|
||
for (i = 0; i < maxwords; i++) { | ||
ftmp[0] = get_unaligned_le16(&from[i]); | ||
if (ftmp[0] == 0) | ||
break; | ||
for (j = 1; j <= 2; j++) { | ||
if (i + j < maxwords) | ||
ftmp[j] = get_unaligned_le16(&from[i + j]); | ||
else | ||
ftmp[j] = 0; | ||
} | ||
|
||
charlen = cifs_mapchar(tmp, ftmp, codepage, 0); | ||
if (charlen > 0) | ||
outlen += charlen; | ||
else | ||
outlen++; | ||
} | ||
|
||
return outlen; | ||
} | ||
|
||
/* | ||
|
@@ -176,12 +197,12 @@ static inline int is_char_allowed(char *ch) | |
static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, | ||
const struct nls_table *codepage, bool mapchar) | ||
{ | ||
int i, charlen, safelen; | ||
int i, j, charlen, safelen; | ||
int outlen = 0; | ||
int nullsize = nls_nullsize(codepage); | ||
int fromwords = fromlen / 2; | ||
char tmp[NLS_MAX_CHARSET_SIZE]; | ||
__u16 ftmp; | ||
__u16 ftmp[3]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */ | ||
|
||
/* | ||
* because the chars can be of varying widths, we need to take care | ||
|
@@ -192,9 +213,15 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, | |
safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); | ||
|
||
for (i = 0; i < fromwords; i++) { | ||
ftmp = get_unaligned_le16(&from[i]); | ||
if (ftmp == 0) | ||
ftmp[0] = get_unaligned_le16(&from[i]); | ||
if (ftmp[0] == 0) | ||
break; | ||
for (j = 1; j <= 2; j++) { | ||
if (i + j < fromwords) | ||
ftmp[j] = get_unaligned_le16(&from[i + j]); | ||
else | ||
ftmp[j] = 0; | ||
} | ||
|
||
/* | ||
* check to see if converting this character might make the | ||
|
@@ -209,6 +236,19 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, | |
/* put converted char into 'to' buffer */ | ||
charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar); | ||
outlen += charlen; | ||
|
||
/* | ||
* charlen (=bytes of UTF-8 for 1 character) | ||
* 4bytes UTF-8(surrogate pair) is charlen=4 | ||
* (4bytes UTF-16 code) | ||
* 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4 | ||
* (2 UTF-8 pairs divided to 2 UTF-16 pairs) | ||
*/ | ||
if (charlen == 4) | ||
i++; | ||
else if (charlen >= 5) | ||
/* 5-6bytes UTF-8 */ | ||
i += 2; | ||
} | ||
|
||
/* properly null-terminate string */ | ||
|
@@ -343,10 +383,15 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen, | |
char src_char; | ||
__le16 dst_char; | ||
wchar_t tmp; | ||
wchar_t *wchar_to; /* UTF-16 */ | ||
int ret; | ||
unicode_t u; | ||
|
||
if (!mapchars) | ||
return smb_strtoUTF16(target, source, srclen, cp); | ||
|
||
wchar_to = kzalloc(6, GFP_KERNEL); | ||
This comment has been minimized.
Sorry, something went wrong.
mmakassikis
|
||
|
||
for (i = 0, j = 0; i < srclen; j++) { | ||
src_char = source[i]; | ||
charlen = 1; | ||
|
@@ -385,11 +430,55 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen, | |
* if no match, use question mark, which at least in | ||
* some cases serves as wild card | ||
*/ | ||
if (charlen < 1) { | ||
dst_char = cpu_to_le16(0x003f); | ||
charlen = 1; | ||
if (charlen > 0) | ||
goto ctoUTF16; | ||
|
||
/* convert SURROGATE_PAIR */ | ||
if (strcmp(cp->charset, "utf8") || !wchar_to) | ||
goto unknown; | ||
if (*(source + i) & 0x80) { | ||
charlen = utf8_to_utf32(source + i, 6, &u); | ||
if (charlen < 0) | ||
goto unknown; | ||
} else | ||
goto unknown; | ||
ret = utf8s_to_utf16s(source + i, charlen, | ||
UTF16_LITTLE_ENDIAN, | ||
wchar_to, 6); | ||
if (ret < 0) | ||
goto unknown; | ||
|
||
i += charlen; | ||
dst_char = cpu_to_le16(*wchar_to); | ||
if (charlen <= 3) | ||
/* 1-3bytes UTF-8 to 2bytes UTF-16 */ | ||
put_unaligned(dst_char, &target[j]); | ||
else if (charlen == 4) { | ||
/* 4bytes UTF-8(surrogate pair) to 4bytes UTF-16 | ||
* 7-8bytes UTF-8(IVS) divided to 2 UTF-16 | ||
* (charlen=3+4 or 4+4) */ | ||
put_unaligned(dst_char, &target[j]); | ||
dst_char = cpu_to_le16(*(wchar_to + 1)); | ||
j++; | ||
put_unaligned(dst_char, &target[j]); | ||
} else if (charlen >= 5) { | ||
/* 5-6bytes UTF-8 to 6bytes UTF-16 */ | ||
put_unaligned(dst_char, &target[j]); | ||
dst_char = cpu_to_le16(*(wchar_to + 1)); | ||
j++; | ||
put_unaligned(dst_char, &target[j]); | ||
dst_char = cpu_to_le16(*(wchar_to + 2)); | ||
j++; | ||
put_unaligned(dst_char, &target[j]); | ||
} | ||
continue; | ||
|
||
unknown: | ||
dst_char = cpu_to_le16(0x003f); | ||
charlen = 1; | ||
} | ||
|
||
ctoUTF16: | ||
/* | ||
* character may take more than one byte in the source string, | ||
* but will take exactly two bytes in the target string | ||
|
minor nits:
"how long will a string be after conversion?" -> "compute converted string length"
"don't go past this many bytes of input string" -> "input string length"