diff --git a/unicode.c b/unicode.c index b21b5c1b2..4d84791b6 100644 --- a/unicode.c +++ b/unicode.c @@ -32,46 +32,10 @@ int smb1_utf16_name_length(const __le16 *from, int maxbytes) } #endif -/* - * smb_utf16_bytes() - how long will a string be after conversion? - * @from: pointer to input string - * @maxbytes: don't go past this many bytes of input string - * @codepage: destination codepage - * - * Walk a utf16le string and return the number of bytes that the string will - * be after being converted to the given charset, not including any null - * termination required. Don't walk past maxbytes in the source buffer. - * - * Return: string length after conversion - */ -static int smb_utf16_bytes(const __le16 *from, int maxbytes, - const struct nls_table *codepage) -{ - int i; - int charlen, outlen = 0; - int maxwords = maxbytes / 2; - char tmp[NLS_MAX_CHARSET_SIZE]; - __u16 ftmp; - - for (i = 0; i < maxwords; i++) { - ftmp = get_unaligned_le16(&from[i]); - if (ftmp == 0) - break; - - charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE); - if (charlen > 0) - outlen += charlen; - else - outlen++; - } - - return outlen; -} - /* * cifs_mapchar() - convert a host-endian char to proper char in codepage * @target: where converted character should be copied - * @src_char: 2 byte host-endian source character + * @from: host-endian source string * @cp: codepage to which character should be converted * @mapchar: should character be mapped according to mapchars mount option? * @@ -82,10 +46,13 @@ static int smb_utf16_bytes(const __le16 *from, int maxbytes, * Return: string length after conversion */ static int -cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, +cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp, bool mapchar) { int len = 1; + __u16 src_char; + + src_char = *from; if (!mapchar) goto cp_convert; @@ -123,12 +90,68 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, cp_convert: len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); - if (len <= 0) { - *target = '?'; - len = 1; - } + if (len <= 0) + goto surrogate_pair; goto out; + +surrogate_pair: + /* convert SURROGATE_PAIR and IVS */ + if (strcmp(cp->charset, "utf8")) + goto unknown; + len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6); + if (len <= 0) + goto unknown; + return len; + +unknown: + *target = '?'; + len = 1; + goto out; +} + +/* + * smb_utf16_bytes() - how long will a string be after conversion? + * @from: pointer to input string + * @maxbytes: don't go past this many bytes of input string + * @codepage: destination codepage + * + * Walk a utf16le string and return the number of bytes that the string will + * be after being converted to the given charset, not including any null + * termination required. Don't walk past maxbytes in the source buffer. + * + * Return: string length after conversion + */ +static int smb_utf16_bytes(const __le16 *from, int maxbytes, + const struct nls_table *codepage) +{ + int i; + int charlen, outlen = 0; + int maxwords = maxbytes / 2; + char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp[3]; + + for (i = 0; i < maxwords; i++) { + ftmp[0] = get_unaligned_le16(&from[i]); + if (ftmp[0] == 0) + break; + if (i + 1 < maxwords) + ftmp[1] = get_unaligned_le16(&from[i + 1]); + else + ftmp[1] = 0; + if (i + 2 < maxwords) + ftmp[2] = get_unaligned_le16(&from[i + 2]); + else + ftmp[2] = 0; + + charlen = cifs_mapchar(tmp, ftmp, codepage, 0); + if (charlen > 0) + outlen += charlen; + else + outlen++; + } + + return outlen; } /* @@ -181,7 +204,7 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, int nullsize = nls_nullsize(codepage); int fromwords = fromlen / 2; char tmp[NLS_MAX_CHARSET_SIZE]; - __u16 ftmp; + __u16 ftmp[3]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */ /* * because the chars can be of varying widths, we need to take care @@ -192,9 +215,17 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); for (i = 0; i < fromwords; i++) { - ftmp = get_unaligned_le16(&from[i]); - if (ftmp == 0) + ftmp[0] = get_unaligned_le16(&from[i]); + if (ftmp[0] == 0) break; + if (i + 1 < fromwords) + ftmp[1] = get_unaligned_le16(&from[i + 1]); + else + ftmp[1] = 0; + if (i + 2 < fromwords) + ftmp[2] = get_unaligned_le16(&from[i + 2]); + else + ftmp[2] = 0; /* * check to see if converting this character might make the @@ -209,6 +240,19 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, /* put converted char into 'to' buffer */ charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar); outlen += charlen; + + /* + * charlen (=bytes of UTF-8 for 1 character) + * 4bytes UTF-8(surrogate pair) is charlen=4 + * (4bytes UTF-16 code) + * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4 + * (2 UTF-8 pairs divided to 2 UTF-16 pairs) + */ + if (charlen == 4) + i++; + else if (charlen >= 5) + /* 5-6bytes UTF-8 */ + i += 2; } /* properly null-terminate string */ @@ -343,10 +387,15 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen, char src_char; __le16 dst_char; wchar_t tmp; + wchar_t *wchar_to; /* UTF-16 */ + int ret; + unicode_t u; if (!mapchars) return smb_strtoUTF16(target, source, srclen, cp); + wchar_to = kzalloc(6, GFP_KERNEL); + for (i = 0, j = 0; i < srclen; j++) { src_char = source[i]; charlen = 1; @@ -385,11 +434,55 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen, * if no match, use question mark, which at least in * some cases serves as wild card */ - if (charlen < 1) { - dst_char = cpu_to_le16(0x003f); - charlen = 1; + if (charlen > 0) + goto ctoUTF16; + + /* convert SURROGATE_PAIR */ + if (strcmp(cp->charset, "utf8") || !wchar_to) + goto unknown; + if (*(source + i) & 0x80) { + charlen = utf8_to_utf32(source + i, 6, &u); + if (charlen < 0) + goto unknown; + } else + goto unknown; + ret = utf8s_to_utf16s(source + i, charlen, + UTF16_LITTLE_ENDIAN, + wchar_to, 6); + if (ret < 0) + goto unknown; + + i += charlen; + dst_char = cpu_to_le16(*wchar_to); + if (charlen <= 3) + /* 1-3bytes UTF-8 to 2bytes UTF-16 */ + put_unaligned(dst_char, &target[j]); + else if (charlen == 4) { + /* 4bytes UTF-8(surrogate pair) to 4bytes UTF-16 + * 7-8bytes UTF-8(IVS) divided to 2 UTF-16 + * (charlen=3+4 or 4+4) */ + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 1)); + j++; + put_unaligned(dst_char, &target[j]); + } else if (charlen >= 5) { + /* 5-6bytes UTF-8 to 6bytes UTF-16 */ + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 1)); + j++; + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 2)); + j++; + put_unaligned(dst_char, &target[j]); } + continue; + +unknown: + dst_char = cpu_to_le16(0x003f); + charlen = 1; } + +ctoUTF16: /* * character may take more than one byte in the source string, * but will take exactly two bytes in the target string