Skip to content

Commit

Permalink
Merge pull request #457 from JetBrains/add_utf_cpp
Browse files Browse the repository at this point in the history
Fix utf support in to_string
  • Loading branch information
ForNeVeR authored Dec 18, 2023
2 parents 310cf51 + 350a1af commit e945f60
Show file tree
Hide file tree
Showing 12 changed files with 956 additions and 64 deletions.
69 changes: 5 additions & 64 deletions rd-cpp/src/rd_core_cpp/src/main/std/to_string.h
Original file line number Diff line number Diff line change
@@ -1,20 +1,15 @@
// ReSharper disable CppUE4CodingStandardNamingViolationWarning
#ifndef RD_CPP_TO_STRING_H
#define RD_CPP_TO_STRING_H

#include <string>
#include <type_traits>
#include <thread>
#include <sstream>
#include <vector>
#include <atomic>
#include <future>
#include <locale>
#if defined(_MSC_VER) || defined(__APPLE__)
#include <codecvt>
#else
#include <limits>
#include <iconv.h>
#endif

#include "ww898/utf_converters.hpp"

#include <thirdparty.hpp>

Expand All @@ -34,63 +29,10 @@ inline std::string to_string(const char* val)
return val;
}

#if defined(_MSC_VER) || defined(__APPLE__)
// Thin wrapper around std::codecvt that declares its own public destructor.
// It is used below as the facet type handed to std::wstring_convert;
// presumably the public destructor is what lets the converter destroy the
// facet it owns (std::codecvt's own destructor is protected) - TODO confirm.
template<class InternT, class ExternT, class StateT>
struct codecvt : std::codecvt<InternT, ExternT, StateT>
{
    ~codecvt() = default;
};

inline std::string to_string(std::wstring const& val)
{
#if defined(__APPLE__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
using convert_type = codecvt<wchar_t, char, std::mbstate_t>;
std::wstring_convert<convert_type> converter;
return converter.to_bytes(val);
#if defined(__APPLE__)
#pragma clang diagnostic pop
#endif
}
#else
// Sentinel text returned by the iconv-based to_string overload below when a
// conversion step reports failure.
const std::string conv_error("Conversion Error");
// Converts a wide string to a narrow string through iconv, asking for a
// "WCHAR_T" -> "UTF-8" conversion.
// Returns an empty string for empty input, and conv_error when either the
// handle check after iconv_open or the iconv call itself signals failure.
// NOTE(review): this span comes from a rendered diff in which removed and
// added lines are interleaved; the last return statement refers to `val`,
// which is not declared in this function as shown, and presumably belongs to
// the replacement implementation - confirm against the real header.
inline std::string to_string(const std::wstring& wstr) {
std::string result;
if (wstr.empty()) {
return result;
}
// Order of arguments is to, from
auto icvt = iconv_open("UTF-8", "WCHAR_T");
// CentOS is not happy with -1
// NOTE(review): iconv_open is documented to return (iconv_t)-1 on failure;
// verify that numeric_limits<iconv_t>::max() really equals that value for the
// platform's iconv_t, otherwise a failed open is not detected here.
if (std::numeric_limits<iconv_t>::max() == icvt) {
return conv_error;
}

// I hope this does not modify the incoming buffer
wchar_t* non_const_in = const_cast<wchar_t*>(wstr.c_str());
char* iconv_in = reinterpret_cast<char*>(non_const_in);
size_t iconv_in_bytes = wstr.length() * sizeof(wchar_t);
// Temp buffer, assume every code point converts into 3 bytes, this should be enough
// We do not convert terminating zeros
// NOTE(review): UTF-8 uses 4 bytes for code points above U+FFFF, so where
// wchar_t holds full code points 3 * length may be too small - confirm.
const size_t buffer_len = wstr.length() * 3;
auto buffer = std::make_unique<char[]>(buffer_len);

char* iconv_out = buffer.get();
size_t iconv_out_bytes = buffer_len;
auto ret = iconv(icvt, &iconv_in, &iconv_in_bytes, &iconv_out, &iconv_out_bytes);
if (static_cast<size_t>(-1) == ret) {
result = conv_error;
} else {
// iconv decrements iconv_out_bytes as it writes, so the difference is the
// number of bytes produced.
size_t converted_len = buffer_len - iconv_out_bytes;
result.assign(buffer.get(), converted_len);
}
// The handle is released on both the success and the failure path above.
iconv_close(icvt);
return result;
return ww898::utf::conv<std::string::value_type>(val);
}
#endif

inline std::string to_string(std::thread::id const& id)
{
Expand Down Expand Up @@ -179,8 +121,7 @@ using std::to_wstring;

// Converts a narrow string to a wide string.
// NOTE(review): this span comes from a rendered diff in which removed and
// added lines are interleaved, so two return statements appear back to back.
// The first widens each char individually (the TO-DO marks it as wrong); the
// second delegates to ww898::utf::conv. As written only the first would run -
// confirm which one is live in the real header.
inline std::wstring to_wstring(std::string const& s)
{
// TO-DO: fix this wrong implementation
return std::wstring(s.begin(), s.end());
return ww898::utf::conv<std::wstring::value_type>(s);
}

template <class T>
Expand Down
4 changes: 4 additions & 0 deletions rd-cpp/thirdparty/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ add_subdirectory(CTPL)
set(SPDLOG_BUILD_SHARED ON CACHE BOOL "Build shared library" FORCE)
add_subdirectory(spdlog)

add_library(utf-cpp INTERFACE)
target_include_directories(utf-cpp INTERFACE utf-cpp/include)

install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/spdlog/include/
DESTINATION "${CMAKE_INSTALL_PUBLIC_HEADER_THIRDPARTY}"
CONFIGURATIONS Release
Expand All @@ -33,6 +36,7 @@ target_link_libraries(thirdparty PUBLIC
ctpl
countdownlatch
spdlog::spdlog
utf-cpp
)

install(FILES thirdparty.hpp
Expand Down
21 changes: 21 additions & 0 deletions rd-cpp/thirdparty/utf-cpp/LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2017 Mikhail Pilin

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
58 changes: 58 additions & 0 deletions rd-cpp/thirdparty/utf-cpp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# UTF-8/16/32 C++ library
This is a C++11 template-based, header-only library for Windows/Linux/macOS that converts UTF-8/16/32 symbols and strings. The library transparently supports `wchar_t` as UTF-16 on Windows and as UTF-32 on Linux and macOS.

UTF-8 and UTF-32 (UCS-32) both support 31-bit-wide code points `[0‥0x7FFFFFFF]` with no restriction. UTF-16 supports only Unicode code points `[0‥0x10FFFF]`, where the high `[0xD800‥0xDBFF]` and low `[0xDC00‥0xDFFF]` surrogate regions are prohibited.

The maximum UTF-16 symbol size is 2 words (4 bytes; both words must be in the surrogate region). UTF-32 (UCS-32) is always 1 word (4 bytes). UTF-8 has the following maximum symbol size (see [conversion table](#utf-8-conversion-table) for details):
- 4 bytes for unicode code points
- 6 bytes for 31bit code points

###### UTF-16 surrogate decoder:
|High\Low|DC00|DC01|…|DFFF|
|:-:|:-:|:-:|:-:|:-:|
|**D800**|010000|010001|…|0103FF|
|**D801**|010400|010401|…|0107FF|
|**…**|…|…|…|…|
|**DBFF**|10FC00|10FC01|…|10FFFF|

![UTF-16 Surrogates](https://upload.wikimedia.org/wikipedia/commons/thumb/b/b8/Utf-16.svg/512px-Utf-16.svg.png)

## Supported compilers

Tested on the following compilers:
- [Visual Studio 2013 v12.0.40629.00 Update 5](perf/vc120_win.md)
- [Visual Studio 2015 v14.0.25431.01 Update 3](perf/vc140_win.md)
- [Visual Studio 2017 v15.6.7](perf/vc141_win.md)
- [Visual Studio 2019 v16.0.3](perf/vc142_win.md)
- [GNU v5.4.0](perf/gnu_linux.md)
- [Clang v6.0.1](perf/clang_linux.md)
- [Apple Clang v10.0.1](perf/clang_mac.md)

## Usage example

```cpp
// यूनिकोड
static char const u8s[] = "\xE0\xA4\xAF\xE0\xA5\x82\xE0\xA4\xA8\xE0\xA4\xBF\xE0\xA4\x95\xE0\xA5\x8B\xE0\xA4\xA1";
using namespace ww898::utf;
std::u16string u16;
convz<utf_selector_t<decltype(*u8s)>, utf16>(u8s, std::back_inserter(u16));
std::u32string u32;
conv<utf16, utf_selector_t<decltype(u32)::value_type>>(u16.begin(), u16.end(), std::back_inserter(u32));
std::vector<char> u8;
convz<utf32, utf8>(u32.data(), std::back_inserter(u8));
std::wstring uw;
conv<utf8, utfw>(u8s, u8s + sizeof(u8s), std::back_inserter(uw));
auto u8r = conv<char>(uw);
auto u16r = conv<char16_t>(u16);
auto uwr = convz<wchar_t>(u8s);

auto u32r = conv<char32_t>(std::string_view(u8r.data(), u8r.size())); // C++17 only

static_assert(std::is_same<utf_selector<decltype(*u8s)>, utf_selector<decltype(u8)::value_type>>::value, "Fail");
static_assert(
std::is_same<utf_selector_t<decltype(u16)::value_type>, utf_selector_t<decltype(uw)::value_type>>::value !=
std::is_same<utf_selector_t<decltype(u32)::value_type>, utf_selector_t<decltype(uw)::value_type>>::value, "Fail");
```
## UTF-8 Conversion table
![UTF-8/32 table](https://upload.wikimedia.org/wikipedia/commons/3/38/UTF-8_Encoding_Scheme.png)
109 changes: 109 additions & 0 deletions rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utf16.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/*
* MIT License
*
* Copyright (c) 2017-2019 Mikhail Pilin
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

#pragma once

#include <cstdint>
#include <stdexcept>

namespace ww898 {
namespace utf {

// UTF-16 codec.
//
// Code points in [0x10000, 0x10FFFF] are stored as a surrogate pair; each
// half carries ten payload bits of (code point - 0x10000):
//
//   110110xxxxxxxxxx   high surrogate (upper ten bits)
//   110111xxxxxxxxxx   low surrogate  (lower ten bits)
//
// Every other supported code point occupies a single 16-bit unit. Values in
// the surrogate range [0xD800, 0xDFFF] are never valid as stand-alone code
// points.
struct utf16 final
{
    static size_t const max_unicode_symbol_size = 2;
    static size_t const max_supported_symbol_size = max_unicode_symbol_size;

    static uint32_t const max_supported_code_point = 0x10FFFF;

    using char_type = uint16_t;

    static char_type const min_surrogate = 0xD800;
    static char_type const max_surrogate = 0xDFFF;

    static char_type const min_surrogate_high = 0xD800;
    static char_type const max_surrogate_high = 0xDBFF;

    static char_type const min_surrogate_low = 0xDC00;
    static char_type const max_surrogate_low = 0xDFFF;

    // Reports how many 16-bit units the symbol starting at the peeked unit
    // occupies: 2 when the unit is a high surrogate, 1 for any unit outside
    // the surrogate range.
    // Throws std::runtime_error when the peeked unit is a low surrogate.
    template<typename PeekFn>
    static size_t char_size(PeekFn && peek_fn)
    {
        char_type const lead = std::forward<PeekFn>(peek_fn)();
        bool const is_surrogate = lead >= min_surrogate && lead <= max_surrogate;
        if (!is_surrogate)
            return 1;
        if (lead > max_surrogate_high)
            throw std::runtime_error("The high utf16 surrogate char is expected");
        return 2;
    }

    // Decodes one code point, pulling one unit from read_fn, or two when the
    // first unit is a high surrogate.
    // Throws std::runtime_error when the first unit is a low surrogate, or
    // when a high surrogate is not followed by a low surrogate.
    template<typename ReadFn>
    static uint32_t read(ReadFn && read_fn)
    {
        char_type const lead = read_fn();
        if (lead < min_surrogate || lead > max_surrogate)
            return lead;

        // A symbol may not start with the low half of a pair; the second unit
        // is deliberately not consumed in that case.
        if (lead > max_surrogate_high)
            throw std::runtime_error("The high utf16 surrogate char is expected");

        char_type const trail = read_fn();
        if (trail < min_surrogate_low || trail > max_surrogate_low)
            throw std::runtime_error("The low utf16 surrogate char is expected");

        uint32_t const upper_bits = static_cast<uint32_t>(lead - min_surrogate_high) << 10;
        uint32_t const lower_bits = static_cast<uint32_t>(trail - min_surrogate_low);
        return 0x10000 + upper_bits + lower_bits;
    }

    // Encodes cp, passing one unit to write_fn, or two (high surrogate first)
    // when cp is 0x10000 or above.
    // Throws std::runtime_error when cp lies in the surrogate range or
    // exceeds 0x10FFFF; nothing is written in either case.
    template<typename WriteFn>
    static void write(uint32_t const cp, WriteFn && write_fn)
    {
        if (cp > max_supported_code_point)
            throw std::runtime_error("Too large the utf16 code point");

        if (cp >= 0x10000)
        {
            uint32_t const offset = cp - 0x10000;
            write_fn(static_cast<char_type>(min_surrogate_high + (offset >> 10)));
            write_fn(static_cast<char_type>(min_surrogate_low + (offset & 0x3FF)));
            return;
        }

        if (cp >= min_surrogate && cp <= max_surrogate)
            throw std::runtime_error("The utf16 code point can not be in surrogate range");

        write_fn(static_cast<char_type>(cp));
    }
};

}}
67 changes: 67 additions & 0 deletions rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utf32.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* MIT License
*
* Copyright (c) 2017-2019 Mikhail Pilin
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

#pragma once

#include <cstdint>
#include <stdexcept>

namespace ww898 {
namespace utf {

// UTF-32 codec: every symbol is exactly one 32-bit unit. Values up to
// 0x7FFFFFFF are accepted; anything with the top bit set is rejected.
struct utf32 final
{
    static size_t const max_unicode_symbol_size = 1;
    static size_t const max_supported_symbol_size = 1;

    static uint32_t const max_supported_code_point = 0x7FFFFFFF;

    using char_type = uint32_t;

    // A symbol always occupies one unit, so the peek callback is never invoked.
    template<typename PeekFn>
    static size_t char_size(PeekFn &&)
    {
        return 1;
    }

    // Pulls one unit from read_fn and returns it as the code point.
    // Throws std::runtime_error when the unit is above 0x7FFFFFFF.
    template<typename ReadFn>
    static uint32_t read(ReadFn && read_fn)
    {
        char_type const unit = std::forward<ReadFn>(read_fn)();
        if (unit > max_supported_code_point)
            throw std::runtime_error("Too large utf32 char");
        return unit;
    }

    // Passes cp to write_fn as a single unit.
    // Throws std::runtime_error, without writing, when cp is above 0x7FFFFFFF.
    template<typename WriteFn>
    static void write(uint32_t const cp, WriteFn && write_fn)
    {
        if (cp > max_supported_code_point)
            throw std::runtime_error("Too large utf32 code point");
        std::forward<WriteFn>(write_fn)(static_cast<char_type>(cp));
    }
};

}}
Loading

0 comments on commit e945f60

Please sign in to comment.