Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

C++ input_file support utf16 #1212

Merged
merged 21 commits into from
Nov 7, 2023
2 changes: 1 addition & 1 deletion .github/workflows/e2e-core.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ jobs:

- name: Update Docker-compose to v2
run: |
quzard marked this conversation as resolved.
Show resolved Hide resolved
curl -SL https://github.com/docker/compose/releases/download/v2.7.0/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose
sudo curl -SL https://github.com/docker/compose/releases/download/v2.7.0/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose
sudo chmod +x /usr/local/bin/docker-compose

- name: E2E Core Structure Test
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ jobs:

- name: Update Docker-compose to v2
run: |
curl -SL https://github.com/docker/compose/releases/download/v2.7.0/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose
sudo curl -SL https://github.com/docker/compose/releases/download/v2.7.0/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose
sudo chmod +x /usr/local/bin/docker-compose

- name: Kernel version
Expand Down
69 changes: 68 additions & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,74 @@
"version": "0.2.0",
"configurations": [
{
"name": "Launch Package",
"name": "(gdb) Launch UT",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceFolder}/core/build/unittest/reader/json_file_reader_unittest",
"args": [],
"stopAtEntry": false,
"cwd": "${workspaceFolder}/core/build/unittest/reader",
"environment": [],
"externalConsole": false,
"MIMode": "gdb",
"setupCommands": [
{
"description": "Enable pretty-printing for gdb",
"text": "-enable-pretty-printing",
"ignoreFailures": true
},
{
"description": "Set Disassembly Flavor to Intel",
"text": "-gdb-set disassembly-flavor intel",
"ignoreFailures": true
}
]
},
{
"name": "(gdb) Launch ilogtail",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceFolder}/bin/ilogtail",
"args": [],
"stopAtEntry": false,
"cwd": "${workspaceFolder}/bin",
"environment": [],
"externalConsole": false,
"MIMode": "gdb",
"setupCommands": [
{
"description": "Enable pretty-printing for gdb",
"text": "-enable-pretty-printing",
"ignoreFailures": true
},
{
"description": "Set Disassembly Flavor to Intel",
"text": "-gdb-set disassembly-flavor intel",
"ignoreFailures": true
}
]
},
{
"name": "(gdb) Attach ilogtail",
"type": "cppdbg",
"request": "attach",
"program": "${workspaceFolder}/bin/ilogtail",
"MIMode": "gdb",
"setupCommands": [
{
"description": "Enable pretty-printing for gdb",
"text": "-enable-pretty-printing",
"ignoreFailures": true
},
{
"description": "Set Disassembly Flavor to Intel",
"text": "-gdb-set disassembly-flavor intel",
"ignoreFailures": true
}
]
},
{
"name": "(go) Launch Plugin Package",
"type": "go",
"request": "launch",
"mode": "debug",
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,5 @@ your changes, such as:
- [public] [both] [updated] add a new feature

## [Unreleased]

- [public] [both] [added] support UTF-16 encoded files in the C++ input_file reader
121 changes: 121 additions & 0 deletions core/common/EncodingConverter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,14 @@
#include <Windows.h>
#endif

#include <iostream>
using namespace std;
namespace logtail {

#if defined(__linux__)
static iconv_t mGbk2Utf8Cd = (iconv_t)-1;
static iconv_t mUtf16LittleToUtf8Cd = (iconv_t)-1;
static iconv_t mUtf16BigToUtf8Cd = (iconv_t)-1;
#endif

EncodingConverter::EncodingConverter() {
Expand All @@ -34,13 +38,27 @@ EncodingConverter::EncodingConverter() {
LOG_ERROR(sLogger, ("create Gbk2Utf8 iconv descriptor fail, errno", strerror(errno)));
else
iconv(mGbk2Utf8Cd, NULL, NULL, NULL, NULL);
mUtf16LittleToUtf8Cd = iconv_open("UTF-8", "UTF-16LE");
quzard marked this conversation as resolved.
Show resolved Hide resolved
if (mUtf16LittleToUtf8Cd == (iconv_t)(-1))
LOG_ERROR(sLogger, ("create mUtf16LittleToUtf8Cd iconv descriptor fail, errno", strerror(errno)));
else
iconv(mUtf16LittleToUtf8Cd, NULL, NULL, NULL, NULL);
mUtf16BigToUtf8Cd = iconv_open("UTF-8", "UTF-16BE");
if (mUtf16BigToUtf8Cd == (iconv_t)(-1))
LOG_ERROR(sLogger, ("create mUtf16BigToUtf8Cd iconv descriptor fail, errno", strerror(errno)));
else
iconv(mUtf16BigToUtf8Cd, NULL, NULL, NULL, NULL);
#endif
}

// Releases the iconv conversion descriptors held in the file-level statics.
//
// Each descriptor is closed only if it is valid (i.e. not the (iconv_t)-1
// sentinel) and is then reset to that sentinel. The reset matters because the
// descriptors are file-level statics rather than per-object members: the
// validity checks elsewhere in this file compare against (iconv_t)-1, so a
// closed-but-not-reset descriptor would still look usable and could be passed
// to iconv() or closed a second time.
EncodingConverter::~EncodingConverter() {
#if defined(__linux__)
    iconv_t* const descriptors[] = {&mGbk2Utf8Cd, &mUtf16LittleToUtf8Cd, &mUtf16BigToUtf8Cd};
    for (iconv_t* cd : descriptors) {
        if (*cd != (iconv_t)(-1)) {
            iconv_close(*cd);
            *cd = (iconv_t)(-1);
        }
    }
#endif
}

Expand Down Expand Up @@ -131,6 +149,109 @@ bool EncodingConverter::ConvertGbk2Utf8(
#endif
}

bool EncodingConverter::ConvertUtf16ToUtf8(char16_t* src,
size_t* srcLength,
char*& desOut,
size_t* desLength,
const std::vector<size_t>& linePosVec,
bool isLittleEndian) {
desOut = NULL;
*desLength = 0;

#if defined(__linux__)
if (src == NULL || *srcLength == 0 || mUtf16LittleToUtf8Cd == (iconv_t)(-1) || mUtf16BigToUtf8Cd == (iconv_t)(-1)) {
LOG_ERROR(sLogger,
("invalid iconv descriptor fail or invalid buffer pointer",
"")("mUtf16LittleToUtf8Cd", mUtf16LittleToUtf8Cd)("mUtf16BigToUtf8Cd", mUtf16BigToUtf8Cd));
return false;
}
// utf8 每个字符最大字节数为4
*desLength = *srcLength * 4;
char* des = new char[*srcLength * 4 + 1];
des[*srcLength * 4] = '\0';
desOut = des;
bool rst = true;
char16_t* originSrc = src;
char* originDes = des;
size_t beginIndex = 0;
size_t endIndex = *srcLength;
size_t destIndex = 0;
size_t maxDestSize = *desLength;
for (size_t i = 0; i < linePosVec.size(); ++i) {
endIndex = linePosVec[i];
src = originSrc + beginIndex;
des = originDes + destIndex;
// include '\n'
*srcLength = endIndex - beginIndex + 1;
*desLength = maxDestSize - destIndex;
// UTF16一个Length对应UTF8的2个Length
quzard marked this conversation as resolved.
Show resolved Hide resolved
*srcLength = *srcLength * 2;
if (isLittleEndian) {
size_t ret = iconv(mUtf16LittleToUtf8Cd, (char**)&src, srcLength, &des, desLength);
if (ret == (size_t)(-1)) {
LOG_ERROR(sLogger, ("convert UTF16-LE to UTF8 fail, errno", strerror(errno)));
iconv(mUtf16LittleToUtf8Cd, NULL, NULL, NULL, NULL); // Clear status.
LogtailAlarm::GetInstance()->SendAlarm(ENCODING_CONVERT_ALARM, "convert UTF16 to UTF8 fail");
// use memcpy
memcpy(originDes + destIndex, originSrc + beginIndex, endIndex - beginIndex + 1);
destIndex += endIndex - beginIndex + 1;
} else {
destIndex = des - originDes;
}
beginIndex = src - originSrc;
} else {
size_t ret = iconv(mUtf16BigToUtf8Cd, (char**)&src, srcLength, &des, desLength);
if (ret == (size_t)(-1)) {
LOG_ERROR(sLogger, ("convert UTF16-BE to UTF8 fail, errno", strerror(errno)));
iconv(mUtf16BigToUtf8Cd, NULL, NULL, NULL, NULL); // Clear status.
LogtailAlarm::GetInstance()->SendAlarm(ENCODING_CONVERT_ALARM, "convert UTF16 to UTF8 fail");
// use memcpy
memcpy(originDes + destIndex, originSrc + beginIndex, endIndex - beginIndex + 1);
destIndex += endIndex - beginIndex + 1;
} else {
destIndex = des - originDes;
}
beginIndex = src - originSrc;
}
}
*desLength = destIndex;

return rst;
#elif defined(_MSC_VER)
// swap endianness of UTF-16 BE to UTF-16 LE
if (!isLittleEndian) {
for (size_t i = 0; i < *srcLength; ++i) {
src[i] = (src[i] >> 8) | (src[i] << 8);
}
}
int srcLengthInt = static_cast<int>(*srcLength);

// 计算UTF-8字符串的长度
int size_needed = WideCharToMultiByte(CP_UTF8, 0, (wchar_t*)src, srcLengthInt, NULL, 0, NULL, NULL);
if (size_needed == 0) {
LOG_ERROR(sLogger, ("convert UTF16 to UTF8 fail, WideCharToMultiByte error", GetLastError()));
return false; // conversion failed
}

// 分配足够的内存来存储UTF-8字符串
char* des = new char[size_needed + 1];
if (des == nullptr) {
return false; // memory allocation failed
}
des[size_needed] = '\0';

// 转换UTF-16字符串为UTF-8
if (WideCharToMultiByte(CP_UTF8, 0, (wchar_t*)src, srcLengthInt, des, size_needed, NULL, NULL) == 0) {
LOG_ERROR(sLogger, ("convert UTF16 to UTF8 fail, WideCharToMultiByte error", GetLastError()));
delete[] des;
return false; // conversion failed
}
desOut = des;
*desLength = size_needed;
return true;
#endif
}

std::string EncodingConverter::FromUTF8ToACP(const std::string& s) {
if (s.empty())
return s;
Expand Down
5 changes: 4 additions & 1 deletion core/common/EncodingConverter.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include <cstddef>

namespace logtail {
enum FileEncoding { ENCODING_UTF8, ENCODING_GBK };
enum FileEncoding { ENCODING_UTF8, ENCODING_GBK, ENCODING_UTF16 };

class EncodingConverter {
private:
Expand Down Expand Up @@ -51,6 +51,9 @@ class EncodingConverter {
bool
ConvertGbk2Utf8(char* src, size_t* srcLength, char*& des, size_t* desLength, const std::vector<size_t>& linePosVec);

bool ConvertUtf16ToUtf8(
char16_t* src, size_t* srcLength, char*& desOut, size_t* desLength, const std::vector<size_t>& linePosVec, bool isLittleEndian);

// FromUTF8ToACP converts @s encoded in UTF8 to ACP.
// @return ACP string if convert successfully, otherwise @s will be returned.
std::string FromUTF8ToACP(const std::string& s);
Expand Down
4 changes: 4 additions & 0 deletions core/config_manager/ConfigManagerBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,10 @@ void ConfigManagerBase::LoadSingleUserConfig(const std::string& logName, const J
string fileEncoding = GetStringValue(value, "file_encoding", "");
if (ToLowerCaseString(fileEncoding) == "gbk")
config->mFileEncoding = ENCODING_GBK;
else if (ToLowerCaseString(fileEncoding) == "utf16")
{
config->mFileEncoding = ENCODING_UTF16;
}
else
config->mFileEncoding = ENCODING_UTF8;
if (value.isMember("filter_keys") && value.isMember("filter_regs")) {
Expand Down
Loading
Loading