Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[feature](load) refactor CSV reading process during scanning, and support enclose and escape for stream load #22539

Merged
merged 28 commits into from
Aug 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
c3dcb70
refactor csv reading process, support enclose and escape
TangSiyang2001 Jul 30, 2023
dd38b5b
format
TangSiyang2001 Jul 30, 2023
5c5581c
fix string
TangSiyang2001 Jul 30, 2023
0f043a4
format
TangSiyang2001 Jul 30, 2023
c954510
fix trim and temporarily forbid trimming escape
TangSiyang2001 Jul 31, 2023
97fc89a
fix DCHECK fail
TangSiyang2001 Jul 31, 2023
72bf0a3
fix infinite loop and same prefix case
TangSiyang2001 Jul 31, 2023
20af93c
enable trimming escape
TangSiyang2001 Jul 31, 2023
16e18d2
try to enhance the performance
TangSiyang2001 Aug 2, 2023
ca81976
support-enclose
TangSiyang2001 Aug 2, 2023
4e810d4
try to enhance performance
TangSiyang2001 Aug 2, 2023
03b6326
try to enhance performance
TangSiyang2001 Aug 2, 2023
b66527a
rm log
TangSiyang2001 Aug 2, 2023
d75d83f
try to enhance performance
TangSiyang2001 Aug 2, 2023
e758bec
fix pos error
TangSiyang2001 Aug 4, 2023
3112359
trade off
TangSiyang2001 Aug 5, 2023
7fd77b4
enhance cases
TangSiyang2001 Aug 6, 2023
27c1d1b
enlarge buf
TangSiyang2001 Aug 7, 2023
2173f63
test two paths
TangSiyang2001 Aug 7, 2023
61e688b
test CRTP
TangSiyang2001 Aug 8, 2023
72bd944
Merge branch 'master' into support-enclose
TangSiyang2001 Aug 8, 2023
fe3951d
try memmem
TangSiyang2001 Aug 8, 2023
aa81df7
test crtp
TangSiyang2001 Aug 8, 2023
d5db1d8
support-enclose
TangSiyang2001 Aug 8, 2023
e3ed590
Merge branch 'apache:master' into support-enclose
TangSiyang2001 Aug 9, 2023
247efd3
Merge branch 'master' into support-enclose
TangSiyang2001 Aug 11, 2023
b570ebc
resolve conflict
TangSiyang2001 Aug 11, 2023
46361ba
Merge branch 'apache:master' into support-enclose
TangSiyang2001 Aug 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion be/src/exec/line_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ namespace doris {
namespace io {
class IOContext;
}
// This class is used for CSV scanner, to read content line by line
// This class is used to read content line by line
class LineReader {
public:
virtual ~LineReader() = default;
Expand Down
5 changes: 4 additions & 1 deletion be/src/exec/text_converter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,12 @@ TextConverter::TextConverter(char escape_char, char collection_delimiter, char m

void TextConverter::write_string_column(const SlotDescriptor* slot_desc,
vectorized::MutableColumnPtr* column_ptr, const char* data,
size_t len) {
size_t len, bool need_escape) {
DCHECK(column_ptr->get()->is_nullable());
auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(column_ptr->get());
if (need_escape) {
unescape_string_on_spot(data, &len);
}
if ((len == 2 && data[0] == '\\' && data[1] == 'N') || len == SQL_NULL_DATA) {
nullable_column->get_null_map_data().push_back(1);
reinterpret_cast<vectorized::ColumnString&>(nullable_column->get_nested_column())
Expand Down
14 changes: 11 additions & 3 deletions be/src/exec/text_converter.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

#pragma once

#include <stddef.h>
#include <cstddef>

#include "vec/columns/column.h"

Expand All @@ -33,9 +33,15 @@ class TextConverter {

TextConverter(char escape_char, char collection_delimiter = '\2', char map_kv_delimiter = '\3');

// Convenience overload that keeps the four-argument call form available.
// It delegates to the five-argument overload with need_escape set to false.
inline void write_string_column(const SlotDescriptor* slot_desc,
                                vectorized::MutableColumnPtr* column_ptr, const char* data,
                                size_t len) {
    constexpr bool kNeedEscape = false;
    write_string_column(slot_desc, column_ptr, data, len, kNeedEscape);
}

void write_string_column(const SlotDescriptor* slot_desc,
vectorized::MutableColumnPtr* column_ptr, const char* data,
size_t len);
vectorized::MutableColumnPtr* column_ptr, const char* data, size_t len,
bool need_escape);

inline bool write_column(const SlotDescriptor* slot_desc,
vectorized::MutableColumnPtr* column_ptr, const char* data, size_t len,
Expand All @@ -62,6 +68,8 @@ class TextConverter {
}
void set_map_kv_delimiter(char mapkv_delimiter) { _map_kv_delimiter = mapkv_delimiter; }

inline void set_escape_char(const char escape) { this->_escape_char = escape; }

private:
bool _write_data(const TypeDescriptor& type_desc, vectorized::IColumn* nullable_col_ptr,
const char* data, size_t len, bool copy_string, bool need_escape, size_t rows,
Expand Down
6 changes: 6 additions & 0 deletions be/src/http/action/stream_load.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,12 @@ Status StreamLoadAction::_process_put(HttpRequest* http_req,
if (!http_req->header(HTTP_LINE_DELIMITER).empty()) {
request.__set_line_delimiter(http_req->header(HTTP_LINE_DELIMITER));
}
if (!http_req->header(HTTP_ENCLOSE).empty() && http_req->header(HTTP_ENCLOSE).size() > 0) {
request.__set_enclose(http_req->header(HTTP_ENCLOSE)[0]);
}
if (!http_req->header(HTTP_ESCAPE).empty() && http_req->header(HTTP_ESCAPE).size() > 0) {
request.__set_escape(http_req->header(HTTP_ESCAPE)[0]);
}
if (!http_req->header(HTTP_PARTITIONS).empty()) {
request.__set_partitions(http_req->header(HTTP_PARTITIONS));
request.__set_isTempPartition(false);
Expand Down
2 changes: 2 additions & 0 deletions be/src/http/http_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ static const std::string HTTP_COLUMNS = "columns";
static const std::string HTTP_WHERE = "where";
static const std::string HTTP_COLUMN_SEPARATOR = "column_separator";
static const std::string HTTP_LINE_DELIMITER = "line_delimiter";
// Header names read by StreamLoadAction::_process_put. When one of these
// headers is non-empty, only the first character of its value is passed on to
// the load request (via __set_enclose / __set_escape).
static const std::string HTTP_ENCLOSE = "enclose";
static const std::string HTTP_ESCAPE = "escape";
static const std::string HTTP_MAX_FILTER_RATIO = "max_filter_ratio";
static const std::string HTTP_TIMEOUT = "timeout";
static const std::string HTTP_PARTITIONS = "partitions";
Expand Down
Loading
Loading