diff --git a/.github/workflows/pr-approve-status.yml b/.github/workflows/pr-approve-status.yml
index 2a6a64355d5b66e..cda6abc8f1c15b3 100644
--- a/.github/workflows/pr-approve-status.yml
+++ b/.github/workflows/pr-approve-status.yml
@@ -43,9 +43,7 @@ jobs:
approves=()
reviewers_unique=()
for ((i=${#reviewers[@]}-1;i>=0;i--)); do
- # shellcheck disable=SC2076
- # shellcheck disable=SC2199
- if [[ ! "${reviewers_unique[@]}" =~ "${reviewers[$i]}" ]]; then
+ if ! echo "${reviewers_unique[@]}" | grep -q -w "${reviewers[$i]}" && [ "${statuses[$i]}" != "COMMENTED" ]; then
reviewers_unique+=( "${reviewers[$i]}" )
if [ "${statuses[$i]}" == "APPROVED" ]; then
approves+=( "${reviewers[$i]}" )
diff --git a/.gitignore b/.gitignore
index e6981ccb7a7b565..573eaf58baa5ea3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,8 +13,8 @@ core.*
.DS_Store
.classpath
nohup.out
-custom_env.sh
-custom_env_mac.sh
+/custom_env.sh
+/custom_env_mac.sh
derby.log
dependency-reduced-pom.xml
yarn.lock
@@ -33,6 +33,7 @@ package-lock.json
.cache
.settings/
**/.idea/
+!.idea/vcs.xml
**/.vscode/
**/.fleet/
@@ -42,6 +43,7 @@ docs/.temp
# output, thirdparty, extension
output/
+output.bak/
rpc_data/
metastore_db/
thirdparty/src*
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 000000000000000..77fcf54857d40e7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,32 @@
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index 1726a676b5ec860..fe5e4b5b8d1320c 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -506,25 +506,6 @@ DEFINE_Int32(min_buffer_size, "1024"); // 1024, The minimum read buffer size (in
// With 1024B through 8MB buffers, this is up to ~2GB of buffers.
DEFINE_Int32(max_free_io_buffers, "128");
-// Whether to disable the memory cache pool,
-// including MemPool, ChunkAllocator, DiskIO free buffer.
-DEFINE_Bool(disable_mem_pools, "false");
-
-// The reserved bytes limit of Chunk Allocator, usually set as a percentage of mem_limit.
-// defaults to bytes if no unit is given, the number of bytes must be a multiple of 2.
-// must larger than 0. and if larger than physical memory size, it will be set to physical memory size.
-// increase this variable can improve performance,
-// but will acquire more free memory which can not be used by other modules.
-DEFINE_mString(chunk_reserved_bytes_limit, "0");
-// 1024, The minimum chunk allocator size (in bytes)
-DEFINE_Int32(min_chunk_reserved_bytes, "1024");
-// Disable Chunk Allocator in Vectorized Allocator, this will reduce memory cache.
-// For high concurrent queries, using Chunk Allocator with vectorized Allocator can reduce the impact
-// of gperftools tcmalloc central lock.
-// Jemalloc or google tcmalloc have core cache, Chunk Allocator may no longer be needed after replacing
-// gperftools tcmalloc.
-DEFINE_mBool(disable_chunk_allocator_in_vec, "true");
-
// The probing algorithm of partitioned hash table.
// Enable quadratic probing hash table
DEFINE_Bool(enable_quadratic_probing, "false");
@@ -1043,6 +1024,9 @@ DEFINE_Bool(enable_set_in_bitmap_value, "false");
DEFINE_Int64(max_hdfs_file_handle_cache_num, "20000");
DEFINE_Int64(max_external_file_meta_cache_num, "20000");
+// max_write_buffer_number for rocksdb
+DEFINE_Int32(rocksdb_max_write_buffer_number, "5");
+
#ifdef BE_TEST
// test s3
DEFINE_String(test_s3_resource, "resource");
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 81b5308132a8447..639d4a350edfbf3 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -543,25 +543,6 @@ DECLARE_Int32(min_buffer_size); // 1024, The minimum read buffer size (in bytes)
// With 1024B through 8MB buffers, this is up to ~2GB of buffers.
DECLARE_Int32(max_free_io_buffers);
-// Whether to disable the memory cache pool,
-// including MemPool, ChunkAllocator, DiskIO free buffer.
-DECLARE_Bool(disable_mem_pools);
-
-// The reserved bytes limit of Chunk Allocator, usually set as a percentage of mem_limit.
-// defaults to bytes if no unit is given, the number of bytes must be a multiple of 2.
-// must larger than 0. and if larger than physical memory size, it will be set to physical memory size.
-// increase this variable can improve performance,
-// but will acquire more free memory which can not be used by other modules.
-DECLARE_mString(chunk_reserved_bytes_limit);
-// 1024, The minimum chunk allocator size (in bytes)
-DECLARE_Int32(min_chunk_reserved_bytes);
-// Disable Chunk Allocator in Vectorized Allocator, this will reduce memory cache.
-// For high concurrent queries, using Chunk Allocator with vectorized Allocator can reduce the impact
-// of gperftools tcmalloc central lock.
-// Jemalloc or google tcmalloc have core cache, Chunk Allocator may no longer be needed after replacing
-// gperftools tcmalloc.
-DECLARE_mBool(disable_chunk_allocator_in_vec);
-
// The probing algorithm of partitioned hash table.
// Enable quadratic probing hash table
DECLARE_Bool(enable_quadratic_probing);
@@ -1059,6 +1040,9 @@ DECLARE_Int64(max_hdfs_file_handle_cache_num);
// max number of meta info of external files, such as parquet footer
DECLARE_Int64(max_external_file_meta_cache_num);
+// max_write_buffer_number for rocksdb
+DECLARE_Int32(rocksdb_max_write_buffer_number);
+
#ifdef BE_TEST
// test s3
DECLARE_String(test_s3_resource);
diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp
index 7c64596bac87d83..cfb002a4c45c17d 100644
--- a/be/src/common/daemon.cpp
+++ b/be/src/common/daemon.cpp
@@ -245,7 +245,7 @@ void Daemon::memory_gc_thread() {
// No longer full gc and minor gc during sleep.
memory_full_gc_sleep_time_ms = config::memory_gc_sleep_time_ms;
memory_minor_gc_sleep_time_ms = config::memory_gc_sleep_time_ms;
- doris::MemTrackerLimiter::print_log_process_usage("process full gc", false);
+ doris::MemTrackerLimiter::print_log_process_usage("Start Full GC", false);
if (doris::MemInfo::process_full_gc()) {
// If there is not enough memory to be gc, the process memory usage will not be printed in the next continuous gc.
doris::MemTrackerLimiter::enable_print_log_process_usage();
@@ -255,7 +255,7 @@ void Daemon::memory_gc_thread() {
proc_mem_no_allocator_cache >= doris::MemInfo::soft_mem_limit())) {
// No minor gc during sleep, but full gc is possible.
memory_minor_gc_sleep_time_ms = config::memory_gc_sleep_time_ms;
- doris::MemTrackerLimiter::print_log_process_usage("process minor gc", false);
+ doris::MemTrackerLimiter::print_log_process_usage("Start Minor GC", false);
if (doris::MemInfo::process_minor_gc()) {
doris::MemTrackerLimiter::enable_print_log_process_usage();
}
diff --git a/be/src/http/action/pprof_actions.cpp b/be/src/http/action/pprof_actions.cpp
index 1d7128127379368..1cbe2163b9374f7 100644
--- a/be/src/http/action/pprof_actions.cpp
+++ b/be/src/http/action/pprof_actions.cpp
@@ -42,7 +42,7 @@
namespace doris {
// pprof default sample time in seconds.
-static const std::string SECOND_KEY = "seconds";
+[[maybe_unused]] static const std::string SECOND_KEY = "seconds";
static const int kPprofDefaultSampleSecs = 30;
// Protect, only one thread can work
diff --git a/be/src/io/fs/benchmark/base_benchmark.h b/be/src/io/fs/benchmark/base_benchmark.h
index c28ad02de508e4a..41dae7cea23e1ef 100644
--- a/be/src/io/fs/benchmark/base_benchmark.h
+++ b/be/src/io/fs/benchmark/base_benchmark.h
@@ -27,6 +27,9 @@
#include
#include "common/status.h"
+#include "io/fs/file_reader.h"
+#include "io/fs/file_writer.h"
+#include "util/slice.h"
namespace doris::io {
@@ -44,24 +47,22 @@ void bm_log(const std::string& fmt, Args&&... args) {
class BaseBenchmark {
public:
BaseBenchmark(const std::string& name, int threads, int iterations, size_t file_size,
- int repetitions, const std::map& conf_map)
+ const std::map& conf_map)
: _name(name),
_threads(threads),
_iterations(iterations),
_file_size(file_size),
- _repetitions(repetitions),
_conf_map(conf_map) {}
virtual ~BaseBenchmark() = default;
virtual Status init() { return Status::OK(); }
virtual Status run(benchmark::State& state) { return Status::OK(); }
+ void set_repetition(int rep) { _repetitions = rep; }
+
void register_bm() {
auto bm = benchmark::RegisterBenchmark(_name.c_str(), [&](benchmark::State& state) {
- Status st;
- if (state.thread_index() == 0) {
- st = this->init();
- }
+ Status st = this->init();
if (st != Status::OK()) {
bm_log("Benchmark {} init error: {}", _name, st.to_string());
return;
@@ -92,12 +93,114 @@ class BaseBenchmark {
});
}
+ virtual std::string get_file_path(benchmark::State& state) {
+ std::string base_dir = _conf_map["base_dir"];
+ std::string file_path;
+ if (base_dir.ends_with("/")) {
+ file_path = fmt::format("{}test_{}", base_dir, state.thread_index());
+ } else {
+ file_path = fmt::format("{}/test_{}", base_dir, state.thread_index());
+ }
+ bm_log("file_path: {}", file_path);
+ return file_path;
+ }
+
+ Status read(benchmark::State& state, FileReaderSPtr reader) {
+ bm_log("begin to read {}", _name);
+ size_t buffer_size =
+ _conf_map.contains("buffer_size") ? std::stol(_conf_map["buffer_size"]) : 1000000L;
+ std::vector buffer;
+ buffer.resize(buffer_size);
+ doris::Slice data = {buffer.data(), buffer.size()};
+ size_t offset = 0;
+ size_t bytes_read = 0;
+
+ size_t read_size = reader->size();
+ if (_file_size > 0) {
+ read_size = std::min(read_size, _file_size);
+ }
+ long remaining_size = read_size;
+
+ Status status;
+ auto start = std::chrono::high_resolution_clock::now();
+ while (remaining_size > 0) {
+ bytes_read = 0;
+ size_t size = std::min(buffer_size, (size_t)remaining_size);
+ data.size = size;
+ status = reader->read_at(offset, data, &bytes_read);
+ if (status != Status::OK() || bytes_read < 0) {
+ bm_log("reader read_at error: {}", status.to_string());
+ break;
+ }
+ if (bytes_read == 0) { // EOF
+ break;
+ }
+ offset += bytes_read;
+ remaining_size -= bytes_read;
+ }
+ auto end = std::chrono::high_resolution_clock::now();
+ auto elapsed_seconds =
+ std::chrono::duration_cast>(end - start);
+ state.SetIterationTime(elapsed_seconds.count());
+ state.counters["ReadRate(B/S)"] =
+ benchmark::Counter(read_size, benchmark::Counter::kIsRate);
+ state.counters["ReadTotal(B)"] = read_size;
+ state.counters["ReadTime(S)"] = elapsed_seconds.count();
+
+ if (status.ok() && reader != nullptr) {
+ status = reader->close();
+ }
+ bm_log("finish to read {}, size {}, seconds: {}, status: {}", _name, read_size,
+ elapsed_seconds.count(), status);
+ return status;
+ }
+
+ Status write(benchmark::State& state, FileWriter* writer) {
+ bm_log("begin to write {}, size: {}", _name, _file_size);
+ size_t write_size = _file_size;
+ size_t buffer_size =
+ _conf_map.contains("buffer_size") ? std::stol(_conf_map["buffer_size"]) : 1000000L;
+ long remaining_size = write_size;
+ std::vector buffer;
+ buffer.resize(buffer_size);
+ doris::Slice data = {buffer.data(), buffer.size()};
+
+ Status status;
+ auto start = std::chrono::high_resolution_clock::now();
+ while (remaining_size > 0) {
+ size_t size = std::min(buffer_size, (size_t)remaining_size);
+ data.size = size;
+ status = writer->append(data);
+ if (status != Status::OK()) {
+ bm_log("writer append error: {}", status.to_string());
+ break;
+ }
+ remaining_size -= size;
+ }
+ if (status.ok() && writer != nullptr) {
+ status = writer->close();
+ }
+
+ auto end = std::chrono::high_resolution_clock::now();
+ auto elapsed_seconds =
+ std::chrono::duration_cast>(end - start);
+ state.SetIterationTime(elapsed_seconds.count());
+ state.counters["WriteRate(B/S)"] =
+ benchmark::Counter(write_size, benchmark::Counter::kIsRate);
+ state.counters["WriteTotal(B)"] = write_size;
+ state.counters["WriteTime(S)"] = elapsed_seconds.count();
+
+ bm_log("finish to write {}, size: {}, seconds: {}, status: {}", _name, write_size,
+ elapsed_seconds.count(), status);
+ return status;
+ }
+
protected:
std::string _name;
int _threads;
int _iterations;
size_t _file_size;
- int _repetitions = 1;
+ int _repetitions = 3;
std::map _conf_map;
};
diff --git a/be/src/io/fs/benchmark/benchmark_factory.hpp b/be/src/io/fs/benchmark/benchmark_factory.hpp
index 3e8c9314ca6dd14..0b8af3b96b5a892 100644
--- a/be/src/io/fs/benchmark/benchmark_factory.hpp
+++ b/be/src/io/fs/benchmark/benchmark_factory.hpp
@@ -38,8 +38,18 @@ Status BenchmarkFactory::getBm(const std::string fs_type, const std::string op_t
const std::map& conf_map,
BaseBenchmark** bm) {
if (fs_type == "s3") {
- if (op_type == "read") {
- *bm = new S3ReadBenchmark(threads, iterations, file_size, conf_map);
+ if (op_type == "create_write") {
+ *bm = new S3CreateWriteBenchmark(threads, iterations, file_size, conf_map);
+ } else if (op_type == "open_read") {
+ *bm = new S3OpenReadBenchmark(threads, iterations, file_size, conf_map);
+ } else if (op_type == "single_read") {
+ *bm = new S3SingleReadBenchmark(threads, iterations, file_size, conf_map);
+ } else if (op_type == "rename") {
+ *bm = new S3RenameBenchmark(threads, iterations, file_size, conf_map);
+ } else if (op_type == "exists") {
+ *bm = new S3ExistsBenchmark(threads, iterations, file_size, conf_map);
+ } else if (op_type == "list") {
+ *bm = new S3ListBenchmark(threads, iterations, file_size, conf_map);
} else {
return Status::Error(
"unknown params: fs_type: {}, op_type: {}, iterations: {}", fs_type, op_type,
diff --git a/be/src/io/fs/benchmark/fs_benchmark_tool.cpp b/be/src/io/fs/benchmark/fs_benchmark_tool.cpp
index a5be5db80a4433a..50085ae1e706d8c 100644
--- a/be/src/io/fs/benchmark/fs_benchmark_tool.cpp
+++ b/be/src/io/fs/benchmark/fs_benchmark_tool.cpp
@@ -20,6 +20,8 @@
#include
#include "io/fs/benchmark/benchmark_factory.hpp"
+#include "io/fs/s3_file_write_bufferpool.h"
+#include "util/threadpool.h"
DEFINE_string(fs_type, "hdfs", "Supported File System: s3, hdfs");
DEFINE_string(operation, "create_write",
@@ -107,6 +109,15 @@ int main(int argc, char** argv) {
return 1;
}
+ // init s3 write buffer pool
+ std::unique_ptr buffered_reader_prefetch_thread_pool;
+ doris::ThreadPoolBuilder("BufferedReaderPrefetchThreadPool")
+ .set_min_threads(16)
+ .set_max_threads(64)
+ .build(&buffered_reader_prefetch_thread_pool);
+ doris::io::S3FileBufferPool* s3_buffer_pool = doris::io::S3FileBufferPool::GetInstance();
+ s3_buffer_pool->init(524288000, 5242880, buffered_reader_prefetch_thread_pool.get());
+
try {
doris::io::MultiBenchmark multi_bm(FLAGS_fs_type, FLAGS_operation, std::stoi(FLAGS_threads),
std::stoi(FLAGS_iterations), std::stol(FLAGS_file_size),
diff --git a/be/src/io/fs/benchmark/hdfs_benchmark.hpp b/be/src/io/fs/benchmark/hdfs_benchmark.hpp
index 1307ddc95a687fc..b508e14a24aa302 100644
--- a/be/src/io/fs/benchmark/hdfs_benchmark.hpp
+++ b/be/src/io/fs/benchmark/hdfs_benchmark.hpp
@@ -33,75 +33,30 @@ class HdfsOpenReadBenchmark : public BaseBenchmark {
public:
HdfsOpenReadBenchmark(int threads, int iterations, size_t file_size,
const std::map& conf_map)
- : BaseBenchmark("HdfsReadBenchmark", threads, iterations, file_size, 3, conf_map) {}
+ : BaseBenchmark("HdfsReadBenchmark", threads, iterations, file_size, conf_map) {}
virtual ~HdfsOpenReadBenchmark() = default;
- Status init() override { return Status::OK(); }
-
- virtual std::string get_file_path(benchmark::State& state) {
- std::string base_dir = _conf_map["base_dir"];
- auto file_path = fmt::format("{}/test_{}", base_dir, state.thread_index());
- bm_log("file_path: {}", file_path);
- return file_path;
+ virtual void set_default_file_size() {
+ if (_file_size <= 0) {
+ _file_size = 10 * 1024 * 1024; // default 10MB
+ }
}
Status run(benchmark::State& state) override {
+ auto file_path = get_file_path(state);
+
+ auto start = std::chrono::high_resolution_clock::now();
std::shared_ptr fs;
io::FileReaderSPtr reader;
- bm_log("begin to init {}", _name);
- size_t buffer_size =
- _conf_map.contains("buffer_size") ? std::stol(_conf_map["buffer_size"]) : 1000000L;
io::FileReaderOptions reader_opts = FileFactory::get_reader_options(nullptr);
THdfsParams hdfs_params = parse_properties(_conf_map);
-
- auto file_path = get_file_path(state);
RETURN_IF_ERROR(
FileFactory::create_hdfs_reader(hdfs_params, file_path, &fs, &reader, reader_opts));
- bm_log("finish to init {}", _name);
-
- bm_log("begin to run {}", _name);
- Status status;
- std::vector buffer;
- buffer.resize(buffer_size);
- doris::Slice data = {buffer.data(), buffer.size()};
- size_t offset = 0;
- size_t bytes_read = 0;
-
- size_t read_size = reader->size();
- if (_file_size > 0) {
- read_size = std::min(read_size, _file_size);
- }
- long remaining_size = read_size;
-
- auto start = std::chrono::high_resolution_clock::now();
- while (remaining_size > 0) {
- bytes_read = 0;
- size_t size = std::min(buffer_size, (size_t)remaining_size);
- data.size = size;
- status = reader->read_at(offset, data, &bytes_read);
- if (status != Status::OK() || bytes_read < 0) {
- bm_log("reader read_at error: {}", status.to_string());
- break;
- }
- if (bytes_read == 0) { // EOF
- break;
- }
- offset += bytes_read;
- remaining_size -= bytes_read;
- }
- bm_log("finish to run {}", _name);
auto end = std::chrono::high_resolution_clock::now();
-
auto elapsed_seconds =
std::chrono::duration_cast>(end - start);
-
- state.SetIterationTime(elapsed_seconds.count());
- state.counters["ReadRate"] = benchmark::Counter(read_size, benchmark::Counter::kIsRate);
-
- if (reader != nullptr) {
- reader->close();
- }
- return status;
+ state.counters["OpenReaderTime(S)"] = elapsed_seconds.count();
+ return read(state, reader);
}
};
@@ -113,6 +68,10 @@ class HdfsSingleReadBenchmark : public HdfsOpenReadBenchmark {
: HdfsOpenReadBenchmark(threads, iterations, file_size, conf_map) {}
virtual ~HdfsSingleReadBenchmark() = default;
+ virtual void set_default_file_size() override {
+ // do nothing, default is 0, which means it will read the whole file
+ }
+
virtual std::string get_file_path(benchmark::State& state) override {
std::string file_path = _conf_map["file_path"];
bm_log("file_path: {}", file_path);
@@ -124,56 +83,20 @@ class HdfsCreateWriteBenchmark : public BaseBenchmark {
public:
HdfsCreateWriteBenchmark(int threads, int iterations, size_t file_size,
const std::map& conf_map)
- : BaseBenchmark("HdfsCreateWriteBenchmark", threads, iterations, file_size, 3,
- conf_map) {}
+ : BaseBenchmark("HdfsCreateWriteBenchmark", threads, iterations, file_size, conf_map) {}
virtual ~HdfsCreateWriteBenchmark() = default;
- Status init() override { return Status::OK(); }
-
Status run(benchmark::State& state) override {
- bm_log("begin to run {}", _name);
- std::string base_dir = _conf_map["base_dir"];
- io::FileReaderOptions reader_opts = FileFactory::get_reader_options(nullptr);
- THdfsParams hdfs_params = parse_properties(_conf_map);
- auto file_path = fmt::format("{}/test_{}", base_dir, state.thread_index());
- bm_log("file_path: {}", file_path);
-
- auto start = std::chrono::high_resolution_clock::now();
+ auto file_path = get_file_path(state);
+ if (_file_size <= 0) {
+ _file_size = 10 * 1024 * 1024; // default 10MB
+ }
std::shared_ptr fs;
io::FileWriterPtr writer;
+ THdfsParams hdfs_params = parse_properties(_conf_map);
RETURN_IF_ERROR(io::HdfsFileSystem::create(hdfs_params, "", &fs));
RETURN_IF_ERROR(fs->create_file(file_path, &writer));
- Status status;
- size_t write_size = _file_size;
- size_t buffer_size =
- _conf_map.contains("buffer_size") ? std::stol(_conf_map["buffer_size"]) : 1000000L;
- long remaining_size = write_size;
- std::vector buffer;
- buffer.resize(buffer_size);
- doris::Slice data = {buffer.data(), buffer.size()};
- while (remaining_size > 0) {
- size_t size = std::min(buffer_size, (size_t)remaining_size);
- data.size = size;
- status = writer->append(data);
- if (status != Status::OK()) {
- bm_log("writer append error: {}", status.to_string());
- break;
- }
- remaining_size -= size;
- }
- auto end = std::chrono::high_resolution_clock::now();
- auto elapsed_seconds =
- std::chrono::duration_cast>(end - start);
-
- state.SetIterationTime(elapsed_seconds.count());
- bm_log("finish to run {}", _name);
-
- state.counters["WriteRate"] = benchmark::Counter(write_size, benchmark::Counter::kIsRate);
-
- if (writer != nullptr) {
- writer->close();
- }
- return status;
+ return write(state, writer.get());
}
};
@@ -181,75 +104,56 @@ class HdfsRenameBenchmark : public BaseBenchmark {
public:
HdfsRenameBenchmark(int threads, int iterations, size_t file_size,
const std::map& conf_map)
- : BaseBenchmark("HdfsRenameBenchmark", threads, 1, file_size, 1, conf_map) {}
+ : BaseBenchmark("HdfsRenameBenchmark", threads, iterations, file_size, conf_map) {
+ // rename can only be done once per file
+ set_repetition(1);
+ }
virtual ~HdfsRenameBenchmark() = default;
- Status init() override { return Status::OK(); }
-
Status run(benchmark::State& state) override {
- bm_log("begin to run {}", _name);
- std::string base_dir = _conf_map["base_dir"];
- io::FileReaderOptions reader_opts = FileFactory::get_reader_options(nullptr);
+ auto file_path = get_file_path(state);
+ auto new_file_path = file_path + "_new";
THdfsParams hdfs_params = parse_properties(_conf_map);
- auto file_path = fmt::format("{}/test_{}", base_dir, state.thread_index());
- auto new_file_path = fmt::format("{}/test_{}_new", base_dir, state.thread_index());
- bm_log("file_path: {}", file_path);
-
- auto start = std::chrono::high_resolution_clock::now();
std::shared_ptr fs;
- io::FileWriterPtr writer;
RETURN_IF_ERROR(io::HdfsFileSystem::create(hdfs_params, "", &fs));
+
+ auto start = std::chrono::high_resolution_clock::now();
RETURN_IF_ERROR(fs->rename(file_path, new_file_path));
auto end = std::chrono::high_resolution_clock::now();
auto elapsed_seconds =
std::chrono::duration_cast>(end - start);
-
state.SetIterationTime(elapsed_seconds.count());
- bm_log("finish to run {}", _name);
-
state.counters["RenameCost"] =
benchmark::Counter(1, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
- if (writer != nullptr) {
- writer->close();
- }
return Status::OK();
}
-
-private:
};
class HdfsExistsBenchmark : public BaseBenchmark {
public:
HdfsExistsBenchmark(int threads, int iterations, size_t file_size,
const std::map& conf_map)
- : BaseBenchmark("HdfsExistsBenchmark", threads, iterations, file_size, 3, conf_map) {}
+ : BaseBenchmark("HdfsExistsBenchmark", threads, iterations, file_size, conf_map) {}
virtual ~HdfsExistsBenchmark() = default;
- Status init() override { return Status::OK(); }
-
Status run(benchmark::State& state) override {
- bm_log("begin to run {}", _name);
- std::string base_dir = _conf_map["base_dir"];
- io::FileReaderOptions reader_opts = FileFactory::get_reader_options(nullptr);
- THdfsParams hdfs_params = parse_properties(_conf_map);
- auto file_path = fmt::format("{}/test_{}", base_dir, state.thread_index());
- bm_log("file_path: {}", file_path);
+ auto file_path = get_file_path(state);
- auto start = std::chrono::high_resolution_clock::now();
std::shared_ptr fs;
+ THdfsParams hdfs_params = parse_properties(_conf_map);
RETURN_IF_ERROR(io::HdfsFileSystem::create(hdfs_params, "", &fs));
+
+ auto start = std::chrono::high_resolution_clock::now();
bool res = false;
RETURN_IF_ERROR(fs->exists(file_path, &res));
auto end = std::chrono::high_resolution_clock::now();
auto elapsed_seconds =
std::chrono::duration_cast>(end - start);
-
state.SetIterationTime(elapsed_seconds.count());
- bm_log("finish to run {}", _name);
-
state.counters["ExistsCost"] =
benchmark::Counter(1, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
+
return Status::OK();
}
};
diff --git a/be/src/io/fs/benchmark/s3_benchmark.hpp b/be/src/io/fs/benchmark/s3_benchmark.hpp
index 7e958cefdbc3837..c2ee8ddd99d1cb9 100644
--- a/be/src/io/fs/benchmark/s3_benchmark.hpp
+++ b/be/src/io/fs/benchmark/s3_benchmark.hpp
@@ -19,41 +19,193 @@
#include "io/file_factory.h"
#include "io/fs/benchmark/base_benchmark.h"
+#include "io/fs/file_writer.h"
#include "io/fs/s3_file_reader.h"
#include "io/fs/s3_file_system.h"
+#include "runtime/exec_env.h"
+#include "util/s3_uri.h"
#include "util/slice.h"
namespace doris::io {
-class S3ReadBenchmark : public BaseBenchmark {
+class S3Benchmark : public BaseBenchmark {
public:
- S3ReadBenchmark(int threads, int iterations, size_t file_size,
- const std::map& conf_map)
- : BaseBenchmark("S3ReadBenchmark", threads, iterations, file_size, 3, conf_map),
- _result(buffer, 128) {}
- virtual ~S3ReadBenchmark() = default;
+ S3Benchmark(const std::string& name, int threads, int iterations, size_t file_size,
+ const std::map& conf_map)
+ : BaseBenchmark(name, threads, iterations, file_size, conf_map) {}
+ virtual ~S3Benchmark() = default;
- Status init() override {
- bm_log("begin to init {}", _name);
- std::string file_path = _conf_map["file"];
- io::FileReaderOptions reader_opts = FileFactory::get_reader_options(nullptr);
+ Status get_fs(const std::string& path) {
+ S3URI s3_uri(path);
+ RETURN_IF_ERROR(s3_uri.parse());
RETURN_IF_ERROR(
- FileFactory::create_s3_reader(_conf_map, file_path, &_fs, &_reader, reader_opts));
- bm_log("finish to init {}", _name);
+ S3ClientFactory::convert_properties_to_s3_conf(_conf_map, s3_uri, &_s3_conf));
+ return io::S3FileSystem::create(std::move(_s3_conf), "", &_fs);
+ }
+
+protected:
+ doris::S3Conf _s3_conf;
+ std::shared_ptr _fs;
+};
+
+class S3OpenReadBenchmark : public S3Benchmark {
+public:
+ S3OpenReadBenchmark(int threads, int iterations, size_t file_size,
+ const std::map& conf_map)
+ : S3Benchmark("S3ReadBenchmark", threads, iterations, file_size, conf_map) {}
+ virtual ~S3OpenReadBenchmark() = default;
+
+ virtual void set_default_file_size() {
+ if (_file_size <= 0) {
+ _file_size = 10 * 1024 * 1024; // default 10MB
+ }
+ }
+
+ Status run(benchmark::State& state) override {
+ auto file_path = get_file_path(state);
+ RETURN_IF_ERROR(get_fs(file_path));
+
+ io::FileReaderSPtr reader;
+ io::FileReaderOptions reader_opts = FileFactory::get_reader_options(nullptr);
+ RETURN_IF_ERROR(FileFactory::create_s3_reader(
+ _conf_map, file_path, reinterpret_cast*>(&_fs),
+ &reader, reader_opts));
+
+ return read(state, reader);
+ }
+};
+
+// Read a single specified file
+class S3SingleReadBenchmark : public S3OpenReadBenchmark {
+public:
+ S3SingleReadBenchmark(int threads, int iterations, size_t file_size,
+ const std::map& conf_map)
+ : S3OpenReadBenchmark(threads, iterations, file_size, conf_map) {}
+ virtual ~S3SingleReadBenchmark() = default;
+
+ virtual void set_default_file_size() override {}
+
+ virtual std::string get_file_path(benchmark::State& state) override {
+ std::string file_path = _conf_map["file_path"];
+ bm_log("file_path: {}", file_path);
+ return file_path;
+ }
+};
+
+class S3CreateWriteBenchmark : public S3Benchmark {
+public:
+ S3CreateWriteBenchmark(int threads, int iterations, size_t file_size,
+ const std::map& conf_map)
+ : S3Benchmark("S3CreateWriteBenchmark", threads, iterations, file_size, conf_map) {}
+ virtual ~S3CreateWriteBenchmark() = default;
+
+ Status run(benchmark::State& state) override {
+ auto file_path = get_file_path(state);
+ if (_file_size <= 0) {
+ _file_size = 10 * 1024 * 1024; // default 10MB
+ }
+ RETURN_IF_ERROR(get_fs(file_path));
+
+ io::FileWriterPtr writer;
+ RETURN_IF_ERROR(_fs->create_file(file_path, &writer));
+ return write(state, writer.get());
+ }
+};
+
+class S3ListBenchmark : public S3Benchmark {
+public:
+ S3ListBenchmark(int threads, int iterations, size_t file_size,
+ const std::map& conf_map)
+ : S3Benchmark("S3ListBenchmark", threads, iterations, file_size, conf_map) {}
+ virtual ~S3ListBenchmark() = default;
+
+ virtual std::string get_file_path(benchmark::State& state) override {
+ return _conf_map["base_dir"];
+ }
+
+ Status run(benchmark::State& state) override {
+ auto file_path = get_file_path(state);
+ RETURN_IF_ERROR(get_fs(file_path));
+
+ auto start = std::chrono::high_resolution_clock::now();
+ std::vector files;
+ bool exists = true;
+ RETURN_IF_ERROR(_fs->list(file_path, true, &files, &exists));
+ auto end = std::chrono::high_resolution_clock::now();
+ auto elapsed_seconds =
+ std::chrono::duration_cast>(end - start);
+ state.SetIterationTime(elapsed_seconds.count());
+ state.counters["ListCost"] =
+ benchmark::Counter(1, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
+
+ std::stringstream ss;
+ int i = 0;
+ for (auto& file_info : files) {
+ if (i > 2) {
+ break;
+ }
+ ++i;
+ ss << "[" << file_info.file_name << ", " << file_info.file_size << ", "
+ << file_info.is_file << "] ";
+ }
+ bm_log("list files: {}", ss.str());
+
return Status::OK();
}
+};
+
+class S3RenameBenchmark : public S3Benchmark {
+public:
+ S3RenameBenchmark(int threads, int iterations, size_t file_size,
+ const std::map& conf_map)
+ : S3Benchmark("S3RenameBenchmark", threads, iterations, file_size, conf_map) {
+ // rename can only be done once per file
+ set_repetition(1);
+ }
+
+ virtual ~S3RenameBenchmark() = default;
Status run(benchmark::State& state) override {
- return _reader->read_at(0, _result, &_bytes_read);
+ auto file_path = get_file_path(state);
+ auto new_file_path = file_path + "_new";
+ RETURN_IF_ERROR(get_fs(file_path));
+
+ auto start = std::chrono::high_resolution_clock::now();
+ RETURN_IF_ERROR(_fs->rename(file_path, new_file_path));
+ auto end = std::chrono::high_resolution_clock::now();
+ auto elapsed_seconds =
+ std::chrono::duration_cast>(end - start);
+ state.SetIterationTime(elapsed_seconds.count());
+ state.counters["RenameCost"] =
+ benchmark::Counter(1, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
+
+ return Status::OK();
}
+};
-private:
- doris::S3Conf _s3_conf;
- std::shared_ptr _fs;
- io::FileReaderSPtr _reader;
- char buffer[128];
- doris::Slice _result;
- size_t _bytes_read = 0;
+class S3ExistsBenchmark : public S3Benchmark {
+public:
+ S3ExistsBenchmark(int threads, int iterations, size_t file_size,
+ const std::map& conf_map)
+ : S3Benchmark("S3ExistsBenchmark", threads, iterations, file_size, conf_map) {}
+ virtual ~S3ExistsBenchmark() = default;
+
+ Status run(benchmark::State& state) override {
+ auto file_path = get_file_path(state);
+ RETURN_IF_ERROR(get_fs(file_path));
+
+ auto start = std::chrono::high_resolution_clock::now();
+ bool res = false;
+ RETURN_IF_ERROR(_fs->exists(file_path, &res));
+ auto end = std::chrono::high_resolution_clock::now();
+ auto elapsed_seconds =
+ std::chrono::duration_cast>(end - start);
+ state.SetIterationTime(elapsed_seconds.count());
+ state.counters["ExistsCost"] =
+ benchmark::Counter(1, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
+
+ return Status::OK();
+ }
};
} // namespace doris::io
diff --git a/be/src/io/fs/buffered_reader.cpp b/be/src/io/fs/buffered_reader.cpp
index 6dcbad96b7c1d16..e6093612baf0d74 100644
--- a/be/src/io/fs/buffered_reader.cpp
+++ b/be/src/io/fs/buffered_reader.cpp
@@ -22,6 +22,7 @@
#include
#include
+#include
// IWYU pragma: no_include
#include "common/compiler_util.h" // IWYU pragma: keep
@@ -363,13 +364,23 @@ Status MergeRangeFileReader::_fill_box(int range_index, size_t start_offset, siz
return Status::OK();
}
+// The condition variable waits at most 10 seconds;
+// if the wait times out, the procedure is aborted and
+// the result is treated as a timeout error status, which
+// causes the load task to fail.
+constexpr static int WAIT_TIME_OUT_MS = 10000;
+
// there exists occasions where the buffer is already closed but
// some prior tasks are still queued in thread pool, so we have to check whether
// the buffer is closed each time the condition variable is notified.
void PrefetchBuffer::reset_offset(size_t offset) {
{
std::unique_lock lck {_lock};
- _prefetched.wait(lck, [this]() { return _buffer_status != BufferStatus::PENDING; });
+ if (!_prefetched.wait_for(lck, std::chrono::milliseconds(WAIT_TIME_OUT_MS),
+ [this]() { return _buffer_status != BufferStatus::PENDING; })) {
+ _prefetch_status = Status::TimedOut("time out when reset prefetch buffer");
+ return;
+ }
if (UNLIKELY(_buffer_status == BufferStatus::CLOSED)) {
_prefetched.notify_all();
return;
@@ -393,9 +404,13 @@ void PrefetchBuffer::reset_offset(size_t offset) {
void PrefetchBuffer::prefetch_buffer() {
{
std::unique_lock lck {_lock};
- _prefetched.wait(lck, [this]() {
- return _buffer_status == BufferStatus::RESET || _buffer_status == BufferStatus::CLOSED;
- });
+ if (!_prefetched.wait_for(lck, std::chrono::milliseconds(WAIT_TIME_OUT_MS), [this]() {
+ return _buffer_status == BufferStatus::RESET ||
+ _buffer_status == BufferStatus::CLOSED;
+ })) {
+ _prefetch_status = Status::TimedOut("time out when invoking prefetch buffer");
+ return;
+ }
// in case buffer is already closed
if (UNLIKELY(_buffer_status == BufferStatus::CLOSED)) {
_prefetched.notify_all();
@@ -432,7 +447,11 @@ void PrefetchBuffer::prefetch_buffer() {
_statis.prefetch_request_io += 1;
_statis.prefetch_request_bytes += _len;
std::unique_lock lck {_lock};
- _prefetched.wait(lck, [this]() { return _buffer_status == BufferStatus::PENDING; });
+ if (!_prefetched.wait_for(lck, std::chrono::milliseconds(WAIT_TIME_OUT_MS),
+ [this]() { return _buffer_status == BufferStatus::PENDING; })) {
+ _prefetch_status = Status::TimedOut("time out when invoking prefetch buffer");
+ return;
+ }
if (!s.ok() && _offset < _reader->size()) {
_prefetch_status = std::move(s);
}
@@ -509,10 +528,13 @@ Status PrefetchBuffer::read_buffer(size_t off, const char* out, size_t buf_len,
{
std::unique_lock lck {_lock};
// buffer must be prefetched or it's closed
- _prefetched.wait(lck, [this]() {
- return _buffer_status == BufferStatus::PREFETCHED ||
- _buffer_status == BufferStatus::CLOSED;
- });
+ if (!_prefetched.wait_for(lck, std::chrono::milliseconds(WAIT_TIME_OUT_MS), [this]() {
+ return _buffer_status == BufferStatus::PREFETCHED ||
+ _buffer_status == BufferStatus::CLOSED;
+ })) {
+ _prefetch_status = Status::TimedOut("time out when read prefetch buffer");
+ return _prefetch_status;
+ }
if (UNLIKELY(BufferStatus::CLOSED == _buffer_status)) {
return Status::OK();
}
@@ -545,7 +567,11 @@ Status PrefetchBuffer::read_buffer(size_t off, const char* out, size_t buf_len,
void PrefetchBuffer::close() {
std::unique_lock lck {_lock};
// in case _reader still tries to write to the buf after we close the buffer
- _prefetched.wait(lck, [this]() { return _buffer_status != BufferStatus::PENDING; });
+ if (!_prefetched.wait_for(lck, std::chrono::milliseconds(WAIT_TIME_OUT_MS),
+ [this]() { return _buffer_status != BufferStatus::PENDING; })) {
+ _prefetch_status = Status::TimedOut("time out when close prefetch buffer");
+ return;
+ }
_buffer_status = BufferStatus::CLOSED;
_prefetched.notify_all();
if (_sync_profile != nullptr) {
diff --git a/be/src/io/fs/s3_file_write_bufferpool.cpp b/be/src/io/fs/s3_file_write_bufferpool.cpp
index c6ec1a8b60c8a19..48887f9c6ea6a75 100644
--- a/be/src/io/fs/s3_file_write_bufferpool.cpp
+++ b/be/src/io/fs/s3_file_write_bufferpool.cpp
@@ -24,6 +24,7 @@
#include "io/fs/s3_common.h"
#include "runtime/exec_env.h"
#include "util/defer_op.h"
+#include "util/threadpool.h"
namespace doris {
namespace io {
@@ -59,26 +60,27 @@ void S3FileBuffer::submit() {
_stream_ptr = std::make_shared(_buf.data, _size);
}
- ExecEnv::GetInstance()->buffered_reader_prefetch_thread_pool()->submit_func(
- [buf = this->shared_from_this()]() { buf->_on_upload(); });
+ _thread_pool->submit_func([buf = this->shared_from_this()]() { buf->_on_upload(); });
}
-S3FileBufferPool::S3FileBufferPool() {
+void S3FileBufferPool::init(int32_t s3_write_buffer_whole_size, int32_t s3_write_buffer_size,
+ doris::ThreadPool* thread_pool) {
// the nums could be one configuration
- size_t buf_num = config::s3_write_buffer_whole_size / config::s3_write_buffer_size;
- DCHECK((config::s3_write_buffer_size >= 5 * 1024 * 1024) &&
- (config::s3_write_buffer_whole_size > config::s3_write_buffer_size));
+ size_t buf_num = s3_write_buffer_whole_size / s3_write_buffer_size;
+ DCHECK((s3_write_buffer_size >= 5 * 1024 * 1024) &&
+ (s3_write_buffer_whole_size > s3_write_buffer_size));
LOG_INFO("S3 file buffer pool with {} buffers", buf_num);
- _whole_mem_buffer = std::make_unique(config::s3_write_buffer_whole_size);
+ _whole_mem_buffer = std::make_unique(s3_write_buffer_whole_size);
for (size_t i = 0; i < buf_num; i++) {
- Slice s {_whole_mem_buffer.get() + i * config::s3_write_buffer_size,
- static_cast(config::s3_write_buffer_size)};
+ Slice s {_whole_mem_buffer.get() + i * s3_write_buffer_size,
+ static_cast(s3_write_buffer_size)};
_free_raw_buffers.emplace_back(s);
}
+ _thread_pool = thread_pool;
}
std::shared_ptr S3FileBufferPool::allocate(bool reserve) {
- std::shared_ptr buf = std::make_shared();
+ std::shared_ptr buf = std::make_shared(_thread_pool);
// if need reserve then we must ensure return buf with memory preserved
if (reserve) {
{
diff --git a/be/src/io/fs/s3_file_write_bufferpool.h b/be/src/io/fs/s3_file_write_bufferpool.h
index b69964b48e3ad45..55fa53df4287788 100644
--- a/be/src/io/fs/s3_file_write_bufferpool.h
+++ b/be/src/io/fs/s3_file_write_bufferpool.h
@@ -31,13 +31,14 @@
#include "util/slice.h"
namespace doris {
+class ThreadPool;
namespace io {
// TODO(AlexYue): 1. support write into cache 2. unify write buffer and read buffer
struct S3FileBuffer : public std::enable_shared_from_this {
using Callback = std::function;
- S3FileBuffer() = default;
+ S3FileBuffer(ThreadPool* pool) { _thread_pool = pool; }
~S3FileBuffer() = default;
void rob_buffer(std::shared_ptr& other) {
@@ -104,19 +105,26 @@ struct S3FileBuffer : public std::enable_shared_from_this {
// caller of this buf could use this callback to do syncronization
Callback _on_finish_upload = nullptr;
Status _status;
- size_t _offset;
- size_t _size;
+ size_t _offset {0};
+ size_t _size {0};
std::shared_ptr _stream_ptr;
// only served as one reserved buffer
Slice _buf;
size_t _append_offset {0};
+ // not owned
+ ThreadPool* _thread_pool = nullptr;
};
class S3FileBufferPool {
public:
- S3FileBufferPool();
+ S3FileBufferPool() = default;
~S3FileBufferPool() = default;
+ // should be called one and only once
+ // at startup
+ void init(int32_t s3_write_buffer_whole_size, int32_t s3_write_buffer_size,
+ doris::ThreadPool* thread_pool);
+
static S3FileBufferPool* GetInstance() {
static S3FileBufferPool _pool;
return &_pool;
@@ -135,6 +143,8 @@ class S3FileBufferPool {
std::condition_variable _cv;
std::unique_ptr _whole_mem_buffer;
std::list _free_raw_buffers;
+ // not owned
+ ThreadPool* _thread_pool = nullptr;
};
} // namespace io
} // namespace doris
diff --git a/be/src/io/fs/s3_file_writer.h b/be/src/io/fs/s3_file_writer.h
index 0716ac635637aaf..d8956da88acad3f 100644
--- a/be/src/io/fs/s3_file_writer.h
+++ b/be/src/io/fs/s3_file_writer.h
@@ -57,8 +57,6 @@ class S3FileWriter final : public FileWriter {
return Status::NotSupported("not support");
}
- size_t bytes_appended() const { return _bytes_appended; }
-
int64_t upload_cost_ms() const { return *_upload_cost_ms; }
private:
@@ -115,7 +113,6 @@ class S3FileWriter final : public FileWriter {
std::shared_ptr _client;
std::string _upload_id;
- size_t _bytes_appended {0};
// Current Part Num for CompletedPart
int _cur_part_num = 1;
diff --git a/be/src/olap/block_column_predicate.cpp b/be/src/olap/block_column_predicate.cpp
index ff99038012e7fd2..8cfb89363cdd245 100644
--- a/be/src/olap/block_column_predicate.cpp
+++ b/be/src/olap/block_column_predicate.cpp
@@ -52,6 +52,11 @@ bool SingleColumnBlockPredicate::evaluate_and(const segment_v2::BloomFilter* bf)
return _predicate->evaluate_and(bf);
}
+bool SingleColumnBlockPredicate::evaluate_and(const StringRef* dict_words,
+ const size_t dict_num) const {
+ return _predicate->evaluate_and(dict_words, dict_num);
+}
+
void SingleColumnBlockPredicate::evaluate_or(vectorized::MutableColumns& block, uint16_t* sel,
uint16_t selected_size, bool* flags) const {
auto column_id = _predicate->column_id();
@@ -158,6 +163,16 @@ bool AndBlockColumnPredicate::evaluate_and(const segment_v2::BloomFilter* bf) co
return true;
}
+bool AndBlockColumnPredicate::evaluate_and(const StringRef* dict_words,
+ const size_t dict_num) const {
+ for (auto* predicate : _block_column_predicate_vec) {
+ if (!predicate->evaluate_and(dict_words, dict_num)) {
+ return false;
+ }
+ }
+ return true;
+}
+
void AndBlockColumnPredicate::evaluate_or(vectorized::MutableColumns& block, uint16_t* sel,
uint16_t selected_size, bool* flags) const {
if (num_of_column_predicate() == 1) {
diff --git a/be/src/olap/block_column_predicate.h b/be/src/olap/block_column_predicate.h
index 467962e809cfe47..0069a62d2957b6b 100644
--- a/be/src/olap/block_column_predicate.h
+++ b/be/src/olap/block_column_predicate.h
@@ -81,6 +81,12 @@ class BlockColumnPredicate {
LOG(FATAL) << "should not reach here";
return true;
}
+
+ virtual bool evaluate_and(const StringRef* dict_words, const size_t dict_num) const {
+ LOG(FATAL) << "should not reach here";
+ return true;
+ }
+
virtual bool can_do_bloom_filter() const { return false; }
//evaluate predicate on inverted
@@ -109,6 +115,7 @@ class SingleColumnBlockPredicate : public BlockColumnPredicate {
bool* flags) const override;
bool evaluate_and(const std::pair& statistic) const override;
bool evaluate_and(const segment_v2::BloomFilter* bf) const override;
+ bool evaluate_and(const StringRef* dict_words, const size_t dict_num) const override;
void evaluate_or(vectorized::MutableColumns& block, uint16_t* sel, uint16_t selected_size,
bool* flags) const override;
@@ -179,6 +186,8 @@ class AndBlockColumnPredicate : public MutilColumnBlockPredicate {
bool evaluate_and(const segment_v2::BloomFilter* bf) const override;
+ bool evaluate_and(const StringRef* dict_words, const size_t dict_num) const override;
+
bool can_do_bloom_filter() const override {
for (auto& pred : _block_column_predicate_vec) {
if (!pred->can_do_bloom_filter()) {
diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h
index 31dbea7e56ad048..88f40c92c112c4f 100644
--- a/be/src/olap/column_predicate.h
+++ b/be/src/olap/column_predicate.h
@@ -179,6 +179,10 @@ class ColumnPredicate {
virtual bool evaluate_and(const BloomFilter* bf) const { return true; }
+ virtual bool evaluate_and(const StringRef* dict_words, const size_t dict_count) const {
+ return true;
+ }
+
virtual bool can_do_bloom_filter() const { return false; }
// used to evaluate pre read column in lazy materialization
diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h
index 00de4632e7979c3..6524fdfc7df0029 100644
--- a/be/src/olap/comparison_predicate.h
+++ b/be/src/olap/comparison_predicate.h
@@ -259,6 +259,19 @@ class ComparisonPredicateBase : public ColumnPredicate {
}
}
+ bool evaluate_and(const StringRef* dict_words, const size_t count) const override {
+ if constexpr (std::is_same_v) {
+ for (size_t i = 0; i != count; ++i) {
+ if (_operator(dict_words[i], _value) ^ _opposite) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ return true;
+ }
+
bool can_do_bloom_filter() const override { return PT == PredicateType::EQ; }
void evaluate_or(const vectorized::IColumn& column, const uint16_t* sel, uint16_t size,
diff --git a/be/src/olap/delta_writer.cpp b/be/src/olap/delta_writer.cpp
index 16fc02dbc0f8c3a..3c84e67103d78e5 100644
--- a/be/src/olap/delta_writer.cpp
+++ b/be/src/olap/delta_writer.cpp
@@ -199,7 +199,7 @@ Status DeltaWriter::init() {
context.segments_overlap = OVERLAPPING;
context.tablet_schema = _tablet_schema;
context.newest_write_timestamp = UnixSeconds();
- context.tablet_id = _tablet->table_id();
+ context.tablet_id = _tablet->tablet_id();
context.tablet = _tablet;
context.write_type = DataWriteType::TYPE_DIRECT;
context.mow_context = std::make_shared(_cur_max_version, _req.txn_id, _rowset_ids,
@@ -453,9 +453,24 @@ Status DeltaWriter::close_wait(const PSlaveTabletNodes& slave_tablet_nodes,
RETURN_IF_ERROR(_tablet->calc_delete_bitmap_between_segments(_cur_rowset, segments,
_delete_bitmap));
}
+
+ // commit_phase_update_delete_bitmap() may generate new segments, so we need to create a new
+ // transient rowset writer to write the new segments, then merge them back into the original
+ // rowset.
+ std::unique_ptr rowset_writer;
+ _tablet->create_transient_rowset_writer(_cur_rowset, &rowset_writer);
RETURN_IF_ERROR(_tablet->commit_phase_update_delete_bitmap(
_cur_rowset, _rowset_ids, _delete_bitmap, segments, _req.txn_id,
- _rowset_writer.get()));
+ rowset_writer.get()));
+ if (_cur_rowset->tablet_schema()->is_partial_update()) {
+ // build rowset writer and merge transient rowset
+ RETURN_IF_ERROR(rowset_writer->flush());
+ RowsetSharedPtr transient_rowset = rowset_writer->build();
+ _cur_rowset->merge_rowset_meta(transient_rowset->rowset_meta());
+
+ // erase the segment cache because we will add a segment to the rowset
+ SegmentLoader::instance()->erase_segment(_cur_rowset->rowset_id());
+ }
}
Status res = _storage_engine->txn_manager()->commit_txn(_req.partition_id, _tablet, _req.txn_id,
_req.load_id, _cur_rowset, false);
diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h
index 8af69efc66b5708..5f0f99f7ebd91c5 100644
--- a/be/src/olap/in_list_predicate.h
+++ b/be/src/olap/in_list_predicate.h
@@ -346,6 +346,17 @@ class InListPredicateBase : public ColumnPredicate {
}
}
+ bool evaluate_and(const StringRef* dict_words, const size_t count) const override {
+ for (size_t i = 0; i != count; ++i) {
+ const auto found = _values->find(dict_words[i].data, dict_words[i].size) ^ _opposite;
+ if (found == (PT == PredicateType::IN_LIST)) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
bool evaluate_del(const std::pair& statistic) const override {
if (statistic.first->is_null() || statistic.second->is_null()) {
return false;
diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h
index 1ebf306b41f20e3..86f59af14c50b26 100644
--- a/be/src/olap/olap_common.h
+++ b/be/src/olap/olap_common.h
@@ -324,6 +324,7 @@ struct OlapReaderStatistics {
int64_t rows_key_range_filtered = 0;
int64_t rows_stats_filtered = 0;
int64_t rows_bf_filtered = 0;
+ int64_t rows_dict_filtered = 0;
// Including the number of rows filtered out according to the Delete information in the Tablet,
// and the number of rows filtered for marked deleted rows under the unique key model.
// This metric is mainly used to record the number of rows filtered by the delete condition in Segment V1,
diff --git a/be/src/olap/olap_meta.cpp b/be/src/olap/olap_meta.cpp
index 889090d36c53d69..4df89c04dc31f2a 100644
--- a/be/src/olap/olap_meta.cpp
+++ b/be/src/olap/olap_meta.cpp
@@ -19,6 +19,7 @@
#include
#include
+#include
#include
#include
#include
@@ -62,12 +63,24 @@ OlapMeta::OlapMeta(const std::string& root_path) : _root_path(root_path) {}
OlapMeta::~OlapMeta() = default;
+class RocksdbLogger : public rocksdb::Logger {
+public:
+ void Logv(const char* format, va_list ap) override {
+ char buf[1024];
+ vsnprintf(buf, sizeof(buf), format, ap);
+ LOG(INFO) << "[Rocksdb] " << buf;
+ }
+};
+
Status OlapMeta::init() {
// init db
DBOptions options;
options.IncreaseParallelism();
options.create_if_missing = true;
options.create_missing_column_families = true;
+ options.info_log = std::make_shared();
+ options.info_log_level = rocksdb::WARN_LEVEL;
+
std::string db_path = _root_path + META_POSTFIX;
std::vector column_families;
// default column family is required
@@ -76,6 +89,7 @@ Status OlapMeta::init() {
// meta column family add prefix extractor to improve performance and ensure correctness
ColumnFamilyOptions meta_column_family;
+ meta_column_family.max_write_buffer_number = config::rocksdb_max_write_buffer_number;
meta_column_family.prefix_extractor.reset(NewFixedPrefixTransform(PREFIX_LENGTH));
column_families.emplace_back(META_COLUMN_FAMILY, meta_column_family);
diff --git a/be/src/olap/olap_server.cpp b/be/src/olap/olap_server.cpp
index 57bf40a147b1dba..09875b9e5244fa2 100644
--- a/be/src/olap/olap_server.cpp
+++ b/be/src/olap/olap_server.cpp
@@ -1021,9 +1021,8 @@ Status StorageEngine::process_index_change_task(const TAlterInvertedIndexReq& re
return Status::InternalError("tablet not exist, tablet_id={}.", tablet_id);
}
- IndexBuilderSharedPtr index_builder =
- std::make_shared(tablet, request.columns, request.indexes_desc,
- request.alter_inverted_indexes, request.is_drop_op);
+ IndexBuilderSharedPtr index_builder = std::make_shared(
+ tablet, request.columns, request.alter_inverted_indexes, request.is_drop_op);
RETURN_IF_ERROR(_handle_index_change(index_builder));
return Status::OK();
}
diff --git a/be/src/olap/primary_key_index.cpp b/be/src/olap/primary_key_index.cpp
index 7b2f1593ca3d551..6c276dd262be8db 100644
--- a/be/src/olap/primary_key_index.cpp
+++ b/be/src/olap/primary_key_index.cpp
@@ -24,6 +24,7 @@
// IWYU pragma: no_include
#include "common/compiler_util.h" // IWYU pragma: keep
#include "common/config.h"
+#include "io/fs/file_writer.h"
#include "olap/olap_common.h"
#include "olap/rowset/segment_v2/bloom_filter_index_reader.h"
#include "olap/rowset/segment_v2/bloom_filter_index_writer.h"
@@ -71,6 +72,7 @@ Status PrimaryKeyIndexBuilder::add_item(const Slice& key) {
Status PrimaryKeyIndexBuilder::finalize(segment_v2::PrimaryKeyIndexMetaPB* meta) {
// finish primary key index
RETURN_IF_ERROR(_primary_key_index_builder->finish(meta->mutable_primary_key_index()));
+ _disk_size += _primary_key_index_builder->disk_size();
// set min_max key, the sequence column should be removed
meta->set_min_key(min_key().to_string());
@@ -78,7 +80,11 @@ Status PrimaryKeyIndexBuilder::finalize(segment_v2::PrimaryKeyIndexMetaPB* meta)
// finish bloom filter index
RETURN_IF_ERROR(_bloom_filter_index_builder->flush());
- return _bloom_filter_index_builder->finish(_file_writer, meta->mutable_bloom_filter_index());
+ uint64_t start_size = _file_writer->bytes_appended();
+ RETURN_IF_ERROR(
+ _bloom_filter_index_builder->finish(_file_writer, meta->mutable_bloom_filter_index()));
+ _disk_size += _file_writer->bytes_appended() - start_size;
+ return Status::OK();
}
Status PrimaryKeyIndexReader::parse_index(io::FileReaderSPtr file_reader,
diff --git a/be/src/olap/primary_key_index.h b/be/src/olap/primary_key_index.h
index 65cc64f0cd687aa..233644b4e07173f 100644
--- a/be/src/olap/primary_key_index.h
+++ b/be/src/olap/primary_key_index.h
@@ -51,7 +51,11 @@ class PrimaryKeyIndexMetaPB;
class PrimaryKeyIndexBuilder {
public:
PrimaryKeyIndexBuilder(io::FileWriter* file_writer, size_t seq_col_length)
- : _file_writer(file_writer), _num_rows(0), _size(0), _seq_col_length(seq_col_length) {}
+ : _file_writer(file_writer),
+ _num_rows(0),
+ _size(0),
+ _disk_size(0),
+ _seq_col_length(seq_col_length) {}
Status init();
@@ -61,6 +65,8 @@ class PrimaryKeyIndexBuilder {
uint64_t size() const { return _size; }
+ uint64_t disk_size() const { return _disk_size; }
+
Slice min_key() { return Slice(_min_key.data(), _min_key.size() - _seq_col_length); }
Slice max_key() { return Slice(_max_key.data(), _max_key.size() - _seq_col_length); }
@@ -70,6 +76,7 @@ class PrimaryKeyIndexBuilder {
io::FileWriter* _file_writer = nullptr;
uint32_t _num_rows;
uint64_t _size;
+ uint64_t _disk_size;
size_t _seq_col_length;
faststring _min_key;
diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp b/be/src/olap/rowset/beta_rowset_reader.cpp
index 5564c08740678eb..3ec62de7a879b2e 100644
--- a/be/src/olap/rowset/beta_rowset_reader.cpp
+++ b/be/src/olap/rowset/beta_rowset_reader.cpp
@@ -224,7 +224,7 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context
auto s = seg_ptr->new_iterator(_input_schema, _read_options, &iter);
if (!s.ok()) {
LOG(WARNING) << "failed to create iterator[" << seg_ptr->id() << "]: " << s.to_string();
- return Status::Error();
+ return Status::Error(s.to_string());
}
if (iter->empty()) {
continue;
@@ -268,7 +268,7 @@ Status BetaRowsetReader::init(RowsetReaderContext* read_context,
if (!s.ok()) {
LOG(WARNING) << "failed to init iterator: " << s.to_string();
_iterator.reset();
- return Status::Error();
+ return Status::Error(s.to_string());
}
return Status::OK();
}
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 093eb365b097110..5c3e67e0a233b44 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -1170,23 +1170,8 @@ Status FileColumnIterator::_read_data_page(const OrdinalPageIndexIterator& iter)
auto dict_page_decoder = reinterpret_cast(_page.data_decoder.get());
if (dict_page_decoder->is_dict_encoding()) {
if (_dict_decoder == nullptr) {
- // read dictionary page
- Slice dict_data;
- PageFooterPB dict_footer;
- _opts.type = INDEX_PAGE;
- RETURN_IF_ERROR(_reader->read_page(_opts, _reader->get_dict_page_pointer(),
- &_dict_page_handle, &dict_data, &dict_footer,
- _compress_codec));
- // ignore dict_footer.dict_page_footer().encoding() due to only
- // PLAIN_ENCODING is supported for dict page right now
- _dict_decoder = std::make_unique<
- BinaryPlainPageDecoder>(dict_data);
- RETURN_IF_ERROR(_dict_decoder->init());
-
- auto* pd_decoder = (BinaryPlainPageDecoder*)
- _dict_decoder.get();
- _dict_word_info.reset(new StringRef[pd_decoder->_num_elems]);
- pd_decoder->get_dict_word_info(_dict_word_info.get());
+ RETURN_IF_ERROR(_read_dict_data());
+ CHECK_NOTNULL(_dict_decoder);
}
dict_page_decoder->set_dict_decoder(_dict_decoder.get(), _dict_word_info.get());
@@ -1195,6 +1180,27 @@ Status FileColumnIterator::_read_data_page(const OrdinalPageIndexIterator& iter)
return Status::OK();
}
+Status FileColumnIterator::_read_dict_data() {
+ CHECK_EQ(_reader->encoding_info()->encoding(), DICT_ENCODING);
+ // read dictionary page
+ Slice dict_data;
+ PageFooterPB dict_footer;
+ _opts.type = INDEX_PAGE;
+ RETURN_IF_ERROR(_reader->read_page(_opts, _reader->get_dict_page_pointer(), &_dict_page_handle,
+ &dict_data, &dict_footer, _compress_codec));
+ // ignore dict_footer.dict_page_footer().encoding() due to only
+ // PLAIN_ENCODING is supported for dict page right now
+ _dict_decoder =
+ std::make_unique>(dict_data);
+ RETURN_IF_ERROR(_dict_decoder->init());
+
+ auto* pd_decoder =
+ (BinaryPlainPageDecoder*)_dict_decoder.get();
+ _dict_word_info.reset(new StringRef[pd_decoder->_num_elems]);
+ pd_decoder->get_dict_word_info(_dict_word_info.get());
+ return Status::OK();
+}
+
Status FileColumnIterator::get_row_ranges_by_zone_map(
const AndBlockColumnPredicate* col_predicates,
const std::vector* delete_predicates, RowRanges* row_ranges) {
@@ -1213,6 +1219,23 @@ Status FileColumnIterator::get_row_ranges_by_bloom_filter(
return Status::OK();
}
+Status FileColumnIterator::get_row_ranges_by_dict(const AndBlockColumnPredicate* col_predicates,
+ RowRanges* row_ranges) {
+ if (!_is_all_dict_encoding) {
+ return Status::OK();
+ }
+
+ if (!_dict_decoder) {
+ RETURN_IF_ERROR(_read_dict_data());
+ CHECK_NOTNULL(_dict_decoder);
+ }
+
+ if (!col_predicates->evaluate_and(_dict_word_info.get(), _dict_decoder->count())) {
+ row_ranges->clear();
+ }
+ return Status::OK();
+}
+
Status DefaultValueColumnIterator::init(const ColumnIteratorOptions& opts) {
_opts = opts;
// be consistent with segment v1
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
index bee0bcfb915d0d0..6cb9794b3b64b68 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -302,6 +302,11 @@ class ColumnIterator {
return Status::OK();
}
+ virtual Status get_row_ranges_by_dict(const AndBlockColumnPredicate* col_predicates,
+ RowRanges* row_ranges) {
+ return Status::OK();
+ }
+
virtual bool is_all_dict_encoding() const { return false; }
protected:
@@ -342,6 +347,9 @@ class FileColumnIterator final : public ColumnIterator {
Status get_row_ranges_by_bloom_filter(const AndBlockColumnPredicate* col_predicates,
RowRanges* row_ranges) override;
+ Status get_row_ranges_by_dict(const AndBlockColumnPredicate* col_predicates,
+ RowRanges* row_ranges) override;
+
ParsedPage* get_current_page() { return &_page; }
bool is_nullable() { return _reader->is_nullable(); }
@@ -352,8 +360,8 @@ class FileColumnIterator final : public ColumnIterator {
void _seek_to_pos_in_page(ParsedPage* page, ordinal_t offset_in_page) const;
Status _load_next_page(bool* eos);
Status _read_data_page(const OrdinalPageIndexIterator& iter);
+ Status _read_dict_data();
-private:
ColumnReader* _reader;
// iterator owned compress codec, should NOT be shared by threads, initialized in init()
diff --git a/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp b/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp
index 28a44b7b757606e..acbbfd09346561f 100644
--- a/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp
@@ -23,6 +23,7 @@
#include
#include "common/logging.h"
+#include "io/fs/file_writer.h"
#include "olap/key_coder.h"
#include "olap/olap_common.h"
#include "olap/rowset/segment_v2/encoding_info.h"
@@ -45,6 +46,7 @@ IndexedColumnWriter::IndexedColumnWriter(const IndexedColumnWriterOptions& optio
_file_writer(file_writer),
_num_values(0),
_num_data_pages(0),
+ _disk_size(0),
_value_key_coder(nullptr),
_compress_codec(nullptr) {
_first_value.resize(_type_info->size());
@@ -116,10 +118,12 @@ Status IndexedColumnWriter::_finish_current_data_page(size_t& num_val) {
footer.mutable_data_page_footer()->set_num_values(num_values_in_page);
footer.mutable_data_page_footer()->set_nullmap_size(0);
+ uint64_t start_size = _file_writer->bytes_appended();
RETURN_IF_ERROR(PageIO::compress_and_write_page(
_compress_codec, _options.compression_min_space_saving, _file_writer,
{page_body.slice()}, footer, &_last_data_page));
_num_data_pages++;
+ _disk_size += (_file_writer->bytes_appended() - start_size);
if (_options.write_ordinal_index) {
std::string key;
@@ -171,9 +175,11 @@ Status IndexedColumnWriter::_flush_index(IndexPageBuilder* index_builder, BTreeM
index_builder->finish(&page_body, &page_footer);
PagePointer pp;
+ uint64_t start_size = _file_writer->bytes_appended();
RETURN_IF_ERROR(PageIO::compress_and_write_page(
_compress_codec, _options.compression_min_space_saving, _file_writer,
{page_body.slice()}, page_footer, &pp));
+ _disk_size += (_file_writer->bytes_appended() - start_size);
meta->set_is_root_data_page(false);
pp.to_proto(meta->mutable_root_page());
diff --git a/be/src/olap/rowset/segment_v2/indexed_column_writer.h b/be/src/olap/rowset/segment_v2/indexed_column_writer.h
index a95a9fce7f76419..ba61708dd909362 100644
--- a/be/src/olap/rowset/segment_v2/indexed_column_writer.h
+++ b/be/src/olap/rowset/segment_v2/indexed_column_writer.h
@@ -83,6 +83,8 @@ class IndexedColumnWriter {
Status finish(IndexedColumnMetaPB* meta);
+ uint64_t disk_size() const { return _disk_size; }
+
private:
Status _finish_current_data_page(size_t& num_val);
@@ -96,6 +98,7 @@ class IndexedColumnWriter {
ordinal_t _num_values;
uint32_t _num_data_pages;
+ uint64_t _disk_size;
// remember the first value in current page
faststring _first_value;
PagePointer _last_data_page;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compound_directory.cpp b/be/src/olap/rowset/segment_v2/inverted_index_compound_directory.cpp
index be19be580a520b8..4574d64abf92ecb 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_compound_directory.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_compound_directory.cpp
@@ -467,7 +467,6 @@ void DorisCompoundDirectory::init(const io::FileSystemSPtr& _fs, const char* _pa
if (lock_factory == nullptr) {
lock_factory = _CLNEW lucene::store::NoLockFactory();
- fs->create_directory(directory);
}
setLockFactory(lock_factory);
@@ -476,17 +475,21 @@ void DorisCompoundDirectory::init(const io::FileSystemSPtr& _fs, const char* _pa
lockFactory->setLockPrefix(nullptr);
}
+ // Checking directory existence fails on S3, so skip it.
+ if (fs->type() == io::FileSystemType::S3) {
+ return;
+ }
bool exists = false;
Status status = fs->exists(directory, &exists);
if (!status.ok()) {
auto err = "File system error: " + status.to_string();
LOG(WARNING) << err;
- _CLTHROWA_DEL(CL_ERR_IO, err.c_str());
+ _CLTHROWA(CL_ERR_IO, err.c_str());
}
if (!exists) {
auto e = "Doris compound directory init error: " + directory + " is not a directory";
LOG(WARNING) << e;
- _CLTHROWA_DEL(CL_ERR_IO, e.c_str());
+ _CLTHROWA(CL_ERR_IO, e.c_str());
}
}
@@ -579,7 +582,7 @@ DorisCompoundDirectory* DorisCompoundDirectory::getDirectory(
bool exists = false;
_fs->exists(file, &exists);
if (!exists) {
- mkdir(file, 0777);
+ _fs->create_directory(file);
}
dir = _CLNEW DorisCompoundDirectory();
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index ab5d3548df44b49..fcf125b2fa8f2de 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -409,7 +409,9 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
FINALLY_FINALIZE_OUTPUT(meta_out)
FINALLY_FINALIZE_OUTPUT(data_out)
FINALLY_FINALIZE_OUTPUT(index_out)
- FINALLY_FINALIZE_OUTPUT(dir)
+ if constexpr (field_is_numeric_type(field_type)) {
+ FINALLY_FINALIZE_OUTPUT(dir)
+ }
LOG(WARNING) << "Inverted index writer finish error occurred: " << e.what();
return Status::Error(
"Inverted index writer finish error occurred");
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index c15371368068fba..eaa3102d0b04f6b 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -374,6 +374,7 @@ Status SegmentIterator::_prepare_seek(const StorageReadOptions::KeyRange& key_ra
&_column_iterators[unique_id]));
ColumnIteratorOptions iter_opts;
iter_opts.stats = _opts.stats;
+ iter_opts.use_page_cache = _opts.use_page_cache;
iter_opts.file_reader = _file_reader.get();
iter_opts.io_ctx = _opts.io_ctx;
RETURN_IF_ERROR(_column_iterators[unique_id]->init(iter_opts));
@@ -490,6 +491,26 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
RowRanges::ranges_intersection(*condition_row_ranges, zone_map_row_ranges,
condition_row_ranges);
_opts.stats->rows_stats_filtered += (pre_size - condition_row_ranges->count());
+
+ /// Low cardinality optimization is currently not very stable, so to prevent data corruption,
+ /// we are temporarily disabling its use in data compaction.
+ if (_opts.io_ctx.reader_type == ReaderType::READER_QUERY) {
+ RowRanges dict_row_ranges = RowRanges::create_single(num_rows());
+ for (auto cid : cids) {
+ RowRanges tmp_row_ranges = RowRanges::create_single(num_rows());
+ DCHECK(_opts.col_id_to_predicates.count(cid) > 0);
+ uint32_t unique_cid = _schema->unique_id(cid);
+ RETURN_IF_ERROR(_column_iterators[unique_cid]->get_row_ranges_by_dict(
+ _opts.col_id_to_predicates.at(cid).get(), &tmp_row_ranges));
+ RowRanges::ranges_intersection(dict_row_ranges, tmp_row_ranges, &dict_row_ranges);
+ }
+
+ pre_size = condition_row_ranges->count();
+ RowRanges::ranges_intersection(*condition_row_ranges, dict_row_ranges,
+ condition_row_ranges);
+ _opts.stats->rows_dict_filtered += (pre_size - condition_row_ranges->count());
+ }
+
return Status::OK();
}
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp
index 561da9c3c6c1819..d339b324a76aabc 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp
@@ -330,7 +330,10 @@ void SegmentWriter::_serialize_block_to_row_column(vectorized::Block& block) {
Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* block,
size_t row_pos, size_t num_rows) {
CHECK(block->columns() > _tablet_schema->num_key_columns() &&
- block->columns() < _tablet_schema->num_columns());
+ block->columns() < _tablet_schema->num_columns())
+ << "block columns: " << block->columns()
+ << ", num key columns: " << _tablet_schema->num_key_columns()
+ << ", total schema columns: " << _tablet_schema->num_columns();
CHECK(_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write);
// find missing column cids
@@ -365,12 +368,12 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block*
bool has_default = false;
std::vector use_default_flag;
use_default_flag.reserve(num_rows);
- std::unordered_map segment_caches;
std::vector specified_rowsets;
{
std::shared_lock rlock(_tablet->get_header_lock());
specified_rowsets = _tablet->get_rowset_by_ids(&_mow_context->rowset_ids);
}
+ std::vector> segment_caches(specified_rowsets.size());
// locate rows in base data
{
for (size_t pos = row_pos; pos < num_rows; pos++) {
@@ -764,10 +767,13 @@ Status SegmentWriter::finalize_columns_index(uint64_t* index_size) {
if (_has_key) {
if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) {
RETURN_IF_ERROR(_write_primary_key_index());
+ // IndexedColumnWriter writes data pages mixed with segment data, so we should use
+ // the stats from the primary key index builder.
+ *index_size += _primary_key_index_builder->disk_size();
} else {
RETURN_IF_ERROR(_write_short_key_index());
+ *index_size = _file_writer->bytes_appended() - index_start;
}
- *index_size = _file_writer->bytes_appended() - index_start;
}
_inverted_index_file_size = try_get_inverted_index_file_size();
// reset all column writers and data_conveter
diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp
index 98f7e8e53527ce7..05d9c869c3ab671 100644
--- a/be/src/olap/schema_change.cpp
+++ b/be/src/olap/schema_change.cpp
@@ -288,19 +288,7 @@ Status BlockChanger::change_block(vectorized::Block* ref_block,
for (int idx = 0; idx < column_size; idx++) {
int ref_idx = _schema_mapping[idx].ref_column;
- if (ref_idx < 0 && _type != ROLLUP) {
- // new column, write default value
- auto value = _schema_mapping[idx].default_value;
- auto column = new_block->get_by_position(idx).column->assume_mutable();
- if (value->is_null()) {
- DCHECK(column->is_nullable());
- column->insert_many_defaults(row_size);
- } else {
- auto type_info = get_type_info(_schema_mapping[idx].new_column);
- DefaultValueColumnIterator::insert_default_data(type_info.get(), value->size(),
- value->ptr(), column, row_size);
- }
- } else if (_schema_mapping[idx].expr != nullptr) {
+ if (_schema_mapping[idx].expr != nullptr) {
vectorized::VExprContextSPtr ctx;
RETURN_IF_ERROR(vectorized::VExpr::create_expr_tree(*_schema_mapping[idx].expr, ctx));
RETURN_IF_ERROR(ctx->prepare(state, row_desc));
@@ -322,6 +310,24 @@ Status BlockChanger::change_block(vectorized::Block* ref_block,
ref_block->get_by_position(result_column_id).column));
}
swap_idx_map[result_column_id] = idx;
+ } else if (ref_idx < 0) {
+ if (_type != ROLLUP) {
+ // new column, write default value
+ auto value = _schema_mapping[idx].default_value;
+ auto column = new_block->get_by_position(idx).column->assume_mutable();
+ if (value->is_null()) {
+ DCHECK(column->is_nullable());
+ column->insert_many_defaults(row_size);
+ } else {
+ auto type_info = get_type_info(_schema_mapping[idx].new_column);
+ DefaultValueColumnIterator::insert_default_data(type_info.get(), value->size(),
+ value->ptr(), column, row_size);
+ }
+ } else {
+ return Status::Error(
+ "rollup job meet invalid ref_column, new_column={}",
+ _schema_mapping[idx].new_column->name());
+ }
} else {
// same type, just swap column
swap_idx_map[ref_idx] = idx;
diff --git a/be/src/olap/snapshot_manager.cpp b/be/src/olap/snapshot_manager.cpp
index d19af18cca227d7..c9bc76ae6825875 100644
--- a/be/src/olap/snapshot_manager.cpp
+++ b/be/src/olap/snapshot_manager.cpp
@@ -603,7 +603,7 @@ Status SnapshotManager::_create_snapshot_files(const TabletSharedPtr& ref_tablet
break;
}
- } while (0);
+ } while (false);
if (!res.ok()) {
LOG(WARNING) << "fail to make snapshot, try to delete the snapshot path. path="
diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp
index d10c11b1c27c4ae..60b0b023c6609f7 100644
--- a/be/src/olap/storage_engine.cpp
+++ b/be/src/olap/storage_engine.cpp
@@ -776,7 +776,11 @@ void StorageEngine::gc_binlogs(const std::unordered_map& gc_ta
LOG(INFO) << fmt::format("start to gc binlogs for tablet_id: {}, version: {}", tablet_id,
version);
- TabletSharedPtr tablet = StorageEngine::instance()->tablet_manager()->get_tablet(tablet_id);
+ TabletSharedPtr tablet = _tablet_manager->get_tablet(tablet_id);
+ if (tablet == nullptr) {
+ LOG(WARNING) << fmt::format("tablet_id: {} not found", tablet_id);
+ continue;
+ }
tablet->gc_binlogs(version);
}
}
diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp
index 2babfed8a053615..2d4964323513a1a 100644
--- a/be/src/olap/tablet.cpp
+++ b/be/src/olap/tablet.cpp
@@ -1902,6 +1902,28 @@ Status Tablet::create_rowset_writer(RowsetWriterContext& context,
return RowsetFactory::create_rowset_writer(context, false, rowset_writer);
}
+// create a rowset writer with rowset_id and seg_id
+// after writing, merge this transient rowset with the original rowset
+Status Tablet::create_transient_rowset_writer(RowsetSharedPtr rowset_ptr,
+ std::unique_ptr* rowset_writer) {
+ RowsetWriterContext context;
+ context.rowset_state = PREPARED;
+ context.segments_overlap = OVERLAPPING;
+ context.tablet_schema = std::make_shared();
+ context.tablet_schema->copy_from(*(rowset_ptr->tablet_schema()));
+ context.tablet_schema->set_partial_update_info(false, std::set());
+ context.newest_write_timestamp = UnixSeconds();
+ context.tablet_id = table_id();
+ // ATTN: context.tablet is a shared_ptr, can't simply set its value to `this`. We should
+ // get the shared_ptr from tablet_manager.
+ context.tablet = StorageEngine::instance()->tablet_manager()->get_tablet(tablet_id());
+ context.write_type = DataWriteType::TYPE_DIRECT;
+ RETURN_IF_ERROR(
+ create_transient_rowset_writer(context, rowset_ptr->rowset_id(), rowset_writer));
+ (*rowset_writer)->set_segment_start_id(rowset_ptr->num_segments());
+ return Status::OK();
+}
+
Status Tablet::create_transient_rowset_writer(RowsetWriterContext& context,
const RowsetId& rowset_id,
std::unique_ptr* rowset_writer) {
@@ -2699,12 +2721,11 @@ Status Tablet::lookup_row_data(const Slice& encoded_key, const RowLocation& row_
return Status::OK();
}
-Status Tablet::lookup_row_key(
- const Slice& encoded_key, bool with_seq_col,
- const std::vector& specified_rowsets, RowLocation* row_location,
- uint32_t version,
- std::unordered_map& segment_caches,
- RowsetSharedPtr* rowset) {
+Status Tablet::lookup_row_key(const Slice& encoded_key, bool with_seq_col,
+ const std::vector& specified_rowsets,
+ RowLocation* row_location, uint32_t version,
+ std::vector>& segment_caches,
+ RowsetSharedPtr* rowset) {
SCOPED_BVAR_LATENCY(g_tablet_lookup_rowkey_latency);
size_t seq_col_length = 0;
if (_schema->has_sequence_col() && with_seq_col) {
@@ -2713,7 +2734,8 @@ Status Tablet::lookup_row_key(
Slice key_without_seq = Slice(encoded_key.get_data(), encoded_key.get_size() - seq_col_length);
RowLocation loc;
- for (auto& rs : specified_rowsets) {
+ for (size_t i = 0; i < specified_rowsets.size(); i++) {
+ auto& rs = specified_rowsets[i];
auto& segments_key_bounds = rs->rowset_meta()->get_segments_key_bounds();
int num_segments = rs->num_segments();
DCHECK_EQ(segments_key_bounds.size(), num_segments);
@@ -2729,14 +2751,12 @@ Status Tablet::lookup_row_key(
continue;
}
- auto iter = segment_caches.find(rs->rowset_id());
- if (iter == segment_caches.end()) {
- SegmentCacheHandle segment_cache_handle;
+ if (UNLIKELY(segment_caches[i] == nullptr)) {
+ segment_caches[i] = std::make_unique();
RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(
- std::static_pointer_cast(rs), &segment_cache_handle, true));
- iter = segment_caches.emplace(rs->rowset_id(), std::move(segment_cache_handle)).first;
+ std::static_pointer_cast(rs), segment_caches[i].get(), true));
}
- auto& segments = iter->second.get_segments();
+ auto& segments = segment_caches[i]->get_segments();
DCHECK_EQ(segments.size(), num_segments);
for (auto id : picked_segments) {
@@ -2845,7 +2865,7 @@ Status Tablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset,
// The data for each segment may be lookup multiple times. Creating a SegmentCacheHandle
// will update the lru cache, and there will be obvious lock competition in multithreading
// scenarios, so using a segment_caches to cache SegmentCacheHandle.
- std::unordered_map segment_caches;
+ std::vector> segment_caches(specified_rowsets.size());
while (remaining > 0) {
std::unique_ptr iter;
RETURN_IF_ERROR(pk_idx->new_iterator(&iter));
diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h
index 7cccb03a313b8ca..724ab812430170f 100644
--- a/be/src/olap/tablet.h
+++ b/be/src/olap/tablet.h
@@ -328,6 +328,9 @@ class Tablet : public BaseTablet {
Status create_rowset_writer(RowsetWriterContext& context,
std::unique_ptr* rowset_writer);
+
+ Status create_transient_rowset_writer(RowsetSharedPtr rowset_ptr,
+ std::unique_ptr* rowset_writer);
Status create_transient_rowset_writer(RowsetWriterContext& context, const RowsetId& rowset_id,
std::unique_ptr* rowset_writer);
@@ -399,12 +402,11 @@ class Tablet : public BaseTablet {
// Lookup the row location of `encoded_key`, the function sets `row_location` on success.
// NOTE: the method only works in unique key model with primary key index, you will got a
// not supported error in other data model.
- Status lookup_row_key(
- const Slice& encoded_key, bool with_seq_col,
- const std::vector& specified_rowsets, RowLocation* row_location,
- uint32_t version,
- std::unordered_map& segment_caches,
- RowsetSharedPtr* rowset = nullptr);
+ Status lookup_row_key(const Slice& encoded_key, bool with_seq_col,
+ const std::vector& specified_rowsets,
+ RowLocation* row_location, uint32_t version,
+ std::vector>& segment_caches,
+ RowsetSharedPtr* rowset = nullptr);
// Lookup a row with TupleDescriptor and fill Block
Status lookup_row_data(const Slice& encoded_key, const RowLocation& row_location,
diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp
index f5f62e68578761f..414036ab58864c2 100644
--- a/be/src/olap/tablet_schema.cpp
+++ b/be/src/olap/tablet_schema.cpp
@@ -637,6 +637,17 @@ void TabletSchema::append_index(TabletIndex index) {
_indexes.push_back(std::move(index));
}
+void TabletSchema::remove_index(int64_t index_id) {
+ std::vector indexes;
+ for (auto index : _indexes) {
+ if (index.index_id() == index_id) {
+ continue;
+ }
+ indexes.emplace_back(std::move(index));
+ }
+ _indexes = std::move(indexes);
+}
+
void TabletSchema::clear_columns() {
_field_name_to_index.clear();
_field_id_to_index.clear();
diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h
index ceadf76a0af6416..5a521f36107ef3f 100644
--- a/be/src/olap/tablet_schema.h
+++ b/be/src/olap/tablet_schema.h
@@ -207,6 +207,7 @@ class TabletSchema {
void to_schema_pb(TabletSchemaPB* tablet_meta_pb) const;
void append_column(TabletColumn column, bool is_dropped_column = false);
void append_index(TabletIndex index);
+ void remove_index(int64_t index_id);
// Must make sure the row column is always the last column
void add_row_column();
void copy_from(const TabletSchema& tablet_schema);
diff --git a/be/src/olap/task/engine_publish_version_task.cpp b/be/src/olap/task/engine_publish_version_task.cpp
index 02af6bf674d17b4..249a7a424c8decf 100644
--- a/be/src/olap/task/engine_publish_version_task.cpp
+++ b/be/src/olap/task/engine_publish_version_task.cpp
@@ -72,8 +72,7 @@ EnginePublishVersionTask::EnginePublishVersionTask(
const TPublishVersionRequest& publish_version_req, std::vector* error_tablet_ids,
std::vector* succ_tablet_ids,
std::vector>* discontinuous_version_tablets)
- : _total_task_num(0),
- _publish_version_req(publish_version_req),
+ : _publish_version_req(publish_version_req),
_error_tablet_ids(error_tablet_ids),
_succ_tablet_ids(succ_tablet_ids),
_discontinuous_version_tablets(discontinuous_version_tablets) {}
@@ -88,25 +87,14 @@ void EnginePublishVersionTask::add_succ_tablet_id(int64_t tablet_id) {
_succ_tablet_ids->push_back(tablet_id);
}
-void EnginePublishVersionTask::wait() {
- std::unique_lock lock(_tablet_finish_mutex);
- _tablet_finish_cond.wait(lock);
-}
-
-void EnginePublishVersionTask::notify() {
- std::unique_lock lock(_tablet_finish_mutex);
- _tablet_finish_cond.notify_one();
-}
-
-int64_t EnginePublishVersionTask::finish_task() {
- return _total_task_num.fetch_sub(1);
-}
-
Status EnginePublishVersionTask::finish() {
Status res = Status::OK();
int64_t transaction_id = _publish_version_req.transaction_id;
OlapStopWatch watch;
VLOG_NOTICE << "begin to process publish version. transaction_id=" << transaction_id;
+ std::unique_ptr token =
+ StorageEngine::instance()->tablet_publish_txn_thread_pool()->new_token(
+ ThreadPool::ExecutionMode::CONCURRENT);
// each partition
for (auto& par_ver_info : _publish_version_req.partition_version_infos) {
@@ -187,19 +175,13 @@ Status EnginePublishVersionTask::finish() {
continue;
}
}
- _total_task_num.fetch_add(1);
auto tablet_publish_txn_ptr = std::make_shared(
this, tablet, rowset, partition_id, transaction_id, version, tablet_info);
- auto submit_st =
- StorageEngine::instance()->tablet_publish_txn_thread_pool()->submit_func(
- [=]() { tablet_publish_txn_ptr->handle(); });
+ auto submit_st = token->submit_func([=]() { tablet_publish_txn_ptr->handle(); });
CHECK(submit_st.ok()) << submit_st;
}
}
- // wait for all publish txn finished
- while (_total_task_num.load() != 0) {
- wait();
- }
+ token->wait();
// check if the related tablet remained all have the version
for (auto& par_ver_info : _publish_version_req.partition_version_infos) {
@@ -260,12 +242,7 @@ void TabletPublishTxnTask::handle() {
_engine_publish_version_task->add_error_tablet_id(_tablet_info.tablet_id);
return;
}
- Defer defer {[&] {
- _rowset->finish_publish();
- if (_engine_publish_version_task->finish_task() == 1) {
- _engine_publish_version_task->notify();
- }
- }};
+ Defer defer {[&] { _rowset->finish_publish(); }};
auto publish_status = StorageEngine::instance()->txn_manager()->publish_txn(
_partition_id, _tablet, _transaction_id, _version, &_stats);
if (publish_status != Status::OK()) {
diff --git a/be/src/olap/task/engine_publish_version_task.h b/be/src/olap/task/engine_publish_version_task.h
index c8a68dedea3a9d9..8acf8099ca244f8 100644
--- a/be/src/olap/task/engine_publish_version_task.h
+++ b/be/src/olap/task/engine_publish_version_task.h
@@ -93,21 +93,14 @@ class EnginePublishVersionTask : public EngineTask {
void add_error_tablet_id(int64_t tablet_id);
void add_succ_tablet_id(int64_t tablet_id);
- void notify();
- void wait();
-
int64_t finish_task();
private:
- std::atomic _total_task_num;
const TPublishVersionRequest& _publish_version_req;
std::mutex _tablet_ids_mutex;
vector* _error_tablet_ids;
vector* _succ_tablet_ids;
std::vector>* _discontinuous_version_tablets;
-
- std::mutex _tablet_finish_mutex;
- std::condition_variable _tablet_finish_cond;
};
class AsyncTabletPublishTask {
diff --git a/be/src/olap/task/index_builder.cpp b/be/src/olap/task/index_builder.cpp
index 05ba099f90e6e59..4717dff9b3638ad 100644
--- a/be/src/olap/task/index_builder.cpp
+++ b/be/src/olap/task/index_builder.cpp
@@ -30,12 +30,10 @@
namespace doris {
IndexBuilder::IndexBuilder(const TabletSharedPtr& tablet, const std::vector& columns,
- const std::vector exist_indexes,
const std::vector& alter_inverted_indexes,
bool is_drop_op)
: _tablet(tablet),
_columns(columns),
- _exist_indexes(exist_indexes),
_alter_inverted_indexes(alter_inverted_indexes),
_is_drop_op(is_drop_op) {
_olap_data_convertor = std::make_unique();
@@ -63,8 +61,16 @@ Status IndexBuilder::update_inverted_index_info() {
auto input_rs_tablet_schema = input_rowset->tablet_schema();
output_rs_tablet_schema->copy_from(*input_rs_tablet_schema);
if (_is_drop_op) {
- output_rs_tablet_schema->update_indexes_from_thrift(_exist_indexes);
+ // based on the input rowset's tablet_schema, build the
+ // output rowset's tablet_schema, which only removes
+ // the indexes specified in this drop index request
+ for (auto t_inverted_index : _alter_inverted_indexes) {
+ output_rs_tablet_schema->remove_index(t_inverted_index.index_id);
+ }
} else {
+ // based on the input rowset's tablet_schema, build the
+ // output rowset's tablet_schema, which only adds
+ // the indexes specified in this build index request
for (auto t_inverted_index : _alter_inverted_indexes) {
TabletIndex index;
index.init_from_thrift(t_inverted_index, *input_rs_tablet_schema);
@@ -183,7 +189,7 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta
if (!res.ok()) {
LOG(WARNING) << "failed to create iterator[" << seg_ptr->id()
<< "]: " << res.to_string();
- return Status::Error();
+ return Status::Error(res.to_string());
}
std::shared_ptr block = std::make_shared(
@@ -427,6 +433,13 @@ Status IndexBuilder::do_build_inverted_index() {
}
Status IndexBuilder::modify_rowsets(const Merger::Statistics* stats) {
+ for (auto rowset_ptr : _output_rowsets) {
+ auto rowset_id = rowset_ptr->rowset_id();
+ if (StorageEngine::instance()->check_rowset_id_in_unused_rowsets(rowset_id)) {
+ DCHECK(false) << "output rowset: " << rowset_id.to_string() << " in unused rowsets";
+ }
+ }
+
if (_tablet->keys_type() == KeysType::UNIQUE_KEYS &&
_tablet->enable_unique_key_merge_on_write()) {
std::lock_guard rwlock(_tablet->get_rowset_update_lock());
diff --git a/be/src/olap/task/index_builder.h b/be/src/olap/task/index_builder.h
index 562cb1148d8a48e..9e406c22c12a223 100644
--- a/be/src/olap/task/index_builder.h
+++ b/be/src/olap/task/index_builder.h
@@ -36,7 +36,6 @@ using RowsetWriterUniquePtr = std::unique_ptr;
class IndexBuilder {
public:
IndexBuilder(const TabletSharedPtr& tablet, const std::vector& columns,
- const std::vector exist_indexes,
const std::vector& alter_inverted_indexes,
bool is_drop_op = false);
~IndexBuilder();
@@ -65,7 +64,6 @@ class IndexBuilder {
private:
TabletSharedPtr _tablet;
std::vector _columns;
- std::vector _exist_indexes;
std::vector _alter_inverted_indexes;
bool _is_drop_op;
std::unordered_map> _rowset_alter_index_column_ids;
diff --git a/be/src/olap/txn_manager.cpp b/be/src/olap/txn_manager.cpp
index ad799868aa39090..ebcb2873893610b 100644
--- a/be/src/olap/txn_manager.cpp
+++ b/be/src/olap/txn_manager.cpp
@@ -372,7 +372,7 @@ Status TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id,
// update delete_bitmap
if (tablet_txn_info.unique_key_merge_on_write) {
std::unique_ptr rowset_writer;
- _create_transient_rowset_writer(tablet, rowset, &rowset_writer);
+ tablet->create_transient_rowset_writer(rowset, &rowset_writer);
int64_t t2 = MonotonicMicros();
RETURN_IF_ERROR(tablet->update_delete_bitmap(rowset, tablet_txn_info.rowset_ids,
@@ -450,27 +450,6 @@ Status TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id,
return status;
}
-// create a rowset writer with rowset_id and seg_id
-// after writer, merge this transient rowset with original rowset
-Status TxnManager::_create_transient_rowset_writer(std::shared_ptr tablet,
- RowsetSharedPtr rowset_ptr,
- std::unique_ptr* rowset_writer) {
- RowsetWriterContext context;
- context.rowset_state = PREPARED;
- context.segments_overlap = OVERLAPPING;
- context.tablet_schema = std::make_shared();
- context.tablet_schema->copy_from(*(rowset_ptr->tablet_schema()));
- context.tablet_schema->set_partial_update_info(false, std::set());
- context.newest_write_timestamp = UnixSeconds();
- context.tablet_id = tablet->table_id();
- context.tablet = tablet;
- context.write_type = DataWriteType::TYPE_DIRECT;
- RETURN_IF_ERROR(tablet->create_transient_rowset_writer(context, rowset_ptr->rowset_id(),
- rowset_writer));
- (*rowset_writer)->set_segment_start_id(rowset_ptr->num_segments());
- return Status::OK();
-}
-
// txn could be rollbacked if it does not have related rowset
// if the txn has related rowset then could not rollback it, because it
// may be committed in another thread and our current thread meets errors when writing to data file
diff --git a/be/src/olap/txn_manager.h b/be/src/olap/txn_manager.h
index 36be3b03f54729b..fcebe2d9d1cfcf9 100644
--- a/be/src/olap/txn_manager.h
+++ b/be/src/olap/txn_manager.h
@@ -214,10 +214,6 @@ class TxnManager {
void _insert_txn_partition_map_unlocked(int64_t transaction_id, int64_t partition_id);
void _clear_txn_partition_map_unlocked(int64_t transaction_id, int64_t partition_id);
- Status _create_transient_rowset_writer(std::shared_ptr tablet,
- RowsetSharedPtr rowset_ptr,
- std::unique_ptr* rowset_writer);
-
private:
const int32_t _txn_map_shard_size;
diff --git a/be/src/pipeline/exec/exchange_sink_buffer.cpp b/be/src/pipeline/exec/exchange_sink_buffer.cpp
index e8b3f76fda5e1fb..0326929e5ca0116 100644
--- a/be/src/pipeline/exec/exchange_sink_buffer.cpp
+++ b/be/src/pipeline/exec/exchange_sink_buffer.cpp
@@ -62,6 +62,8 @@ void ExchangeSinkBuffer::close() {
pair.second->release_finst_id();
pair.second->release_query_id();
}
+ _instance_to_broadcast_package_queue.clear();
+ _instance_to_package_queue.clear();
_instance_to_request.clear();
}
@@ -146,7 +148,7 @@ Status ExchangeSinkBuffer::add_block(BroadcastTransmitInfo&& request) {
send_now = true;
_instance_to_sending_by_pipeline[ins_id.lo] = false;
}
- _instance_to_broadcast_package_queue[ins_id.lo].emplace(std::move(request));
+ _instance_to_broadcast_package_queue[ins_id.lo].emplace(request);
}
if (send_now) {
RETURN_IF_ERROR(_send_rpc(ins_id.lo));
@@ -158,6 +160,8 @@ Status ExchangeSinkBuffer::add_block(BroadcastTransmitInfo&& request) {
Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) {
std::unique_lock lock(*_instance_to_package_queue_mutex[id]);
+ DCHECK(_instance_to_sending_by_pipeline[id] == false);
+
std::queue>& q = _instance_to_package_queue[id];
std::queue>& broadcast_q =
_instance_to_broadcast_package_queue[id];
@@ -257,7 +261,6 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) {
broadcast_q.pop();
} else {
_instance_to_sending_by_pipeline[id] = true;
- return Status::OK();
}
return Status::OK();
diff --git a/be/src/runtime/descriptors.cpp b/be/src/runtime/descriptors.cpp
index a75d3d0e71011b5..1855f5d58d79ce1 100644
--- a/be/src/runtime/descriptors.cpp
+++ b/be/src/runtime/descriptors.cpp
@@ -36,7 +36,6 @@
#include "vec/data_types/data_type_factory.hpp"
namespace doris {
-using boost::algorithm::join;
const int RowDescriptor::INVALID_IDX = -1;
std::string NullIndicatorOffset::debug_string() const {
@@ -112,7 +111,7 @@ vectorized::DataTypePtr SlotDescriptor::get_data_type_ptr() const {
std::string SlotDescriptor::debug_string() const {
std::stringstream out;
out << "Slot(id=" << _id << " type=" << _type << " col=" << _col_pos
- << ", colname=" << _col_name << " null=" << _null_indicator_offset.debug_string() << ")";
+ << ", colname=" << _col_name << ", nullable=" << is_nullable() << ")";
return out.str();
}
@@ -140,7 +139,7 @@ std::string OlapTableDescriptor::debug_string() const {
SchemaTableDescriptor::SchemaTableDescriptor(const TTableDescriptor& tdesc)
: TableDescriptor(tdesc), _schema_table_type(tdesc.schemaTable.tableType) {}
-SchemaTableDescriptor::~SchemaTableDescriptor() {}
+SchemaTableDescriptor::~SchemaTableDescriptor() = default;
std::string SchemaTableDescriptor::debug_string() const {
std::stringstream out;
@@ -151,7 +150,7 @@ std::string SchemaTableDescriptor::debug_string() const {
BrokerTableDescriptor::BrokerTableDescriptor(const TTableDescriptor& tdesc)
: TableDescriptor(tdesc) {}
-BrokerTableDescriptor::~BrokerTableDescriptor() {}
+BrokerTableDescriptor::~BrokerTableDescriptor() = default;
std::string BrokerTableDescriptor::debug_string() const {
std::stringstream out;
@@ -161,7 +160,7 @@ std::string BrokerTableDescriptor::debug_string() const {
HiveTableDescriptor::HiveTableDescriptor(const TTableDescriptor& tdesc) : TableDescriptor(tdesc) {}
-HiveTableDescriptor::~HiveTableDescriptor() {}
+HiveTableDescriptor::~HiveTableDescriptor() = default;
std::string HiveTableDescriptor::debug_string() const {
std::stringstream out;
@@ -172,7 +171,7 @@ std::string HiveTableDescriptor::debug_string() const {
IcebergTableDescriptor::IcebergTableDescriptor(const TTableDescriptor& tdesc)
: TableDescriptor(tdesc) {}
-IcebergTableDescriptor::~IcebergTableDescriptor() {}
+IcebergTableDescriptor::~IcebergTableDescriptor() = default;
std::string IcebergTableDescriptor::debug_string() const {
std::stringstream out;
@@ -189,7 +188,7 @@ MaxComputeTableDescriptor::MaxComputeTableDescriptor(const TTableDescriptor& tde
_secret_key(tdesc.mcTable.secret_key),
_public_access(tdesc.mcTable.public_access) {}
-MaxComputeTableDescriptor::~MaxComputeTableDescriptor() {}
+MaxComputeTableDescriptor::~MaxComputeTableDescriptor() = default;
std::string MaxComputeTableDescriptor::debug_string() const {
std::stringstream out;
@@ -199,7 +198,7 @@ std::string MaxComputeTableDescriptor::debug_string() const {
EsTableDescriptor::EsTableDescriptor(const TTableDescriptor& tdesc) : TableDescriptor(tdesc) {}
-EsTableDescriptor::~EsTableDescriptor() {}
+EsTableDescriptor::~EsTableDescriptor() = default;
std::string EsTableDescriptor::debug_string() const {
std::stringstream out;
@@ -272,7 +271,6 @@ TupleDescriptor::TupleDescriptor(const TTupleDescriptor& tdesc, bool own_slots)
_table_desc(nullptr),
_num_null_bytes(tdesc.numNullBytes),
_num_materialized_slots(0),
- _slots(),
_has_varlen_slots(false),
_own_slots(own_slots) {
if (false == tdesc.__isset.numNullSlots) {
@@ -288,7 +286,6 @@ TupleDescriptor::TupleDescriptor(const PTupleDescriptor& pdesc, bool own_slots)
_table_desc(nullptr),
_num_null_bytes(pdesc.num_null_bytes()),
_num_materialized_slots(0),
- _slots(),
_has_varlen_slots(false),
_own_slots(own_slots) {
if (!pdesc.has_num_null_slots()) {
diff --git a/be/src/runtime/descriptors.h b/be/src/runtime/descriptors.h
index 04e6571ef5c31b8..aff3b03a0f7011b 100644
--- a/be/src/runtime/descriptors.h
+++ b/be/src/runtime/descriptors.h
@@ -90,7 +90,6 @@ class SlotDescriptor {
int col_pos() const { return _col_pos; }
// Returns the field index in the generated llvm struct for this slot's tuple
int field_idx() const { return _field_idx; }
- const NullIndicatorOffset& null_indicator_offset() const { return _null_indicator_offset; }
bool is_materialized() const { return _is_materialized; }
bool is_nullable() const { return _null_indicator_offset.bit_mask != 0; }
diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp
index 8875cf5463431be..a699dc402f08675 100644
--- a/be/src/runtime/exec_env_init.cpp
+++ b/be/src/runtime/exec_env_init.cpp
@@ -55,7 +55,6 @@
#include "runtime/heartbeat_flags.h"
#include "runtime/load_channel_mgr.h"
#include "runtime/load_path_mgr.h"
-#include "runtime/memory/chunk_allocator.h"
#include "runtime/memory/mem_tracker.h"
#include "runtime/memory/mem_tracker_limiter.h"
#include "runtime/memory/thread_mem_tracker_mgr.h"
@@ -314,23 +313,6 @@ Status ExecEnv::_init_mem_env() {
// 4. init other managers
RETURN_IF_ERROR(_block_spill_mgr->init());
-
- // 5. init chunk allocator
- if (!BitUtil::IsPowerOf2(config::min_chunk_reserved_bytes)) {
- ss << "Config min_chunk_reserved_bytes must be a power-of-two: "
- << config::min_chunk_reserved_bytes;
- return Status::InternalError(ss.str());
- }
-
- int64_t chunk_reserved_bytes_limit =
- ParseUtil::parse_mem_spec(config::chunk_reserved_bytes_limit, MemInfo::mem_limit(),
- MemInfo::physical_mem(), &is_percent);
- chunk_reserved_bytes_limit =
- BitUtil::RoundDown(chunk_reserved_bytes_limit, config::min_chunk_reserved_bytes);
- ChunkAllocator::init_instance(chunk_reserved_bytes_limit);
- LOG(INFO) << "Chunk allocator memory limit: "
- << PrettyPrinter::print(chunk_reserved_bytes_limit, TUnit::BYTES)
- << ", origin config value: " << config::chunk_reserved_bytes_limit;
return Status::OK();
}
diff --git a/be/src/runtime/memory/chunk_allocator.cpp b/be/src/runtime/memory/chunk_allocator.cpp
deleted file mode 100644
index 53c51660750c53b..000000000000000
--- a/be/src/runtime/memory/chunk_allocator.cpp
+++ /dev/null
@@ -1,281 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/memory/chunk_allocator.h"
-
-#include
-#include
-#include
-
-#include
-
-#include "common/config.h"
-#include "common/status.h"
-#include "runtime/memory/chunk.h"
-#include "runtime/memory/system_allocator.h"
-#include "runtime/thread_context.h"
-#include "util/bit_util.h"
-#include "util/cpu_info.h"
-#include "util/doris_metrics.h"
-#include "util/metrics.h"
-#include "util/runtime_profile.h"
-#include "util/spinlock.h"
-
-namespace doris {
-
-// <= MIN_CHUNK_SIZE, A large number of small chunks will waste extra storage and increase lock time.
-static constexpr size_t MIN_CHUNK_SIZE = 4096; // 4K
-// >= MAX_CHUNK_SIZE, Large chunks may not be used for a long time, wasting memory.
-static constexpr size_t MAX_CHUNK_SIZE = 64 * (1ULL << 20); // 64M
-
-ChunkAllocator* ChunkAllocator::_s_instance = nullptr;
-
-DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(chunk_pool_local_core_alloc_count, MetricUnit::NOUNIT);
-DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(chunk_pool_other_core_alloc_count, MetricUnit::NOUNIT);
-DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(chunk_pool_system_alloc_count, MetricUnit::NOUNIT);
-DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(chunk_pool_system_free_count, MetricUnit::NOUNIT);
-DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(chunk_pool_system_alloc_cost_ns, MetricUnit::NANOSECONDS);
-DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(chunk_pool_system_free_cost_ns, MetricUnit::NANOSECONDS);
-DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(chunk_pool_reserved_bytes, MetricUnit::NOUNIT);
-
-static IntCounter* chunk_pool_local_core_alloc_count;
-static IntCounter* chunk_pool_other_core_alloc_count;
-static IntCounter* chunk_pool_system_alloc_count;
-static IntCounter* chunk_pool_system_free_count;
-static IntCounter* chunk_pool_system_alloc_cost_ns;
-static IntCounter* chunk_pool_system_free_cost_ns;
-static IntGauge* chunk_pool_reserved_bytes;
-
-#ifdef BE_TEST
-static std::mutex s_mutex;
-ChunkAllocator* ChunkAllocator::instance() {
- std::lock_guard l(s_mutex);
- if (_s_instance == nullptr) {
- CpuInfo::init();
- ChunkAllocator::init_instance(4096);
- }
- return _s_instance;
-}
-#endif
-
-// Keep free chunk's ptr in size separated free list.
-// This class is thread-safe.
-class ChunkArena {
- int TRY_LOCK_TIMES = 3;
-
-public:
- ChunkArena() : _chunk_lists(64) {}
-
- ~ChunkArena() {
- for (int i = 0; i < 64; ++i) {
- if (_chunk_lists[i].empty()) continue;
- for (auto ptr : _chunk_lists[i]) {
- SystemAllocator::free(ptr);
- }
- }
- }
-
- // Try to pop a free chunk from corresponding free list.
- // Return true if success
- bool pop_free_chunk(size_t size, uint8_t** ptr) {
- int idx = BitUtil::Log2Ceiling64(size);
- auto& free_list = _chunk_lists[idx];
-
- if (free_list.empty()) return false;
-
- for (int i = 0; i < TRY_LOCK_TIMES; ++i) {
- if (_lock.try_lock()) {
- if (free_list.empty()) {
- _lock.unlock();
- return false;
- } else {
- *ptr = free_list.back();
- free_list.pop_back();
- ASAN_UNPOISON_MEMORY_REGION(*ptr, size);
- _lock.unlock();
- return true;
- }
- }
- }
- return false;
- }
-
- void push_free_chunk(uint8_t* ptr, size_t size) {
- int idx = BitUtil::Log2Ceiling64(size);
- // Poison this chunk to make asan can detect invalid access
- ASAN_POISON_MEMORY_REGION(ptr, size);
- std::lock_guard l(_lock);
- _chunk_lists[idx].push_back(ptr);
- }
-
- void clear() {
- std::lock_guard l(_lock);
- for (int i = 0; i < 64; ++i) {
- if (_chunk_lists[i].empty()) {
- continue;
- }
- for (auto ptr : _chunk_lists[i]) {
- ::free(ptr);
- }
- std::vector<uint8_t*>().swap(_chunk_lists[i]);
- }
- }
-
-private:
- SpinLock _lock;
- std::vector<std::vector<uint8_t*>> _chunk_lists;
-};
-
-void ChunkAllocator::init_instance(size_t reserve_limit) {
- if (_s_instance != nullptr) return;
- _s_instance = new ChunkAllocator(reserve_limit);
-}
-
-ChunkAllocator::ChunkAllocator(size_t reserve_limit)
- : _reserve_bytes_limit(reserve_limit),
- _steal_arena_limit(reserve_limit * 0.1),
- _reserved_bytes(0),
- _arenas(CpuInfo::get_max_num_cores()) {
- _mem_tracker =
- std::make_unique<MemTrackerLimiter>(MemTrackerLimiter::Type::GLOBAL, "ChunkAllocator");
- for (int i = 0; i < _arenas.size(); ++i) {
- _arenas[i].reset(new ChunkArena());
- }
-
- _chunk_allocator_metric_entity =
- DorisMetrics::instance()->metric_registry()->register_entity("chunk_allocator");
- INT_COUNTER_METRIC_REGISTER(_chunk_allocator_metric_entity, chunk_pool_local_core_alloc_count);
- INT_COUNTER_METRIC_REGISTER(_chunk_allocator_metric_entity, chunk_pool_other_core_alloc_count);
- INT_COUNTER_METRIC_REGISTER(_chunk_allocator_metric_entity, chunk_pool_system_alloc_count);
- INT_COUNTER_METRIC_REGISTER(_chunk_allocator_metric_entity, chunk_pool_system_free_count);
- INT_COUNTER_METRIC_REGISTER(_chunk_allocator_metric_entity, chunk_pool_system_alloc_cost_ns);
- INT_COUNTER_METRIC_REGISTER(_chunk_allocator_metric_entity, chunk_pool_system_free_cost_ns);
- INT_GAUGE_METRIC_REGISTER(_chunk_allocator_metric_entity, chunk_pool_reserved_bytes);
-}
-
-Status ChunkAllocator::allocate_align(size_t size, Chunk* chunk) {
- CHECK(size > 0);
- size = BitUtil::RoundUpToPowerOfTwo(size);
- // fast path: allocate from current core arena
- int core_id = CpuInfo::get_current_core();
- chunk->size = size;
- chunk->core_id = core_id;
-
- if (_reserve_bytes_limit < 1) {
- // allocate from system allocator
- chunk->data = SystemAllocator::allocate(size);
- return Status::OK();
- }
-
- if (_arenas[core_id]->pop_free_chunk(size, &chunk->data)) {
- DCHECK_GE(_reserved_bytes, 0);
- _reserved_bytes.fetch_sub(size);
- chunk_pool_local_core_alloc_count->increment(1);
- // transfer the memory ownership of allocate from ChunkAllocator::tracker to the tls tracker.
- THREAD_MEM_TRACKER_TRANSFER_FROM(size, _mem_tracker.get());
- return Status::OK();
- }
- // Second path: try to allocate from other core's arena
- // When the reserved bytes is greater than the limit, the chunk is stolen from other arena.
- // Otherwise, it is allocated from the system first, which can reserve enough memory as soon as possible.
- // After that, allocate from current core arena as much as possible.
- if (_reserved_bytes > _steal_arena_limit) {
- ++core_id;
- for (int i = 1; i < _arenas.size(); ++i, ++core_id) {
- if (_arenas[core_id % _arenas.size()]->pop_free_chunk(size, &chunk->data)) {
- DCHECK_GE(_reserved_bytes, 0);
- _reserved_bytes.fetch_sub(size);
- chunk_pool_other_core_alloc_count->increment(1);
- // reset chunk's core_id to other
- chunk->core_id = core_id % _arenas.size();
- // transfer the memory ownership of allocate from ChunkAllocator::tracker to the tls tracker.
- THREAD_MEM_TRACKER_TRANSFER_FROM(size, _mem_tracker.get());
- return Status::OK();
- }
- }
- }
-
- int64_t cost_ns = 0;
- {
- SCOPED_RAW_TIMER(&cost_ns);
- // allocate from system allocator
- chunk->data = SystemAllocator::allocate(size);
- }
- chunk_pool_system_alloc_count->increment(1);
- chunk_pool_system_alloc_cost_ns->increment(cost_ns);
- if (chunk->data == nullptr) {
- return Status::MemoryAllocFailed("ChunkAllocator failed to allocate chunk {} bytes", size);
- }
- return Status::OK();
-}
-
-void ChunkAllocator::free(const Chunk& chunk) {
- DCHECK(chunk.core_id != -1);
- CHECK((chunk.size & (chunk.size - 1)) == 0);
- if (config::disable_mem_pools || _reserve_bytes_limit < 1) {
- SystemAllocator::free(chunk.data);
- return;
- }
-
- int64_t old_reserved_bytes = _reserved_bytes;
- int64_t new_reserved_bytes = 0;
- do {
- new_reserved_bytes = old_reserved_bytes + chunk.size;
- if (chunk.size <= MIN_CHUNK_SIZE || chunk.size >= MAX_CHUNK_SIZE ||
- new_reserved_bytes > _reserve_bytes_limit) {
- int64_t cost_ns = 0;
- {
- SCOPED_RAW_TIMER(&cost_ns);
- SystemAllocator::free(chunk.data);
- }
- chunk_pool_system_free_count->increment(1);
- chunk_pool_system_free_cost_ns->increment(cost_ns);
-
- return;
- }
- } while (!_reserved_bytes.compare_exchange_weak(old_reserved_bytes, new_reserved_bytes));
-
- // The memory size of allocate/free is a multiple of 2, so `_reserved_bytes% 100 == 32`
- // will definitely happen, and the latest `_reserved_bytes` value will be set every time.
- // The real-time and accurate `_reserved_bytes` value is not required. Usually,
- // the value of `_reserved_bytes` is equal to ChunkAllocator MemTracker.
- // The `_reserved_bytes` metric is only concerned when verifying the accuracy of MemTracker.
- // Therefore, reduce the number of sets and reduce the performance impact.
- if (_reserved_bytes % 100 == 32) {
- chunk_pool_reserved_bytes->set_value(_reserved_bytes);
- }
- // The chunk's memory ownership is transferred from tls tracker to ChunkAllocator.
- THREAD_MEM_TRACKER_TRANSFER_TO(chunk.size, _mem_tracker.get());
- _arenas[chunk.core_id]->push_free_chunk(chunk.data, chunk.size);
-}
-
-void ChunkAllocator::free(uint8_t* data, size_t size) {
- Chunk chunk;
- chunk.data = data;
- chunk.size = size;
- chunk.core_id = CpuInfo::get_current_core();
- free(chunk);
-}
-
-void ChunkAllocator::clear() {
- for (int i = 0; i < _arenas.size(); ++i) {
- _arenas[i]->clear();
- }
- THREAD_MEM_TRACKER_TRANSFER_FROM(_mem_tracker->consumption(), _mem_tracker.get());
-}
-
-} // namespace doris
diff --git a/be/src/runtime/memory/chunk_allocator.h b/be/src/runtime/memory/chunk_allocator.h
deleted file mode 100644
index 467317c19176f82..000000000000000
--- a/be/src/runtime/memory/chunk_allocator.h
+++ /dev/null
@@ -1,99 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <atomic>
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <vector>
-
-#include "runtime/memory/mem_tracker_limiter.h"
-
-namespace doris {
-
-struct Chunk;
-class ChunkArena;
-class MetricEntity;
-class Status;
-
-// Used to allocate memory with power-of-two length.
-// This Allocator allocate memory from system and cache free chunks for
-// later use.
-//
-// ChunkAllocator has one ChunkArena for each CPU core, it will try to allocate
-// memory from current core arena firstly. In this way, there will be no lock contention
-// between concurrently-running threads. If this fails, ChunkAllocator will try to allocate
-// memory from other core's arena.
-//
-// Memory Reservation
-// ChunkAllocator has a limit about how much free chunk bytes it can reserve, above which
-// chunk will released to system memory. For the worst case, when the limits is 0, it will
-// act as allocating directly from system.
-//
-// ChunkArena will keep a separate free list for each chunk size. In common case, chunk will
-// be allocated from current core arena. In this case, there is no lock contention.
-//
-// Must call CpuInfo::init() and DorisMetrics::instance()->initialize() to achieve good performance
-// before first object is created. And call init_instance() before use instance is called.
-class ChunkAllocator {
-public:
- static void init_instance(size_t reserve_limit);
-
-#ifdef BE_TEST
- static ChunkAllocator* instance();
-#else
- static ChunkAllocator* instance() { return _s_instance; }
-#endif
-
- // Up size to 2^n length, allocate a chunk.
- Status allocate_align(size_t size, Chunk* chunk);
-
- // Free chunk allocated from this allocator
- void free(const Chunk& chunk);
-
- // Transfer the memory ownership to the chunk allocator.
- // If the chunk allocator is full, then free to the system.
- // Note: make sure that the length of 'data' is equal to size,
- // otherwise the capacity of chunk allocator will be wrong.
- void free(uint8_t* data, size_t size);
-
- void clear();
-
- int64_t mem_consumption() { return _reserved_bytes; }
-
-private:
- ChunkAllocator(size_t reserve_limit);
-
-private:
- static ChunkAllocator* _s_instance;
-
- size_t _reserve_bytes_limit;
- // When the reserved chunk memory size is greater than the limit,
- // it is allowed to steal the chunks of other arenas.
- size_t _steal_arena_limit;
- std::atomic<int64_t> _reserved_bytes;
- // each core has a ChunkArena
- std::vector<std::unique_ptr<ChunkArena>> _arenas;
-
- std::shared_ptr<MetricEntity> _chunk_allocator_metric_entity;
-
- std::unique_ptr<MemTrackerLimiter> _mem_tracker;
-};
-
-} // namespace doris
diff --git a/be/src/runtime/memory/mem_tracker_limiter.cpp b/be/src/runtime/memory/mem_tracker_limiter.cpp
index d03bd1ac005f12d..683971ecac4765d 100644
--- a/be/src/runtime/memory/mem_tracker_limiter.cpp
+++ b/be/src/runtime/memory/mem_tracker_limiter.cpp
@@ -137,7 +137,7 @@ void MemTrackerLimiter::make_process_snapshots(std::vector
process_mem_sum += it.second->current_value();
}
- snapshot.type = "tc/jemalloc_cache";
+ snapshot.type = "tc/jemalloc_free_memory";
snapshot.label = "";
snapshot.limit = -1;
snapshot.cur_consumption = MemInfo::allocator_cache_mem();
diff --git a/be/src/runtime/memory/system_allocator.cpp b/be/src/runtime/memory/system_allocator.cpp
deleted file mode 100644
index 6acec3b104359a8..000000000000000
--- a/be/src/runtime/memory/system_allocator.cpp
+++ /dev/null
@@ -1,76 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/memory/system_allocator.h"
-
-#include
-#include
-#include
-
-#include
-#include
-
-#if !defined(__APPLE__) || !defined(_POSIX_C_SOURCE)
-#include
-#else
-#include
-#endif
-
-#include "common/logging.h"
-#include "runtime/memory/mem_tracker_limiter.h"
-#include "runtime/thread_context.h"
-#include "util/sse_util.hpp"
-
-namespace {
-
-int get_page_size() {
-#if !defined(__APPLE__) || !defined(_POSIX_C_SOURCE)
- return getpagesize();
-#else
- return vm_page_size;
-#endif
-}
-
-} // namespace
-
-namespace doris {
-
-uint8_t* SystemAllocator::allocate(size_t length) {
- return allocate_via_malloc(length);
-}
-
-void SystemAllocator::free(uint8_t* ptr) {
- ::free(ptr);
-}
-
-uint8_t* SystemAllocator::allocate_via_malloc(size_t length) {
- void* ptr = nullptr;
- // try to use a whole page instead of parts of one page
- int res = posix_memalign(&ptr, get_page_size(), length);
- if (res != 0) {
- char buf[64];
- auto err = fmt::format("fail to allocate mem via posix_memalign, res={}, errmsg={}.", res,
- strerror_r(res, buf, 64));
- LOG(ERROR) << err;
- if (enable_thread_catch_bad_alloc) throw std::bad_alloc {};
- MemTrackerLimiter::print_log_process_usage(err);
- return nullptr;
- }
- return (uint8_t*)ptr;
-}
-
-} // namespace doris
diff --git a/be/src/runtime/memory/thread_mem_tracker_mgr.h b/be/src/runtime/memory/thread_mem_tracker_mgr.h
index dd7b3b8b32aa197..deafcdc241733d3 100644
--- a/be/src/runtime/memory/thread_mem_tracker_mgr.h
+++ b/be/src/runtime/memory/thread_mem_tracker_mgr.h
@@ -173,7 +173,7 @@ inline void ThreadMemTrackerMgr::consume(int64_t size) {
}
// Large memory alloc should use allocator.h
// Direct malloc or new large memory, unable to catch std::bad_alloc, BE may OOM.
- if (size > 4294967296) { // 4G
+ if (size > 1024l * 1024 * 1024) { // 1G
_stop_consume = true;
LOG(WARNING) << fmt::format("MemHook alloc large memory: {}, stacktrace:\n{}", size,
get_stack_trace());
diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp
index a422a81bea98348..66bc4a5743a6e44 100644
--- a/be/src/runtime/runtime_state.cpp
+++ b/be/src/runtime/runtime_state.cpp
@@ -300,17 +300,11 @@ Status RuntimeState::check_query_state(const std::string& msg) {
return query_status();
}
-const std::string ERROR_FILE_NAME = "error_log";
const int64_t MAX_ERROR_NUM = 50;
Status RuntimeState::create_error_log_file() {
_exec_env->load_path_mgr()->get_load_error_file_name(
_db_name, _import_label, _fragment_instance_id, &_error_log_file_path);
- // std::stringstream ss;
- // ss << load_dir() << "/" << ERROR_FILE_NAME
- // << "_" << std::hex << fragment_instance_id().hi
- // << "_" << fragment_instance_id().lo;
- // _error_log_file_path = ss.str();
std::string error_log_absolute_path =
_exec_env->load_path_mgr()->get_load_error_absolute_path(_error_log_file_path);
_error_log_file = new std::ofstream(error_log_absolute_path, std::ifstream::out);
diff --git a/be/src/runtime/user_function_cache.cpp b/be/src/runtime/user_function_cache.cpp
index 25e7405a0fee7df..f7ec0890a6427e3 100644
--- a/be/src/runtime/user_function_cache.cpp
+++ b/be/src/runtime/user_function_cache.cpp
@@ -140,8 +140,9 @@ Status UserFunctionCache::_load_entry_from_lib(const std::string& dir, const std
}
std::vector<std::string> split_parts = strings::Split(file, ".");
- if (split_parts.size() != 3) {
- return Status::InternalError("user function's name should be function_id.checksum.so");
+ if (split_parts.size() != 3 && split_parts.size() != 4) {
+ return Status::InternalError(
+ "user function's name should be function_id.checksum[.file_name].file_type");
}
int64_t function_id = std::stol(split_parts[0]);
std::string checksum = split_parts[1];
@@ -176,7 +177,7 @@ Status UserFunctionCache::_load_cached_lib() {
auto st = _load_entry_from_lib(sub_dir, file.file_name);
if (!st.ok()) {
LOG(WARNING) << "load a library failed, dir=" << sub_dir
- << ", file=" << file.file_name;
+ << ", file=" << file.file_name << ": " << st.to_string();
}
return true;
};
diff --git a/be/src/service/backend_service.cpp b/be/src/service/backend_service.cpp
index 1b4a1ec9446bee4..76722ba858d9d67 100644
--- a/be/src/service/backend_service.cpp
+++ b/be/src/service/backend_service.cpp
@@ -384,6 +384,7 @@ void BackendService::check_storage_format(TCheckStorageFormatResult& result) {
void BackendService::ingest_binlog(TIngestBinlogResult& result,
const TIngestBinlogRequest& request) {
+ constexpr uint64_t kMaxTimeoutMs = 1000;
TStatus tstatus;
Defer defer {[&result, &tstatus]() { result.__set_status(tstatus); }};
@@ -485,7 +486,7 @@ void BackendService::ingest_binlog(TIngestBinlogResult& result,
std::string binlog_info;
auto get_binlog_info_cb = [&get_binlog_info_url, &binlog_info](HttpClient* client) {
RETURN_IF_ERROR(client->init(get_binlog_info_url));
- client->set_timeout_ms(10); // 10ms
+ client->set_timeout_ms(kMaxTimeoutMs);
return client->execute(&binlog_info);
};
status = HttpClient::execute_with_retry(max_retry, 1, get_binlog_info_cb);
@@ -509,7 +510,7 @@ void BackendService::ingest_binlog(TIngestBinlogResult& result,
std::string rowset_meta_str;
auto get_rowset_meta_cb = [&get_rowset_meta_url, &rowset_meta_str](HttpClient* client) {
RETURN_IF_ERROR(client->init(get_rowset_meta_url));
- client->set_timeout_ms(10); // 10ms
+ client->set_timeout_ms(kMaxTimeoutMs);
return client->execute(&rowset_meta_str);
};
status = HttpClient::execute_with_retry(max_retry, 1, get_rowset_meta_cb);
@@ -528,7 +529,7 @@ void BackendService::ingest_binlog(TIngestBinlogResult& result,
}
// rewrite rowset meta
rowset_meta_pb.set_tablet_id(local_tablet_id);
- rowset_meta_pb.set_partition_id(local_tablet->tablet_meta()->partition_id());
+ rowset_meta_pb.set_partition_id(partition_id);
rowset_meta_pb.set_tablet_schema_hash(local_tablet->tablet_meta()->schema_hash());
rowset_meta_pb.set_txn_id(txn_id);
rowset_meta_pb.set_rowset_state(RowsetStatePB::COMMITTED);
@@ -556,7 +557,7 @@ void BackendService::ingest_binlog(TIngestBinlogResult& result,
auto get_segment_file_size_cb = [&get_segment_file_size_url,
&segment_file_size](HttpClient* client) {
RETURN_IF_ERROR(client->init(get_segment_file_size_url));
- client->set_timeout_ms(10); // 10ms
+ client->set_timeout_ms(kMaxTimeoutMs);
RETURN_IF_ERROR(client->head());
return client->get_content_length(&segment_file_size);
};
@@ -600,7 +601,7 @@ void BackendService::ingest_binlog(TIngestBinlogResult& result,
auto get_segment_file_cb = [&get_segment_file_url, &local_segment_path, segment_file_size,
estimate_timeout](HttpClient* client) {
RETURN_IF_ERROR(client->init(get_segment_file_url));
- client->set_timeout_ms(estimate_timeout * 1000); // 10ms
+ client->set_timeout_ms(estimate_timeout * 1000);
RETURN_IF_ERROR(client->download(local_segment_path));
std::error_code ec;
diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp
index 699f5989676dc86..246032ef9ac9533 100644
--- a/be/src/service/doris_main.cpp
+++ b/be/src/service/doris_main.cpp
@@ -54,6 +54,7 @@
#include "common/signal_handler.h"
#include "common/status.h"
#include "io/cache/block/block_file_cache_factory.h"
+#include "io/fs/s3_file_write_bufferpool.h"
#include "olap/options.h"
#include "olap/storage_engine.h"
#include "runtime/exec_env.h"
@@ -432,6 +433,12 @@ int main(int argc, char** argv) {
doris::ExecEnv::init(exec_env, paths);
doris::TabletSchemaCache::create_global_schema_cache();
+ // init s3 write buffer pool
+ doris::io::S3FileBufferPool* s3_buffer_pool = doris::io::S3FileBufferPool::GetInstance();
+ s3_buffer_pool->init(doris::config::s3_write_buffer_whole_size,
+ doris::config::s3_write_buffer_size,
+ exec_env->buffered_reader_prefetch_thread_pool());
+
// init and open storage engine
doris::EngineOptions options;
options.store_paths = paths;
diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp
index c0c6d4e4ea1ac64..e1ab48d2d884a9f 100644
--- a/be/src/service/internal_service.cpp
+++ b/be/src/service/internal_service.cpp
@@ -114,6 +114,7 @@
#include "vec/exec/format/json/new_json_reader.h"
#include "vec/exec/format/orc/vorc_reader.h"
#include "vec/exec/format/parquet/vparquet_reader.h"
+#include "vec/exec/scan/avro_jni_reader.h"
#include "vec/jsonb/serialize.h"
#include "vec/runtime/vdata_stream_mgr.h"
@@ -588,6 +589,14 @@ void PInternalServiceImpl::fetch_table_schema(google::protobuf::RpcController* c
file_slots, &io_ctx);
break;
}
+ case TFileFormatType::FORMAT_AVRO: {
+ // file_slots is no use
+ std::vector<SlotDescriptor*> file_slots;
+ reader = vectorized::AvroJNIReader::create_unique(profile.get(), params, range,
+ file_slots);
+ ((vectorized::AvroJNIReader*)(reader.get()))->init_fetch_table_schema_reader();
+ break;
+ }
default:
st = Status::InternalError("Not supported file format in fetch table schema: {}",
params.format_type);
diff --git a/be/src/service/point_query_executor.cpp b/be/src/service/point_query_executor.cpp
index 255d4c9a2c8650d..c6052c6a909af2d 100644
--- a/be/src/service/point_query_executor.cpp
+++ b/be/src/service/point_query_executor.cpp
@@ -245,12 +245,12 @@ Status PointQueryExecutor::_lookup_row_key() {
SCOPED_TIMER(&_profile_metrics.lookup_key_ns);
// 2. lookup row location
Status st;
- std::unordered_map segment_caches;
std::vector specified_rowsets;
{
std::shared_lock rlock(_tablet->get_header_lock());
specified_rowsets = _tablet->get_rowset_by_ids(nullptr);
}
+ std::vector<std::unique_ptr<SegmentCacheHandle>> segment_caches(specified_rowsets.size());
for (size_t i = 0; i < _row_read_ctxs.size(); ++i) {
RowLocation location;
if (!config::disable_storage_row_cache) {
diff --git a/be/src/util/faststring.cc b/be/src/util/faststring.cc
index 8eb3b6c021edc6f..cf373efec4b568d 100644
--- a/be/src/util/faststring.cc
+++ b/be/src/util/faststring.cc
@@ -43,7 +43,7 @@ void faststring::GrowArray(size_t newcapacity) {
}
capacity_ = newcapacity;
if (data_ != initial_data_) {
- Allocator::free_no_munmap(data_);
+ Allocator::free(data_);
} else {
ASAN_POISON_MEMORY_REGION(initial_data_, arraysize(initial_data_));
}
@@ -57,13 +57,13 @@ void faststring::ShrinkToFitInternal() {
if (len_ <= kInitialCapacity) {
ASAN_UNPOISON_MEMORY_REGION(initial_data_, len_);
memcpy(initial_data_, &data_[0], len_);
- Allocator::free_no_munmap(data_);
+ Allocator::free(data_);
data_ = initial_data_;
capacity_ = kInitialCapacity;
} else {
std::unique_ptr<uint8_t[]> newdata(reinterpret_cast<uint8_t*>(Allocator::alloc(len_)));
memcpy(&newdata[0], &data_[0], len_);
- Allocator::free_no_munmap(data_);
+ Allocator::free(data_);
data_ = newdata.release();
capacity_ = len_;
}
diff --git a/be/src/util/faststring.h b/be/src/util/faststring.h
index c353ec8d55fa70b..09f325d1114b2fb 100644
--- a/be/src/util/faststring.h
+++ b/be/src/util/faststring.h
@@ -54,7 +54,7 @@ class faststring : private Allocator {
~faststring() {
ASAN_UNPOISON_MEMORY_REGION(initial_data_, arraysize(initial_data_));
if (data_ != initial_data_) {
- Allocator::free_no_munmap(data_);
+ Allocator::free(data_);
}
}
diff --git a/be/src/util/jsonb_document.h b/be/src/util/jsonb_document.h
index b18d3d47e006b3b..d54e7291dd4ab68 100644
--- a/be/src/util/jsonb_document.h
+++ b/be/src/util/jsonb_document.h
@@ -72,6 +72,7 @@
#include
#include
+#include
#include
#include
#include
@@ -144,6 +145,8 @@ constexpr char WILDCARD = '*';
constexpr char MINUS = '-';
constexpr char LAST[] = "last";
constexpr char ESCAPE = '\\';
+constexpr unsigned int MEMBER_CODE = 0;
+constexpr unsigned int ARRAY_CODE = 1;
/*
* JsonbDocument is the main object that accesses and queries JSONB packed
@@ -276,35 +279,43 @@ class Stream {
skip_whitespace();
}
- void clear_legPtr() { legPtr = nullptr; }
+ void clear_leg_ptr() { leg_ptr = nullptr; }
- void set_legPtr(char* ptr) {
- clear_legPtr();
- legPtr = ptr;
+ void set_leg_ptr(char* ptr) {
+ clear_leg_ptr();
+ leg_ptr = ptr;
}
- char* get_legPtr() { return legPtr; }
+ char* get_leg_ptr() { return leg_ptr; }
- void clear_legLen() { legLen = 0; }
+ void clear_leg_len() { leg_len = 0; }
- void add_legLen() { legLen++; }
+ void add_leg_len() { leg_len++; }
- unsigned int get_legLen() { return legLen; }
+ unsigned int get_leg_len() const { return leg_len; }
void remove_escapes() {
int new_len = 0;
- for (int i = 0; i < legLen; i++) {
- if (legPtr[i] != '\\') {
- legPtr[new_len++] = legPtr[i];
+ for (int i = 0; i < leg_len; i++) {
+ if (leg_ptr[i] != '\\') {
+ leg_ptr[new_len++] = leg_ptr[i];
}
}
- legPtr[new_len] = '\0';
- legLen = new_len;
+ leg_ptr[new_len] = '\0';
+ leg_len = new_len;
}
- void set_hasEscapes(bool has) { hasEscapes = has; }
+ void set_has_escapes(bool has) { has_escapes = has; }
- bool get_hasEscapes() { return hasEscapes; }
+ bool get_has_escapes() const { return has_escapes; }
+
+ void set_is_invalid_json_path(bool has) { is_invalid_json_path = has; }
+
+ bool get_is_invalid_json_path() const { return is_invalid_json_path; }
+
+ void set_type(unsigned int code) { type = code; }
+
+ bool get_type() const { return type; }
private:
/// The current position in the stream.
@@ -314,19 +325,25 @@ class Stream {
const char* const m_end;
///path leg ptr
- char* legPtr;
+ char* leg_ptr;
///path leg len
- unsigned int legLen;
+ unsigned int leg_len;
+
+ ///Whether to contain escape characters
+ bool has_escapes = false;
- ///
- bool hasEscapes = false;
+ ///Is the json path valid
+ bool is_invalid_json_path = false;
+
+ ///type: 0 is member 1 is array
+ unsigned int type;
};
class JsonbPath {
public:
// parse json path
- static bool parsePath(Stream* stream, JsonbValue* value);
+ static bool parsePath(Stream* stream);
static bool parse_array(Stream* stream);
static bool parse_member(Stream* stream);
@@ -513,12 +530,14 @@ class JsonbValue {
const char* getValuePtr() const;
// find the JSONB value by a key path string (null terminated)
- JsonbValue* findPath(const char* key_path, hDictFind handler = nullptr) {
- return findPath(key_path, (unsigned int)strlen(key_path), handler);
+ JsonbValue* findPath(const char* key_path, bool& is_invalid_json_path,
+ hDictFind handler = nullptr) {
+ return findPath(key_path, (unsigned int)strlen(key_path), is_invalid_json_path, handler);
}
// find the JSONB value by a key path string (with length)
- JsonbValue* findPath(const char* key_path, unsigned int len, hDictFind handler);
+ JsonbValue* findPath(const char* key_path, unsigned int len, bool& is_invalid_json_path,
+ hDictFind handler);
friend class JsonbDocument;
protected:
@@ -1189,82 +1208,144 @@ inline const char* JsonbValue::getValuePtr() const {
}
inline JsonbValue* JsonbValue::findPath(const char* key_path, unsigned int kp_len,
- hDictFind handler = nullptr) {
+ bool& is_invalid_json_path, hDictFind handler = nullptr) {
if (!key_path) return nullptr;
- if (kp_len == 0) return this;
+ if (kp_len == 0) {
+ is_invalid_json_path = true;
+ return nullptr;
+ }
Stream stream(key_path, kp_len);
stream.skip_whitespace();
- if (stream.exhausted() || stream.read() != SCOPE) return nullptr;
+ if (stream.exhausted() || stream.read() != SCOPE) {
+ is_invalid_json_path = true;
+ return nullptr;
+ }
JsonbValue* pval = this;
while (pval && !stream.exhausted()) {
stream.skip_whitespace();
- stream.clear_legPtr();
- stream.clear_legLen();
+ stream.clear_leg_ptr();
+ stream.clear_leg_len();
- if (!JsonbPath::parsePath(&stream, pval)) {
+ if (!JsonbPath::parsePath(&stream)) {
+ is_invalid_json_path = stream.get_is_invalid_json_path();
return nullptr;
}
- if (stream.get_legLen() == 0) {
+ if (stream.get_leg_len() == 0) {
return nullptr;
}
- if (LIKELY(pval->type_ == JsonbType::T_Object)) {
- if (stream.get_legLen() == 1 && *stream.get_legPtr() == WILDCARD) {
- return pval;
- } else if (stream.get_hasEscapes()) {
- stream.remove_escapes();
- }
+ if (stream.get_type() == MEMBER_CODE) {
+ if (LIKELY(pval->type_ == JsonbType::T_Object)) {
+ if (stream.get_leg_len() == 1 && *stream.get_leg_ptr() == WILDCARD) {
+ return pval;
+ } else if (stream.get_has_escapes()) {
+ stream.remove_escapes();
+ }
- pval = ((ObjectVal*)pval)->find(stream.get_legPtr(), stream.get_legLen(), handler);
+ pval = ((ObjectVal*)pval)
+ ->find(stream.get_leg_ptr(), stream.get_leg_len(), handler);
- if (!pval) return nullptr;
- } else if (LIKELY(pval->type_ == JsonbType::T_Array)) {
+ if (!pval) return nullptr;
+ } else {
+ return nullptr;
+ }
+ } else if (stream.get_type() == ARRAY_CODE) {
int index = 0;
- std::string idx_string(stream.get_legPtr(), stream.get_legLen());
+ std::string_view idx_string(stream.get_leg_ptr(), stream.get_leg_len());
- if (stream.get_legLen() == 1 && *stream.get_legPtr() == WILDCARD) {
- return pval;
- } else if (std::string(stream.get_legPtr(), 4) == LAST) {
+ if (stream.get_leg_len() == 1 && *stream.get_leg_ptr() == WILDCARD) {
+ if (LIKELY(pval->type_ == JsonbType::T_Array)) {
+ stream.skip(1);
+ stream.skip_whitespace();
+ continue;
+ } else {
+ return nullptr;
+ }
+ } else if (std::equal(LAST, LAST + 4, stream.get_leg_ptr(),
+ [](char c1, char c2) {
+ return std::tolower(c1) == std::tolower(c2);
+ }) &&
+ stream.get_leg_len() >= 4) {
auto pos = idx_string.find(MINUS);
if (pos != std::string::npos) {
idx_string = idx_string.substr(pos + 1);
- size_t num = ((ArrayVal*)pval)->numElem();
- if (std::stoi(idx_string) > num) {
- return nullptr; //invalid json path
+
+ auto result = std::from_chars(idx_string.data(),
+ idx_string.data() + idx_string.size(), index);
+ if (result.ec != std::errc()) {
+ is_invalid_json_path = true;
+ return nullptr;
}
- index = num - 1 - std::stoi(idx_string);
- } else if (stream.get_legLen() == 4) {
- index = ((ArrayVal*)pval)->numElem() - 1;
+
+ if (pval->type_ == JsonbType::T_Object) {
+ if (index == 0) {
+ continue;
+ } else {
+ return nullptr;
+ }
+ } else if (LIKELY(pval->type_ == JsonbType::T_Array)) {
+ size_t num = ((ArrayVal*)pval)->numElem();
+ if (index > num) return nullptr;
+ index = num - 1 - index;
+ } else {
+ return nullptr;
+ }
+ } else if (stream.get_leg_len() == 4) {
+ if (pval->type_ == JsonbType::T_Object) {
+ continue;
+ } else if (LIKELY(pval->type_ == JsonbType::T_Array)) {
+ index = ((ArrayVal*)pval)->numElem() - 1;
+ } else {
+ return nullptr;
+ }
+
} else {
- return nullptr; //invalid json path
+ is_invalid_json_path = true;
+ return nullptr;
}
} else {
- std::string::size_type pos;
- index = std::stoi(idx_string, &pos, 10);
- if (pos != idx_string.size()) {
- return nullptr; //invalid json path
- } else if (index >= ((ArrayVal*)pval)->numElem()) {
- return nullptr; //invalid json path
+ auto result = std::from_chars(idx_string.data(),
+ idx_string.data() + idx_string.size(), index);
+ if (result.ec != std::errc()) {
+ is_invalid_json_path = true;
+ return nullptr;
+ }
+
+ if (pval->type_ == JsonbType::T_Object) {
+ if (index == 0) {
+ continue;
+ } else {
+ return nullptr;
+ }
+ } else if (LIKELY(pval->type_ == JsonbType::T_Array)) {
+ if (std::abs(index) >= ((ArrayVal*)pval)->numElem()) return nullptr;
+ } else {
+ return nullptr;
}
}
- pval = ((ArrayVal*)pval)->get(index);
+ if (index >= 0) {
+ pval = ((ArrayVal*)pval)->get(index);
+ } else {
+ pval = ((ArrayVal*)pval)->get(((ArrayVal*)pval)->numElem() + index);
+ }
}
}
return pval;
}
-inline bool JsonbPath::parsePath(Stream* stream, JsonbValue* value) {
- if (stream->peek() == BEGIN_ARRAY && value->type() == JsonbType::T_Array) {
+inline bool JsonbPath::parsePath(Stream* stream) {
+ if (stream->peek() == BEGIN_ARRAY) {
return parse_array(stream);
- } else if (stream->peek() == BEGIN_MEMBER && value->type() == JsonbType::T_Object) {
+ } else if (stream->peek() == BEGIN_MEMBER) {
return parse_member(stream);
} else {
+ stream->set_is_invalid_json_path(true);
return false; //invalid json path
}
}
@@ -1272,30 +1353,37 @@ inline bool JsonbPath::parsePath(Stream* stream, JsonbValue* value) {
inline bool JsonbPath::parse_array(Stream* stream) {
assert(stream->peek() == BEGIN_ARRAY);
stream->skip(1);
- if (stream->exhausted()) return false; //invalid json path
+ if (stream->exhausted()) {
+ stream->set_is_invalid_json_path(true);
+ return false;
+ }
if (stream->peek() == WILDCARD) {
- stream->set_legPtr(const_cast(stream->position()));
- stream->add_legLen();
+ stream->set_leg_ptr(const_cast(stream->position()));
+ stream->add_leg_len();
stream->skip(1);
if (stream->peek() == END_ARRAY) {
+ stream->set_type(ARRAY_CODE);
return true;
} else {
- return false; //invalid json path
+ stream->set_is_invalid_json_path(true);
+ return false;
}
}
- stream->set_legPtr(const_cast(stream->position()));
+ stream->set_leg_ptr(const_cast(stream->position()));
for (; !stream->exhausted() && stream->peek() != END_ARRAY; stream->skip(1)) {
- stream->add_legLen();
+ stream->add_leg_len();
}
if (!stream->exhausted() && stream->peek() == END_ARRAY) {
stream->skip(1);
+ stream->set_type(ARRAY_CODE);
return true;
} else {
- return false; //invalid json path
+ stream->set_is_invalid_json_path(true);
+ return false;
}
}
@@ -1303,31 +1391,35 @@ inline bool JsonbPath::parse_member(Stream* stream) {
// advance past the .
assert(stream->peek() == BEGIN_MEMBER);
stream->skip(1);
- if (stream->exhausted()) return false; //invalid json path
+ if (stream->exhausted()) {
+ stream->set_is_invalid_json_path(true);
+ return false;
+ }
if (stream->peek() == WILDCARD) {
- stream->set_legPtr(const_cast(stream->position()));
- stream->add_legLen();
+ stream->set_leg_ptr(const_cast(stream->position()));
+ stream->add_leg_len();
stream->skip(1);
+ stream->set_type(MEMBER_CODE);
return true;
}
- stream->set_legPtr(const_cast(stream->position()));
+ stream->set_leg_ptr(const_cast(stream->position()));
const char* left_quotation_marks = nullptr;
const char* right_quotation_marks = nullptr;
for (; !stream->exhausted(); stream->skip(1)) {
if (stream->peek() == ESCAPE) {
- stream->add_legLen();
+ stream->add_leg_len();
stream->skip(1);
- stream->add_legLen();
- stream->set_hasEscapes(true);
+ stream->add_leg_len();
+ stream->set_has_escapes(true);
continue;
} else if (stream->peek() == DOUBLE_QUOTE) {
if (left_quotation_marks == nullptr) {
left_quotation_marks = stream->position();
- stream->set_legPtr(const_cast(++left_quotation_marks));
+ stream->set_leg_ptr(const_cast(++left_quotation_marks));
continue;
} else {
right_quotation_marks = stream->position();
@@ -1340,13 +1432,16 @@ inline bool JsonbPath::parse_member(Stream* stream) {
}
}
- stream->add_legLen();
+ stream->add_leg_len();
}
if (left_quotation_marks != nullptr && right_quotation_marks == nullptr) {
+ stream->set_is_invalid_json_path(true);
return false; //invalid json path
}
+ stream->set_type(MEMBER_CODE);
+
return true;
}
diff --git a/be/src/util/jsonb_error.h b/be/src/util/jsonb_error.h
index 2ad632fb8bd2c69..49c061a32f966de 100644
--- a/be/src/util/jsonb_error.h
+++ b/be/src/util/jsonb_error.h
@@ -56,6 +56,7 @@ enum class JsonbErrType {
E_NESTING_LVL_OVERFLOW,
E_INVALID_DOCU_COMPAT,
E_EXCEPTION,
+ E_INVALID_JSON_PATH,
// new error code should always be added above
E_NUM_ERRORS
@@ -107,6 +108,7 @@ class JsonbErrMsg {
"Object or array has too many nesting levels",
"Invalid document",
"Exception throwed",
+ "Invalid Json Path",
nullptr /* E_NUM_ERRORS */
};
diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp
index 3acdc48ff086c10..200d346dede34a6 100644
--- a/be/src/util/mem_info.cpp
+++ b/be/src/util/mem_info.cpp
@@ -41,7 +41,6 @@
#include "olap/page_cache.h"
#include "olap/rowset/segment_v2/inverted_index_cache.h"
#include "olap/segment_loader.h"
-#include "runtime/memory/chunk_allocator.h"
#include "runtime/memory/mem_tracker_limiter.h"
#include "runtime/task_group/task_group.h"
#include "runtime/task_group/task_group_manager.h"
@@ -78,14 +77,18 @@ int64_t MemInfo::_s_process_full_gc_size = -1;
void MemInfo::refresh_allocator_mem() {
#if defined(ADDRESS_SANITIZER) || defined(LEAK_SANITIZER) || defined(THREAD_SANITIZER)
#elif defined(USE_JEMALLOC)
+ // 'epoch' is a special mallctl -- it updates the statistics. Without it, all
+ // the following calls will return stale values. It increments and returns
+ // the current epoch number, which might be useful to log as a sanity check.
uint64_t epoch = 0;
size_t sz = sizeof(epoch);
jemallctl("epoch", &epoch, &sz, &epoch, sz);
// https://jemalloc.net/jemalloc.3.html
- _s_allocator_cache_mem =
- get_je_metrics(fmt::format("stats.arenas.{}.tcache_bytes", MALLCTL_ARENAS_ALL)) +
- get_je_metrics("stats.metadata");
+ // https://www.bookstack.cn/read/aliyun-rds-core/4a0cdf677f62feb3.md
+ _s_allocator_cache_mem = get_je_all_arena_metrics("tcache_bytes") +
+ get_je_metrics("stats.metadata") +
+ get_je_all_arena_metrics("pdirty") * get_page_size();
_s_allocator_cache_mem_str =
PrettyPrinter::print(static_cast(_s_allocator_cache_mem), TUnit::BYTES);
_s_virtual_memory_used = get_je_metrics("stats.mapped");
@@ -104,10 +107,6 @@ void MemInfo::refresh_allocator_mem() {
void MemInfo::process_cache_gc(int64_t& freed_mem) {
// TODO, free more cache, and should free a certain percentage of capacity, not all.
int32_t min_free_size = 33554432; // 32M
- if (ChunkAllocator::instance()->mem_consumption() > min_free_size) {
- freed_mem += ChunkAllocator::instance()->mem_consumption();
- ChunkAllocator::instance()->clear();
- }
if (StoragePageCache::instance()->get_page_cache_mem_consumption(segment_v2::DATA_PAGE) >
min_free_size) {
@@ -130,6 +129,7 @@ void MemInfo::process_cache_gc(int64_t& freed_mem) {
segment_v2::PRIMARY_KEY_INDEX_PAGE);
StoragePageCache::instance()->prune(segment_v2::PRIMARY_KEY_INDEX_PAGE);
}
+ je_purge_all_arena_dirty_pages();
}
// step1: free all cache
@@ -144,7 +144,8 @@ bool MemInfo::process_minor_gc() {
std::string mem_available_str = MemInfo::sys_mem_available_str();
Defer defer {[&]() {
- LOG(INFO) << fmt::format("Process Minor GC Free Memory {} Bytes. cost(us): {}", freed_mem,
+ je_purge_all_arena_dirty_pages();
+ LOG(INFO) << fmt::format("End Minor GC, Free Memory {} Bytes. cost(us): {}", freed_mem,
watch.elapsed_time() / 1000);
}};
@@ -186,7 +187,8 @@ bool MemInfo::process_full_gc() {
std::string mem_available_str = MemInfo::sys_mem_available_str();
Defer defer {[&]() {
- LOG(INFO) << fmt::format("Process Full GC Free Memory {} Bytes. cost(us): {}", freed_mem,
+ je_purge_all_arena_dirty_pages();
+ LOG(INFO) << fmt::format("End Full GC Free, Memory {} Bytes. cost(us): {}", freed_mem,
watch.elapsed_time() / 1000);
}};
diff --git a/be/src/util/mem_info.h b/be/src/util/mem_info.h
index 12c70d8cc4dc3a4..89a66b06589e355 100644
--- a/be/src/util/mem_info.h
+++ b/be/src/util/mem_info.h
@@ -26,6 +26,12 @@
#include
#include
+#if !defined(__APPLE__) || !defined(_POSIX_C_SOURCE)
+#include
+#else
+#include
+#endif
+
#include "common/logging.h"
#ifdef USE_JEMALLOC
#include "jemalloc/jemalloc.h"
@@ -46,6 +52,14 @@ class MemInfo {
static inline bool initialized() { return _s_initialized; }
+ static int get_page_size() {
+#if !defined(__APPLE__) || !defined(_POSIX_C_SOURCE)
+ return getpagesize();
+#else
+ return vm_page_size;
+#endif
+ }
+
// Get total physical memory in bytes (if has cgroups memory limits, return the limits).
static inline int64_t physical_mem() {
DCHECK(_s_initialized);
@@ -83,6 +97,22 @@ class MemInfo {
#endif
return 0;
}
+
+ static inline int64_t get_je_all_arena_metrics(const std::string& name) {
+#ifdef USE_JEMALLOC
+ return get_je_metrics(fmt::format("stats.arenas.{}.{}", MALLCTL_ARENAS_ALL, name));
+#endif
+ return 0;
+ }
+
+ static inline void je_purge_all_arena_dirty_pages() {
+#ifdef USE_JEMALLOC
+ // Purge all unused dirty pages for arena , or for all arenas if equals MALLCTL_ARENAS_ALL.
+ jemallctl(fmt::format("arena.{}.purge", MALLCTL_ARENAS_ALL).c_str(), nullptr, nullptr,
+ nullptr, 0);
+#endif
+ }
+
static inline size_t allocator_virtual_mem() { return _s_virtual_memory_used; }
static inline size_t allocator_cache_mem() { return _s_allocator_cache_mem; }
static inline std::string allocator_cache_mem_str() { return _s_allocator_cache_mem_str; }
@@ -94,6 +124,13 @@ class MemInfo {
// obtained by the process malloc, not the physical memory actually used by the process in the OS.
static void refresh_allocator_mem();
+ /** jemalloc pdirty is number of pages within unused extents that are potentially
+ * dirty, and for which madvise() or similar has not been called.
+ *
+ * So they will be subtracted from RSS to make accounting more
+ * accurate, since those pages are not really RSS but a memory
+ * that can be used at anytime via jemalloc.
+ */
static inline void refresh_proc_mem_no_allocator_cache() {
_s_proc_mem_no_allocator_cache =
PerfCounters::get_vm_rss() - static_cast(_s_allocator_cache_mem);
diff --git a/be/src/util/network_util.cpp b/be/src/util/network_util.cpp
index 20695bbfba90388..6841e257a320439 100644
--- a/be/src/util/network_util.cpp
+++ b/be/src/util/network_util.cpp
@@ -96,7 +96,15 @@ Status hostname_to_ip_addrs(const std::string& name, std::vector* a
return Status::InternalError("Could not convert IPv4 address for: {}", name);
}
- addresses->push_back(std::string(addr_buf));
+ // add address if not exists
+ std::string address = std::string(addr_buf);
+ if (std::find(addresses->begin(), addresses->end(), address) != addresses->end()) {
+ LOG(WARNING) << "Repeated ip addresses has been found for host: " << name
+ << ", ip address:" << address
+ << ", please check your network configuration";
+ } else {
+ addresses->push_back(address);
+ }
it = it->ai_next;
}
diff --git a/be/src/util/ref_count_closure.h b/be/src/util/ref_count_closure.h
index fe6efa761517b65..d2fbd2fd14e863c 100644
--- a/be/src/util/ref_count_closure.h
+++ b/be/src/util/ref_count_closure.h
@@ -31,7 +31,7 @@ template
class RefCountClosure : public google::protobuf::Closure {
public:
RefCountClosure() : _refs(0) {}
- ~RefCountClosure() {}
+ ~RefCountClosure() override = default;
void ref() { _refs.fetch_add(1); }
diff --git a/be/src/util/runtime_profile.cpp b/be/src/util/runtime_profile.cpp
index b7cd3a63af9a0d0..f385256ab210a29 100644
--- a/be/src/util/runtime_profile.cpp
+++ b/be/src/util/runtime_profile.cpp
@@ -38,8 +38,6 @@ namespace doris {
// Thread counters name
static const std::string THREAD_TOTAL_TIME = "TotalWallClockTime";
-static const std::string THREAD_USER_TIME = "UserTime";
-static const std::string THREAD_SYS_TIME = "SysTime";
static const std::string THREAD_VOLUNTARY_CONTEXT_SWITCHES = "VoluntaryContextSwitches";
static const std::string THREAD_INVOLUNTARY_CONTEXT_SWITCHES = "InvoluntaryContextSwitches";
diff --git a/be/src/util/slice.h b/be/src/util/slice.h
index babedbe893d001d..57865b50e3e65db 100644
--- a/be/src/util/slice.h
+++ b/be/src/util/slice.h
@@ -279,7 +279,7 @@ class OwnedSlice : private Allocator {
return *this;
}
- ~OwnedSlice() { Allocator::free_no_munmap(_slice.data); }
+ ~OwnedSlice() { Allocator::free(_slice.data); }
const Slice& slice() const { return _slice; }
diff --git a/be/src/util/system_metrics.cpp b/be/src/util/system_metrics.cpp
index fa8f5a181a7bec9..ee7db9494c2e970 100644
--- a/be/src/util/system_metrics.cpp
+++ b/be/src/util/system_metrics.cpp
@@ -117,6 +117,12 @@ DEFINE_MEMORY_GAUGE_METRIC(jemalloc_metadata_bytes, MetricUnit::BYTES);
DEFINE_MEMORY_GAUGE_METRIC(jemalloc_resident_bytes, MetricUnit::BYTES);
DEFINE_MEMORY_GAUGE_METRIC(jemalloc_mapped_bytes, MetricUnit::BYTES);
DEFINE_MEMORY_GAUGE_METRIC(jemalloc_retained_bytes, MetricUnit::BYTES);
+DEFINE_MEMORY_GAUGE_METRIC(jemalloc_tcache_bytes, MetricUnit::BYTES);
+DEFINE_MEMORY_GAUGE_METRIC(jemalloc_pactive_num, MetricUnit::NOUNIT);
+DEFINE_MEMORY_GAUGE_METRIC(jemalloc_pdirty_num, MetricUnit::NOUNIT);
+DEFINE_MEMORY_GAUGE_METRIC(jemalloc_pmuzzy_num, MetricUnit::NOUNIT);
+DEFINE_MEMORY_GAUGE_METRIC(jemalloc_dirty_purged_num, MetricUnit::NOUNIT);
+DEFINE_MEMORY_GAUGE_METRIC(jemalloc_muzzy_purged_num, MetricUnit::NOUNIT);
#endif
struct MemoryMetrics {
@@ -142,6 +148,12 @@ struct MemoryMetrics {
INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_resident_bytes);
INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_mapped_bytes);
INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_retained_bytes);
+ INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_tcache_bytes);
+ INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_pactive_num);
+ INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_pdirty_num);
+ INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_pmuzzy_num);
+ INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_dirty_purged_num);
+ INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_muzzy_purged_num);
#endif
}
@@ -167,6 +179,12 @@ struct MemoryMetrics {
IntGauge* memory_jemalloc_resident_bytes;
IntGauge* memory_jemalloc_mapped_bytes;
IntGauge* memory_jemalloc_retained_bytes;
+ IntGauge* memory_jemalloc_tcache_bytes;
+ IntGauge* memory_jemalloc_pactive_num;
+ IntGauge* memory_jemalloc_pdirty_num;
+ IntGauge* memory_jemalloc_pmuzzy_num;
+ IntGauge* memory_jemalloc_dirty_purged_num;
+ IntGauge* memory_jemalloc_muzzy_purged_num;
#endif
};
@@ -457,6 +475,18 @@ void SystemMetrics::update_allocator_metrics() {
MemInfo::get_je_metrics("stats.mapped"));
_memory_metrics->memory_jemalloc_retained_bytes->set_value(
MemInfo::get_je_metrics("stats.retained"));
+ _memory_metrics->memory_jemalloc_tcache_bytes->set_value(
+ MemInfo::get_je_all_arena_metrics("tcache_bytes"));
+ _memory_metrics->memory_jemalloc_pactive_num->set_value(
+ MemInfo::get_je_all_arena_metrics("pactive"));
+ _memory_metrics->memory_jemalloc_pdirty_num->set_value(
+ MemInfo::get_je_all_arena_metrics("pdirty"));
+ _memory_metrics->memory_jemalloc_pmuzzy_num->set_value(
+ MemInfo::get_je_all_arena_metrics("pmuzzy"));
+ _memory_metrics->memory_jemalloc_dirty_purged_num->set_value(
+ MemInfo::get_je_all_arena_metrics("dirty_purged"));
+ _memory_metrics->memory_jemalloc_muzzy_purged_num->set_value(
+ MemInfo::get_je_all_arena_metrics("muzzy_purged"));
#else
_memory_metrics->memory_tcmalloc_allocated_bytes->set_value(
MemInfo::get_tc_metrics("generic.total_physical_bytes"));
diff --git a/be/src/vec/aggregate_functions/aggregate_function_avg.h b/be/src/vec/aggregate_functions/aggregate_function_avg.h
index 118c6b945fb992f..ff1de6b2e2761fc 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_avg.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_avg.h
@@ -237,8 +237,9 @@ class AggregateFunctionAvg final
IColumn& to) const override {
auto& col = assert_cast(to);
col.set_item_size(sizeof(Data));
- col.resize(1);
- *reinterpret_cast(col.get_data().data()) = this->data(place);
+ size_t old_size = col.size();
+ col.resize(old_size + 1);
+ *(reinterpret_cast(col.get_data().data()) + old_size) = this->data(place);
}
MutableColumnPtr create_serialize_column() const override {
diff --git a/be/src/vec/aggregate_functions/aggregate_function_count.h b/be/src/vec/aggregate_functions/aggregate_function_count.h
index 1cea35958ab9f52..35da6c3406fcc03 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_count.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_count.h
@@ -150,8 +150,9 @@ class AggregateFunctionCount final
auto& col = assert_cast(to);
DCHECK(col.item_size() == sizeof(Data))
<< "size is not equal: " << col.item_size() << " " << sizeof(Data);
- col.resize(1);
- reinterpret_cast(col.get_data().data())->count =
+ size_t old_size = col.size();
+ col.resize(old_size + 1);
+ (reinterpret_cast(col.get_data().data()) + old_size)->count =
AggregateFunctionCount::data(place).count;
}
diff --git a/be/src/vec/aggregate_functions/aggregate_function_min_max.h b/be/src/vec/aggregate_functions/aggregate_function_min_max.h
index 550e83a7e3cc204..5d3bee95c99e2f6 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_min_max.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_min_max.h
@@ -628,8 +628,9 @@ class AggregateFunctionsSingleValue final
IColumn& to) const override {
if constexpr (Data::IsFixedLength) {
auto& col = assert_cast(to);
- col.resize(1);
- *reinterpret_cast(col.get_data().data()) = this->data(place);
+ size_t old_size = col.size();
+ col.resize(old_size + 1);
+ *(reinterpret_cast(col.get_data().data()) + old_size) = this->data(place);
} else {
Base::serialize_without_key_to_column(place, to);
}
diff --git a/be/src/vec/aggregate_functions/aggregate_function_sum.h b/be/src/vec/aggregate_functions/aggregate_function_sum.h
index 3773efa5bb8fd9f..cab803bc7a3b821 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_sum.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_sum.h
@@ -182,8 +182,9 @@ class AggregateFunctionSum final
auto& col = assert_cast(to);
DCHECK(col.item_size() == sizeof(Data))
<< "size is not equal: " << col.item_size() << " " << sizeof(Data);
- col.resize(1);
- reinterpret_cast(col.get_data().data())->sum = this->data(place).sum;
+ size_t old_size = col.size();
+ col.resize(old_size + 1);
+ (reinterpret_cast(col.get_data().data()) + old_size)->sum = this->data(place).sum;
}
MutableColumnPtr create_serialize_column() const override {
diff --git a/be/src/vec/aggregate_functions/aggregate_function_window.cpp b/be/src/vec/aggregate_functions/aggregate_function_window.cpp
index bacdd45131584c1..fdc8a0bf861d634 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_window.cpp
+++ b/be/src/vec/aggregate_functions/aggregate_function_window.cpp
@@ -43,7 +43,7 @@ AggregateFunctionPtr create_function_lead_lag_first_last(const String& name,
if (which.idx == TypeIndex::TYPE) \
return std::make_shared>>>(argument_types);
- TYPE_TO_BASIC_COLUMN_TYPE(DISPATCH)
+ TYPE_TO_COLUMN_TYPE(DISPATCH)
#undef DISPATCH
LOG(WARNING) << "with unknowed type, failed in create_aggregate_function_" << name
@@ -99,4 +99,4 @@ void register_aggregate_function_window_lead_lag_first_last(
factory.register_function_both("last_value", create_aggregate_function_window_last);
}
-} // namespace doris::vectorized
\ No newline at end of file
+} // namespace doris::vectorized
diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index 00f1824b9976266..b291ba24430be84 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -365,7 +365,7 @@ class IColumn : public COW {
/// On subsequent calls of this method for sequence of column values of arbitrary types,
/// passed bytes to hash must identify sequence of values unambiguously.
virtual void update_hash_with_value(size_t n, SipHash& hash) const {
- LOG(FATAL) << "update_hash_with_value siphash not supported";
+ LOG(FATAL) << get_name() << " update_hash_with_value siphash not supported";
}
/// Update state of hash function with value of n elements to avoid the virtual function call
@@ -374,7 +374,7 @@ class IColumn : public COW {
/// do xxHash here, faster than other hash method
virtual void update_hashes_with_value(std::vector& hashes,
const uint8_t* __restrict null_data = nullptr) const {
- LOG(FATAL) << "update_hashes_with_value siphash not supported";
+ LOG(FATAL) << get_name() << " update_hashes_with_value siphash not supported";
}
/// Update state of hash function with value of n elements to avoid the virtual function call
@@ -383,7 +383,11 @@ class IColumn : public COW {
/// do xxHash here, faster than other sip hash
virtual void update_hashes_with_value(uint64_t* __restrict hashes,
const uint8_t* __restrict null_data = nullptr) const {
- LOG(FATAL) << "update_hashes_with_value xxhash not supported";
+ LOG(FATAL) << get_name() << " update_hashes_with_value xxhash not supported";
+ }
+
+ virtual void update_xxHash_with_value(size_t n, uint64_t& hash) const {
+ LOG(FATAL) << get_name() << " update_hash_with_value xxhash not supported";
}
/// Update state of crc32 hash function with value of n elements to avoid the virtual function call
@@ -391,7 +395,11 @@ class IColumn : public COW {
/// means all element need to do hash function, else only *null_data != 0 need to do hash func
virtual void update_crcs_with_value(std::vector& hash, PrimitiveType type,
const uint8_t* __restrict null_data = nullptr) const {
- LOG(FATAL) << "update_crcs_with_value not supported";
+ LOG(FATAL) << get_name() << "update_crcs_with_value not supported";
+ }
+
+ virtual void update_crc_with_value(size_t n, uint64_t& hash) const {
+ LOG(FATAL) << get_name() << " update_crc_with_value not supported";
}
/** Removes elements that don't match the filter.
diff --git a/be/src/vec/columns/column_array.cpp b/be/src/vec/columns/column_array.cpp
index 4215bd36bdf6811..c5e35e5bb9b4ece 100644
--- a/be/src/vec/columns/column_array.cpp
+++ b/be/src/vec/columns/column_array.cpp
@@ -271,6 +271,69 @@ void ColumnArray::update_hash_with_value(size_t n, SipHash& hash) const {
for (size_t i = 0; i < array_size; ++i) get_data().update_hash_with_value(offset + i, hash);
}
+void ColumnArray::update_hashes_with_value(std::vector& hashes,
+ const uint8_t* __restrict null_data) const {
+ SIP_HASHES_FUNCTION_COLUMN_IMPL();
+}
+
+// for every array row calculate xxHash
+void ColumnArray::update_xxHash_with_value(size_t n, uint64_t& hash) const {
+ size_t elem_size = size_at(n);
+ size_t offset = offset_at(n);
+ hash = HashUtil::xxHash64WithSeed(reinterpret_cast(&elem_size), sizeof(elem_size),
+ hash);
+ for (auto i = 0; i < elem_size; ++i) {
+ get_data().update_xxHash_with_value(offset + i, hash);
+ }
+}
+
+// for every array row calculate crcHash
+void ColumnArray::update_crc_with_value(size_t n, uint64_t& crc) const {
+ size_t elem_size = size_at(n);
+ size_t offset = offset_at(n);
+
+ crc = HashUtil::zlib_crc_hash(reinterpret_cast(&elem_size), sizeof(elem_size),
+ crc);
+ for (auto i = 0; i < elem_size; ++i) {
+ get_data().update_crc_with_value(offset + i, crc);
+ }
+}
+
+void ColumnArray::update_hashes_with_value(uint64_t* __restrict hashes,
+ const uint8_t* __restrict null_data) const {
+ auto s = size();
+ if (null_data) {
+ for (size_t i = 0; i < s; ++i) {
+ if (null_data[i] == 0) {
+ update_xxHash_with_value(i, hashes[i]);
+ }
+ }
+ } else {
+ for (size_t i = 0; i < s; ++i) {
+ update_xxHash_with_value(i, hashes[i]);
+ }
+ }
+}
+
+void ColumnArray::update_crcs_with_value(std::vector& hash, PrimitiveType type,
+ const uint8_t* __restrict null_data) const {
+ auto s = hash.size();
+ DCHECK(s == size());
+
+ if (null_data) {
+ for (size_t i = 0; i < s; ++i) {
+ // every row
+ if (null_data[i] == 0) {
+ update_crc_with_value(i, hash[i]);
+ }
+ }
+ } else {
+ for (size_t i = 0; i < s; ++i) {
+ update_crc_with_value(i, hash[i]);
+ }
+ }
+}
+
void ColumnArray::insert(const Field& x) {
const Array& array = doris::vectorized::get(x);
size_t size = array.size();
diff --git a/be/src/vec/columns/column_array.h b/be/src/vec/columns/column_array.h
index 4f08c269fb7e6e2..2e1c96a2c5e7f23 100644
--- a/be/src/vec/columns/column_array.h
+++ b/be/src/vec/columns/column_array.h
@@ -139,6 +139,18 @@ class ColumnArray final : public COWHelper {
StringRef serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const override;
const char* deserialize_and_insert_from_arena(const char* pos) override;
void update_hash_with_value(size_t n, SipHash& hash) const override;
+ void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
+ void update_crc_with_value(size_t n, uint64_t& crc) const override;
+
+ void update_hashes_with_value(std::vector& hashes,
+ const uint8_t* __restrict null_data) const override;
+
+ void update_hashes_with_value(uint64_t* __restrict hashes,
+ const uint8_t* __restrict null_data = nullptr) const override;
+
+ void update_crcs_with_value(std::vector& hash, PrimitiveType type,
+ const uint8_t* __restrict null_data = nullptr) const override;
+
void insert_range_from(const IColumn& src, size_t start, size_t length) override;
void insert(const Field& x) override;
void insert_from(const IColumn& src_, size_t n) override;
@@ -240,6 +252,8 @@ class ColumnArray final : public COWHelper {
ColumnPtr index(const IColumn& indexes, size_t limit) const override;
private:
+ // [[2,1,5,9,1], [1,2,4]] --> data column [2,1,5,9,1,1,2,4], offset[-1] = 0, offset[0] = 5, offset[1] = 8
+ // [[[2,1,5],[9,1]], [[1,2]]] --> data column [3 column array], offset[-1] = 0, offset[0] = 2, offset[1] = 3
WrappedPtr data;
WrappedPtr offsets;
diff --git a/be/src/vec/columns/column_const.h b/be/src/vec/columns/column_const.h
index bb17f7eb0417a3e..feeb0608a26d048 100644
--- a/be/src/vec/columns/column_const.h
+++ b/be/src/vec/columns/column_const.h
@@ -152,6 +152,19 @@ class ColumnConst final : public COWHelper {
data->serialize_vec(keys, num_rows, max_row_byte_size);
}
+ void update_xxHash_with_value(size_t n, uint64_t& hash) const override {
+ auto real_data = data->get_data_at(0);
+ if (real_data.data == nullptr) {
+ hash = HashUtil::xxHash64NullWithSeed(hash);
+ } else {
+ hash = HashUtil::xxHash64WithSeed(real_data.data, real_data.size, hash);
+ }
+ }
+
+ void update_crc_with_value(size_t n, uint64_t& crc) const override {
+ get_data_column_ptr()->update_crc_with_value(n, crc);
+ }
+
void serialize_vec_with_null_map(std::vector& keys, size_t num_rows,
const uint8_t* null_map,
size_t max_row_byte_size) const override {
@@ -165,6 +178,7 @@ class ColumnConst final : public COWHelper {
void update_hashes_with_value(std::vector& hashes,
const uint8_t* __restrict null_data) const override;
+ // (TODO.Amory) here may not use column_const update hash, and PrimitiveType is not used.
void update_crcs_with_value(std::vector& hashes, PrimitiveType type,
const uint8_t* __restrict null_data) const override;
diff --git a/be/src/vec/columns/column_decimal.cpp b/be/src/vec/columns/column_decimal.cpp
index 069f195c4a86e9b..e0b8fef05624470 100644
--- a/be/src/vec/columns/column_decimal.cpp
+++ b/be/src/vec/columns/column_decimal.cpp
@@ -137,6 +137,19 @@ void ColumnDecimal::update_hashes_with_value(std::vector& hashes,
SIP_HASHES_FUNCTION_COLUMN_IMPL();
}
+template