diff --git a/src/duckdb/extension/parquet/include/parquet-extension.hpp b/src/duckdb/extension/parquet/include/parquet-extension.hpp deleted file mode 100644 index d24eeb6a8..000000000 --- a/src/duckdb/extension/parquet/include/parquet-extension.hpp +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include "duckdb.hpp" - -namespace duckdb { - -class ParquetExtension : public Extension { -public: - void Load(DuckDB &db) override; - std::string Name() override; -}; - -} // namespace duckdb diff --git a/src/duckdb/extension/parquet/parquet-extension.cpp b/src/duckdb/extension/parquet/parquet-extension.cpp deleted file mode 100644 index cc1a59c12..000000000 --- a/src/duckdb/extension/parquet/parquet-extension.cpp +++ /dev/null @@ -1,824 +0,0 @@ -#define DUCKDB_EXTENSION_MAIN - -#include "parquet-extension.hpp" - -#include "duckdb.hpp" -#include "parquet_metadata.hpp" -#include "parquet_reader.hpp" -#include "parquet_writer.hpp" -#include "zstd_file_system.hpp" - -#include -#include -#include -#include -#include -#ifndef DUCKDB_AMALGAMATION -#include "duckdb/catalog/catalog.hpp" -#include "duckdb/common/constants.hpp" -#include "duckdb/common/enums/file_compression_type.hpp" -#include "duckdb/common/field_writer.hpp" -#include "duckdb/common/file_system.hpp" -#include "duckdb/common/types/chunk_collection.hpp" -#include "duckdb/function/copy_function.hpp" -#include "duckdb/function/table_function.hpp" -#include "duckdb/main/client_context.hpp" -#include "duckdb/main/config.hpp" -#include "duckdb/parser/expression/constant_expression.hpp" -#include "duckdb/parser/expression/function_expression.hpp" -#include "duckdb/parser/parsed_data/create_copy_function_info.hpp" -#include "duckdb/parser/parsed_data/create_table_function_info.hpp" -#include "duckdb/parser/tableref/table_function_ref.hpp" -#include "duckdb/planner/operator/logical_get.hpp" -#include "duckdb/storage/statistics/base_statistics.hpp" -#include "duckdb/catalog/catalog_entry/table_function_catalog_entry.hpp" -#include "duckdb/common/multi_file_reader.hpp" -#include "duckdb/storage/table/row_group.hpp" -#include "duckdb/main/extension_util.hpp" -#endif - -namespace duckdb { - -struct ParquetReadBindData : public TableFunctionData { - shared_ptr initial_reader; - vector files; - atomic chunk_count; - atomic cur_file; - vector names; - vector types; - - // The union readers are created (when parquet union_by_name option is on) during binding - // Those readers can be re-used during ParquetParallelStateNext - vector> union_readers; - - // These come from the initial_reader, but need to be stored in case the initial_reader is removed by a filter - idx_t initial_file_cardinality; - idx_t initial_file_row_groups; - ParquetOptions parquet_options; - MultiFileReaderBindData reader_bind; - - void Initialize(shared_ptr reader) { - initial_reader = std::move(reader); - initial_file_cardinality = initial_reader->NumRows(); - initial_file_row_groups = initial_reader->NumRowGroups(); - parquet_options = initial_reader->parquet_options; - } -}; - -struct ParquetReadLocalState : public LocalTableFunctionState { - shared_ptr reader; - ParquetReaderScanState scan_state; - bool is_parallel; - idx_t batch_index; - idx_t file_index; - //! The DataChunk containing all read columns (even filter columns that are immediately removed) - DataChunk all_columns; -}; - -struct ParquetReadGlobalState : public GlobalTableFunctionState { - mutex lock; - - //! The initial reader from the bind phase - shared_ptr initial_reader; - //! 
Currently opened readers - vector> readers; - //! Flag to indicate a file is being opened - vector file_opening; - //! Mutexes to wait for a file that is currently being opened - unique_ptr file_mutexes; - //! Signal to other threads that a file failed to open, letting every thread abort. - bool error_opening_file = false; - - //! Index of file currently up for scanning - idx_t file_index; - //! Index of row group within file currently up for scanning - idx_t row_group_index; - //! Batch index of the next row group to be scanned - idx_t batch_index; - - idx_t max_threads; - vector projection_ids; - vector scanned_types; - vector column_ids; - TableFilterSet *filters; - - idx_t MaxThreads() const override { - return max_threads; - } - - bool CanRemoveFilterColumns() const { - return !projection_ids.empty(); - } -}; - -struct ParquetWriteBindData : public TableFunctionData { - vector sql_types; - vector column_names; - duckdb_parquet::format::CompressionCodec::type codec = duckdb_parquet::format::CompressionCodec::SNAPPY; - idx_t row_group_size = RowGroup::ROW_GROUP_SIZE; -}; - -struct ParquetWriteGlobalState : public GlobalFunctionData { - unique_ptr writer; -}; - -struct ParquetWriteLocalState : public LocalFunctionData { - explicit ParquetWriteLocalState(ClientContext &context, const vector &types) - : buffer(Allocator::Get(context), types) { - } - - ColumnDataCollection buffer; -}; - -void ParquetOptions::Serialize(FieldWriter &writer) const { - writer.WriteField(binary_as_string); - writer.WriteField(file_row_number); - writer.WriteSerializable(file_options); -} - -void ParquetOptions::Deserialize(FieldReader &reader) { - binary_as_string = reader.ReadRequired(); - file_row_number = reader.ReadRequired(); - file_options = reader.ReadRequiredSerializable(); -} - -BindInfo ParquetGetBatchInfo(const FunctionData *bind_data) { - auto bind_info = BindInfo(ScanType::PARQUET); - auto &parquet_bind = bind_data->Cast(); - vector file_path; - for (auto &path : parquet_bind.files) { - file_path.emplace_back(path); - } - bind_info.InsertOption("file_path", Value::LIST(LogicalType::VARCHAR, file_path)); - bind_info.InsertOption("binary_as_string", Value::BOOLEAN(parquet_bind.parquet_options.binary_as_string)); - bind_info.InsertOption("file_row_number", Value::BOOLEAN(parquet_bind.parquet_options.file_row_number)); - parquet_bind.parquet_options.file_options.AddBatchInfo(bind_info); - return bind_info; -} - -class ParquetScanFunction { -public: - static TableFunctionSet GetFunctionSet() { - TableFunction table_function("parquet_scan", {LogicalType::VARCHAR}, ParquetScanImplementation, ParquetScanBind, - ParquetScanInitGlobal, ParquetScanInitLocal); - table_function.statistics = ParquetScanStats; - table_function.cardinality = ParquetCardinality; - table_function.table_scan_progress = ParquetProgress; - table_function.named_parameters["binary_as_string"] = LogicalType::BOOLEAN; - table_function.named_parameters["file_row_number"] = LogicalType::BOOLEAN; - table_function.named_parameters["compression"] = LogicalType::VARCHAR; - MultiFileReader::AddParameters(table_function); - table_function.get_batch_index = ParquetScanGetBatchIndex; - table_function.serialize = ParquetScanSerialize; - table_function.deserialize = ParquetScanDeserialize; - table_function.get_batch_info = ParquetGetBatchInfo; - - table_function.projection_pushdown = true; - table_function.filter_pushdown = true; - table_function.filter_prune = true; - table_function.pushdown_complex_filter = ParquetComplexFilterPushdown; - return 
MultiFileReader::CreateFunctionSet(table_function); - } - - static unique_ptr ParquetReadBind(ClientContext &context, CopyInfo &info, - vector &expected_names, - vector &expected_types) { - D_ASSERT(expected_names.size() == expected_types.size()); - ParquetOptions parquet_options(context); - - for (auto &option : info.options) { - auto loption = StringUtil::Lower(option.first); - if (loption == "compression" || loption == "codec") { - // CODEC option has no effect on parquet read: we determine codec from the file - continue; - } else if (loption == "binary_as_string") { - parquet_options.binary_as_string = true; - } else if (loption == "file_row_number") { - parquet_options.file_row_number = true; - } else { - throw NotImplementedException("Unsupported option for COPY FROM parquet: %s", option.first); - } - } - - auto files = MultiFileReader::GetFileList(context, Value(info.file_path), "Parquet"); - return ParquetScanBindInternal(context, std::move(files), expected_types, expected_names, parquet_options); - } - - static unique_ptr ParquetScanStats(ClientContext &context, const FunctionData *bind_data_p, - column_t column_index) { - auto &bind_data = bind_data_p->Cast(); - - if (IsRowIdColumnId(column_index)) { - return nullptr; - } - - // NOTE: we do not want to parse the Parquet metadata for the sole purpose of getting column statistics - - auto &config = DBConfig::GetConfig(context); - if (bind_data.files.size() < 2) { - if (bind_data.initial_reader) { - // most common path, scanning single parquet file - return bind_data.initial_reader->ReadStatistics(bind_data.names[column_index]); - } else if (!config.options.object_cache_enable) { - // our initial reader was reset - return nullptr; - } - } else if (config.options.object_cache_enable) { - // multiple files, object cache enabled: merge statistics - unique_ptr overall_stats; - - auto &cache = ObjectCache::GetObjectCache(context); - // for more than one file, we could be lucky and metadata for *every* file is in the object cache (if - // enabled at all) - FileSystem &fs = FileSystem::GetFileSystem(context); - - for (idx_t file_idx = 0; file_idx < bind_data.files.size(); file_idx++) { - auto &file_name = bind_data.files[file_idx]; - auto metadata = cache.Get(file_name); - if (!metadata) { - // missing metadata entry in cache, no usable stats - return nullptr; - } - auto handle = fs.OpenFile(file_name, FileFlags::FILE_FLAGS_READ); - // we need to check if the metadata cache entries are current - if (fs.GetLastModifiedTime(*handle) >= metadata->read_time) { - // missing or invalid metadata entry in cache, no usable stats overall - return nullptr; - } - ParquetReader reader(context, bind_data.parquet_options, metadata); - // get and merge stats for file - auto file_stats = reader.ReadStatistics(bind_data.names[column_index]); - if (!file_stats) { - return nullptr; - } - if (overall_stats) { - overall_stats->Merge(*file_stats); - } else { - overall_stats = std::move(file_stats); - } - } - // success! - return overall_stats; - } - - // multiple files and no object cache, no luck! 
- return nullptr; - } - - static unique_ptr ParquetScanBindInternal(ClientContext &context, vector files, - vector &return_types, vector &names, - ParquetOptions parquet_options) { - auto result = make_uniq(); - result->files = std::move(files); - result->reader_bind = - MultiFileReader::BindReader(context, result->types, result->names, *result, parquet_options); - if (return_types.empty()) { - // no expected types - just copy the types - return_types = result->types; - names = result->names; - } else { - if (return_types.size() != result->types.size()) { - throw std::runtime_error(StringUtil::Format( - "Failed to read file \"%s\" - column count mismatch: expected %d columns but found %d", - result->files[0], return_types.size(), result->types.size())); - } - // expected types - overwrite the types we want to read instead - result->types = return_types; - } - return std::move(result); - } - - static unique_ptr ParquetScanBind(ClientContext &context, TableFunctionBindInput &input, - vector &return_types, vector &names) { - auto files = MultiFileReader::GetFileList(context, input.inputs[0], "Parquet"); - ParquetOptions parquet_options(context); - for (auto &kv : input.named_parameters) { - auto loption = StringUtil::Lower(kv.first); - if (MultiFileReader::ParseOption(kv.first, kv.second, parquet_options.file_options)) { - continue; - } - if (loption == "binary_as_string") { - parquet_options.binary_as_string = BooleanValue::Get(kv.second); - } else if (loption == "file_row_number") { - parquet_options.file_row_number = BooleanValue::Get(kv.second); - } - } - if (parquet_options.file_options.auto_detect_hive_partitioning) { - parquet_options.file_options.hive_partitioning = MultiFileReaderOptions::AutoDetectHivePartitioning(files); - } - return ParquetScanBindInternal(context, std::move(files), return_types, names, parquet_options); - } - - static double ParquetProgress(ClientContext &context, const FunctionData *bind_data_p, - const GlobalTableFunctionState *global_state) { - auto &bind_data = bind_data_p->Cast(); - if (bind_data.files.empty()) { - return 100.0; - } - if (bind_data.initial_file_cardinality == 0) { - return (100.0 * (bind_data.cur_file + 1)) / bind_data.files.size(); - } - auto percentage = (bind_data.chunk_count * STANDARD_VECTOR_SIZE * 100.0 / bind_data.initial_file_cardinality) / - bind_data.files.size(); - percentage += 100.0 * bind_data.cur_file / bind_data.files.size(); - return percentage; - } - - static unique_ptr - ParquetScanInitLocal(ExecutionContext &context, TableFunctionInitInput &input, GlobalTableFunctionState *gstate_p) { - auto &bind_data = input.bind_data->Cast(); - auto &gstate = gstate_p->Cast(); - - auto result = make_uniq(); - result->is_parallel = true; - result->batch_index = 0; - if (input.CanRemoveFilterColumns()) { - result->all_columns.Initialize(context.client, gstate.scanned_types); - } - if (!ParquetParallelStateNext(context.client, bind_data, *result, gstate)) { - return nullptr; - } - return std::move(result); - } - - static unique_ptr ParquetScanInitGlobal(ClientContext &context, - TableFunctionInitInput &input) { - auto &bind_data = input.bind_data->CastNoConst(); - auto result = make_uniq(); - - result->file_opening = vector(bind_data.files.size(), false); - result->file_mutexes = unique_ptr(new mutex[bind_data.files.size()]); - if (bind_data.files.empty()) { - result->initial_reader = nullptr; - } else { - result->readers = std::move(bind_data.union_readers); - if (result->readers.size() != bind_data.files.size()) { - result->readers = 
vector>(bind_data.files.size(), nullptr); - } - if (bind_data.initial_reader) { - result->initial_reader = std::move(bind_data.initial_reader); - result->readers[0] = result->initial_reader; - } else if (result->readers[0]) { - result->initial_reader = result->readers[0]; - } else { - result->initial_reader = - make_shared(context, bind_data.files[0], bind_data.parquet_options); - result->readers[0] = result->initial_reader; - } - } - for (auto &reader : result->readers) { - if (!reader) { - continue; - } - MultiFileReader::InitializeReader(*reader, bind_data.parquet_options.file_options, bind_data.reader_bind, - bind_data.types, bind_data.names, input.column_ids, input.filters, - bind_data.files[0]); - } - - result->column_ids = input.column_ids; - result->filters = input.filters.get(); - result->row_group_index = 0; - result->file_index = 0; - result->batch_index = 0; - result->max_threads = ParquetScanMaxThreads(context, input.bind_data.get()); - if (input.CanRemoveFilterColumns()) { - result->projection_ids = input.projection_ids; - const auto table_types = bind_data.types; - for (const auto &col_idx : input.column_ids) { - if (IsRowIdColumnId(col_idx)) { - result->scanned_types.emplace_back(LogicalType::ROW_TYPE); - } else { - result->scanned_types.push_back(table_types[col_idx]); - } - } - } - return std::move(result); - } - - static idx_t ParquetScanGetBatchIndex(ClientContext &context, const FunctionData *bind_data_p, - LocalTableFunctionState *local_state, - GlobalTableFunctionState *global_state) { - auto &data = local_state->Cast(); - return data.batch_index; - } - - static void ParquetScanSerialize(FieldWriter &writer, const FunctionData *bind_data_p, - const TableFunction &function) { - auto &bind_data = bind_data_p->Cast(); - writer.WriteList(bind_data.files); - writer.WriteRegularSerializableList(bind_data.types); - writer.WriteList(bind_data.names); - bind_data.parquet_options.Serialize(writer); - } - - static unique_ptr ParquetScanDeserialize(PlanDeserializationState &state, FieldReader &reader, - TableFunction &function) { - auto &context = state.context; - auto files = reader.ReadRequiredList(); - auto types = reader.ReadRequiredSerializableList(); - auto names = reader.ReadRequiredList(); - ParquetOptions options(context); - options.Deserialize(reader); - - return ParquetScanBindInternal(context, files, types, names, options); - } - - static void ParquetScanImplementation(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) { - if (!data_p.local_state) { - return; - } - auto &data = data_p.local_state->Cast(); - auto &gstate = data_p.global_state->Cast(); - auto &bind_data = data_p.bind_data->CastNoConst(); - - do { - if (gstate.CanRemoveFilterColumns()) { - data.all_columns.Reset(); - data.reader->Scan(data.scan_state, data.all_columns); - MultiFileReader::FinalizeChunk(bind_data.reader_bind, data.reader->reader_data, data.all_columns); - output.ReferenceColumns(data.all_columns, gstate.projection_ids); - } else { - data.reader->Scan(data.scan_state, output); - MultiFileReader::FinalizeChunk(bind_data.reader_bind, data.reader->reader_data, output); - } - - bind_data.chunk_count++; - if (output.size() > 0) { - return; - } - if (!ParquetParallelStateNext(context, bind_data, data, gstate)) { - return; - } - } while (true); - } - - static unique_ptr ParquetCardinality(ClientContext &context, const FunctionData *bind_data) { - auto &data = bind_data->Cast(); - return make_uniq(data.initial_file_cardinality * data.files.size()); - } - - static idx_t 
ParquetScanMaxThreads(ClientContext &context, const FunctionData *bind_data) { - auto &data = bind_data->Cast(); - return data.initial_file_row_groups * data.files.size(); - } - - // This function looks for the next available row group. If not available, it will open files from bind_data.files - // until there is a row group available for scanning or the files runs out - static bool ParquetParallelStateNext(ClientContext &context, const ParquetReadBindData &bind_data, - ParquetReadLocalState &scan_data, ParquetReadGlobalState ¶llel_state) { - unique_lock parallel_lock(parallel_state.lock); - - while (true) { - if (parallel_state.error_opening_file) { - return false; - } - - if (parallel_state.file_index >= parallel_state.readers.size()) { - return false; - } - - D_ASSERT(parallel_state.initial_reader); - - if (parallel_state.readers[parallel_state.file_index]) { - if (parallel_state.row_group_index < - parallel_state.readers[parallel_state.file_index]->NumRowGroups()) { - // The current reader has rowgroups left to be scanned - scan_data.reader = parallel_state.readers[parallel_state.file_index]; - vector group_indexes {parallel_state.row_group_index}; - scan_data.reader->InitializeScan(scan_data.scan_state, group_indexes); - scan_data.batch_index = parallel_state.batch_index++; - scan_data.file_index = parallel_state.file_index; - parallel_state.row_group_index++; - return true; - } else { - // Set state to the next file - parallel_state.file_index++; - parallel_state.row_group_index = 0; - - parallel_state.readers[parallel_state.file_index - 1] = nullptr; - - if (parallel_state.file_index >= bind_data.files.size()) { - return false; - } - continue; - } - } - - if (TryOpenNextFile(context, bind_data, scan_data, parallel_state, parallel_lock)) { - continue; - } - - // Check if the current file is being opened, in that case we need to wait for it. - if (!parallel_state.readers[parallel_state.file_index] && - parallel_state.file_opening[parallel_state.file_index]) { - WaitForFile(parallel_state.file_index, parallel_state, parallel_lock); - } - } - } - - static void ParquetComplexFilterPushdown(ClientContext &context, LogicalGet &get, FunctionData *bind_data_p, - vector> &filters) { - auto &data = bind_data_p->Cast(); - auto reset_reader = MultiFileReader::ComplexFilterPushdown(context, data.files, - data.parquet_options.file_options, get, filters); - if (reset_reader) { - MultiFileReader::PruneReaders(data); - } - } - - //! Wait for a file to become available. Parallel lock should be locked when calling. - static void WaitForFile(idx_t file_index, ParquetReadGlobalState ¶llel_state, - unique_lock ¶llel_lock) { - while (true) { - // To get the file lock, we first need to release the parallel_lock to prevent deadlocking - parallel_lock.unlock(); - unique_lock current_file_lock(parallel_state.file_mutexes[file_index]); - parallel_lock.lock(); - - // Here we have both locks which means we can stop waiting if: - // - the thread opening the file is done and the file is available - // - the thread opening the file has failed - // - the file was somehow scanned till the end while we were waiting - if (parallel_state.file_index >= parallel_state.readers.size() || - parallel_state.readers[parallel_state.file_index] || parallel_state.error_opening_file) { - return; - } - } - } - - //! Helper function that try to start opening a next file. Parallel lock should be locked when calling. 
- static bool TryOpenNextFile(ClientContext &context, const ParquetReadBindData &bind_data, - ParquetReadLocalState &scan_data, ParquetReadGlobalState ¶llel_state, - unique_lock ¶llel_lock) { - for (idx_t i = parallel_state.file_index; i < bind_data.files.size(); i++) { - if (!parallel_state.readers[i] && parallel_state.file_opening[i] == false) { - string file = bind_data.files[i]; - parallel_state.file_opening[i] = true; - auto pq_options = parallel_state.initial_reader->parquet_options; - - // Now we switch which lock we are holding, instead of locking the global state, we grab the lock on - // the file we are opening. This file lock allows threads to wait for a file to be opened. - parallel_lock.unlock(); - - unique_lock file_lock(parallel_state.file_mutexes[i]); - - shared_ptr reader; - try { - reader = make_shared(context, file, pq_options); - MultiFileReader::InitializeReader( - *reader, bind_data.parquet_options.file_options, bind_data.reader_bind, bind_data.types, - bind_data.names, parallel_state.column_ids, parallel_state.filters, bind_data.files.front()); - } catch (...) { - parallel_lock.lock(); - parallel_state.error_opening_file = true; - throw; - } - - // Now re-lock the state and add the reader - parallel_lock.lock(); - parallel_state.readers[i] = reader; - - return true; - } - } - - return false; - } -}; - -unique_ptr ParquetWriteBind(ClientContext &context, CopyInfo &info, vector &names, - vector &sql_types) { - auto bind_data = make_uniq(); - for (auto &option : info.options) { - auto loption = StringUtil::Lower(option.first); - if (loption == "row_group_size" || loption == "chunk_size") { - bind_data->row_group_size = option.second[0].GetValue(); - } else if (loption == "compression" || loption == "codec") { - if (!option.second.empty()) { - auto roption = StringUtil::Lower(option.second[0].ToString()); - if (roption == "uncompressed") { - bind_data->codec = duckdb_parquet::format::CompressionCodec::UNCOMPRESSED; - continue; - } else if (roption == "snappy") { - bind_data->codec = duckdb_parquet::format::CompressionCodec::SNAPPY; - continue; - } else if (roption == "gzip") { - bind_data->codec = duckdb_parquet::format::CompressionCodec::GZIP; - continue; - } else if (roption == "zstd") { - bind_data->codec = duckdb_parquet::format::CompressionCodec::ZSTD; - continue; - } - } - throw ParserException("Expected %s argument to be either [uncompressed, snappy, gzip or zstd]", loption); - } else { - throw NotImplementedException("Unrecognized option for PARQUET: %s", option.first.c_str()); - } - } - bind_data->sql_types = sql_types; - bind_data->column_names = names; - return std::move(bind_data); -} - -unique_ptr ParquetWriteInitializeGlobal(ClientContext &context, FunctionData &bind_data, - const string &file_path) { - auto global_state = make_uniq(); - auto &parquet_bind = bind_data.Cast(); - - auto &fs = FileSystem::GetFileSystem(context); - global_state->writer = - make_uniq(fs, file_path, parquet_bind.sql_types, parquet_bind.column_names, parquet_bind.codec); - return std::move(global_state); -} - -void ParquetWriteSink(ExecutionContext &context, FunctionData &bind_data_p, GlobalFunctionData &gstate, - LocalFunctionData &lstate, DataChunk &input) { - auto &bind_data = bind_data_p.Cast(); - auto &global_state = gstate.Cast(); - auto &local_state = lstate.Cast(); - - // append data to the local (buffered) chunk collection - local_state.buffer.Append(input); - if (local_state.buffer.Count() > bind_data.row_group_size) { - // if the chunk collection exceeds a certain size 
we flush it to the parquet file - global_state.writer->Flush(local_state.buffer); - // and reset the buffer - local_state.buffer.Reset(); - } -} - -void ParquetWriteCombine(ExecutionContext &context, FunctionData &bind_data, GlobalFunctionData &gstate, - LocalFunctionData &lstate) { - auto &global_state = gstate.Cast(); - auto &local_state = lstate.Cast(); - // flush any data left in the local state to the file - global_state.writer->Flush(local_state.buffer); -} - -void ParquetWriteFinalize(ClientContext &context, FunctionData &bind_data, GlobalFunctionData &gstate) { - auto &global_state = gstate.Cast(); - // finalize: write any additional metadata to the file here - global_state.writer->Finalize(); -} - -unique_ptr ParquetWriteInitializeLocal(ExecutionContext &context, FunctionData &bind_data_p) { - auto &bind_data = bind_data_p.Cast(); - return make_uniq(context.client, bind_data.sql_types); -} - -// LCOV_EXCL_START -static void ParquetCopySerialize(FieldWriter &writer, const FunctionData &bind_data_p, const CopyFunction &function) { - auto &bind_data = bind_data_p.Cast(); - writer.WriteRegularSerializableList(bind_data.sql_types); - writer.WriteList(bind_data.column_names); - writer.WriteField(bind_data.codec); - writer.WriteField(bind_data.row_group_size); -} - -static unique_ptr ParquetCopyDeserialize(ClientContext &context, FieldReader &reader, - CopyFunction &function) { - unique_ptr data = make_uniq(); - - data->sql_types = reader.ReadRequiredSerializableList(); - data->column_names = reader.ReadRequiredList(); - data->codec = reader.ReadRequired(); - data->row_group_size = reader.ReadRequired(); - - return std::move(data); -} -// LCOV_EXCL_STOP - -//===--------------------------------------------------------------------===// -// Execution Mode -//===--------------------------------------------------------------------===// -CopyFunctionExecutionMode ParquetWriteExecutionMode(bool preserve_insertion_order, bool supports_batch_index) { - if (!preserve_insertion_order) { - return CopyFunctionExecutionMode::PARALLEL_COPY_TO_FILE; - } - if (supports_batch_index) { - return CopyFunctionExecutionMode::BATCH_COPY_TO_FILE; - } - return CopyFunctionExecutionMode::REGULAR_COPY_TO_FILE; -} -//===--------------------------------------------------------------------===// -// Prepare Batch -//===--------------------------------------------------------------------===// -struct ParquetWriteBatchData : public PreparedBatchData { - PreparedRowGroup prepared_row_group; -}; - -unique_ptr ParquetWritePrepareBatch(ClientContext &context, FunctionData &bind_data, - GlobalFunctionData &gstate, - unique_ptr collection) { - auto &global_state = gstate.Cast(); - auto result = make_uniq(); - global_state.writer->PrepareRowGroup(*collection, result->prepared_row_group); - return std::move(result); -} - -//===--------------------------------------------------------------------===// -// Flush Batch -//===--------------------------------------------------------------------===// -void ParquetWriteFlushBatch(ClientContext &context, FunctionData &bind_data, GlobalFunctionData &gstate, - PreparedBatchData &batch_p) { - auto &global_state = gstate.Cast(); - auto &batch = batch_p.Cast(); - global_state.writer->FlushRowGroup(batch.prepared_row_group); -} - -//===--------------------------------------------------------------------===// -// Desired Batch Size -//===--------------------------------------------------------------------===// -idx_t ParquetWriteDesiredBatchSize(ClientContext &context, FunctionData 
&bind_data_p) {
-	auto &bind_data = bind_data_p.Cast<ParquetWriteBindData>();
-	return bind_data.row_group_size;
-}
-
-//===--------------------------------------------------------------------===//
-// Scan Replacement
-//===--------------------------------------------------------------------===//
-unique_ptr<TableRef> ParquetScanReplacement(ClientContext &context, const string &table_name,
-                                            ReplacementScanData *data) {
-	auto lower_name = StringUtil::Lower(table_name);
-	if (!StringUtil::EndsWith(lower_name, ".parquet") && !StringUtil::Contains(lower_name, ".parquet?")) {
-		return nullptr;
-	}
-	auto table_function = make_uniq<TableFunctionRef>();
-	vector<unique_ptr<ParsedExpression>> children;
-	children.push_back(make_uniq<ConstantExpression>(Value(table_name)));
-	table_function->function = make_uniq<FunctionExpression>("parquet_scan", std::move(children));
-
-	if (!FileSystem::HasGlob(table_name)) {
-		table_function->alias = FileSystem::ExtractBaseName(table_name);
-	}
-
-	return std::move(table_function);
-}
-
-void ParquetExtension::Load(DuckDB &db) {
-	auto &db_instance = *db.instance;
-	auto &fs = db.GetFileSystem();
-	fs.RegisterSubSystem(FileCompressionType::ZSTD, make_uniq<ZStdFileSystem>());
-
-	auto scan_fun = ParquetScanFunction::GetFunctionSet();
-	scan_fun.name = "read_parquet";
-	ExtensionUtil::RegisterFunction(db_instance, scan_fun);
-	scan_fun.name = "parquet_scan";
-	ExtensionUtil::RegisterFunction(db_instance, scan_fun);
-
-	// parquet_metadata
-	ParquetMetaDataFunction meta_fun;
-	ExtensionUtil::RegisterFunction(db_instance, MultiFileReader::CreateFunctionSet(meta_fun));
-
-	// parquet_schema
-	ParquetSchemaFunction schema_fun;
-	ExtensionUtil::RegisterFunction(db_instance, MultiFileReader::CreateFunctionSet(schema_fun));
-
-	CopyFunction function("parquet");
-	function.copy_to_bind = ParquetWriteBind;
-	function.copy_to_initialize_global = ParquetWriteInitializeGlobal;
-	function.copy_to_initialize_local = ParquetWriteInitializeLocal;
-	function.copy_to_sink = ParquetWriteSink;
-	function.copy_to_combine = ParquetWriteCombine;
-	function.copy_to_finalize = ParquetWriteFinalize;
-	function.execution_mode = ParquetWriteExecutionMode;
-	function.copy_from_bind = ParquetScanFunction::ParquetReadBind;
-	function.copy_from_function = scan_fun.functions[0];
-	function.prepare_batch = ParquetWritePrepareBatch;
-	function.flush_batch = ParquetWriteFlushBatch;
-	function.desired_batch_size = ParquetWriteDesiredBatchSize;
-	function.serialize = ParquetCopySerialize;
-	function.deserialize = ParquetCopyDeserialize;
-
-	function.extension = "parquet";
-	ExtensionUtil::RegisterFunction(db_instance, function);
-
-	auto &config = DBConfig::GetConfig(*db.instance);
-	config.replacement_scans.emplace_back(ParquetScanReplacement);
-	config.AddExtensionOption("binary_as_string", "In Parquet files, interpret binary data as a string.",
-	                          LogicalType::BOOLEAN);
-}
-
-std::string ParquetExtension::Name() {
-	return "parquet";
-}
-
-} // namespace duckdb
-
-#ifdef DUCKDB_BUILD_LOADABLE_EXTENSION
-extern "C" {
-
-DUCKDB_EXTENSION_API void parquet_init(duckdb::DatabaseInstance &db) { // NOLINT
-	duckdb::DuckDB db_wrapper(db);
-	db_wrapper.LoadExtension<duckdb::ParquetExtension>();
-}
-
-DUCKDB_EXTENSION_API const char *parquet_version() { // NOLINT
-	return duckdb::DuckDB::LibraryVersion();
-}
-}
-#endif
-
-#ifndef DUCKDB_EXTENSION_MAIN
-#error DUCKDB_EXTENSION_MAIN not defined
-#endif
diff --git a/src/duckdb/src/execution/index/art/leaf_segment.cpp b/src/duckdb/src/execution/index/art/leaf_segment.cpp
deleted file mode 100644
index 89d3d7c74..000000000
--- a/src/duckdb/src/execution/index/art/leaf_segment.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-#include "duckdb/execution/index/art/leaf_segment.hpp" - -#include "duckdb/execution/index/art/art.hpp" -#include "duckdb/execution/index/art/node.hpp" - -namespace duckdb { - -LeafSegment &LeafSegment::New(ART &art, Node &node) { - - node.SetPtr(Node::GetAllocator(art, NType::LEAF_SEGMENT).New()); - node.type = (uint8_t)NType::LEAF_SEGMENT; - - auto &segment = LeafSegment::Get(art, node); - segment.next.Reset(); - return segment; -} - -void LeafSegment::Free(ART &art, Node &node) { - - D_ASSERT(node.IsSet()); - D_ASSERT(!node.IsSwizzled()); - - // free next segment - auto next_segment = LeafSegment::Get(art, node).next; - Node::Free(art, next_segment); -} - -LeafSegment &LeafSegment::Append(ART &art, uint32_t &count, const row_t row_id) { - - reference segment(*this); - auto position = count % Node::LEAF_SEGMENT_SIZE; - - // we need a new segment - if (position == 0 && count != 0) { - segment = LeafSegment::New(art, next); - } - - segment.get().row_ids[position] = row_id; - count++; - return segment.get(); -} - -LeafSegment &LeafSegment::GetTail(const ART &art) { - - reference segment(*this); - while (segment.get().next.IsSet()) { - segment = LeafSegment::Get(art, segment.get().next); - } - return segment.get(); -} - -} // namespace duckdb diff --git a/src/duckdb/src/execution/index/art/prefix_segment.cpp b/src/duckdb/src/execution/index/art/prefix_segment.cpp deleted file mode 100644 index 94cf9d625..000000000 --- a/src/duckdb/src/execution/index/art/prefix_segment.cpp +++ /dev/null @@ -1,42 +0,0 @@ -#include "duckdb/execution/index/art/prefix_segment.hpp" - -#include "duckdb/execution/index/art/art.hpp" -#include "duckdb/execution/index/art/node.hpp" - -namespace duckdb { - -PrefixSegment &PrefixSegment::New(ART &art, Node &node) { - - node.SetPtr(Node::GetAllocator(art, NType::PREFIX_SEGMENT).New()); - node.type = (uint8_t)NType::PREFIX_SEGMENT; - - auto &segment = PrefixSegment::Get(art, node); - segment.next.Reset(); - return segment; -} - -PrefixSegment &PrefixSegment::Append(ART &art, uint32_t &count, const uint8_t byte) { - - reference segment(*this); - auto position = count % Node::PREFIX_SEGMENT_SIZE; - - // we need a new segment - if (position == 0 && count != 0) { - segment = PrefixSegment::New(art, next); - } - - segment.get().bytes[position] = byte; - count++; - return segment.get(); -} - -PrefixSegment &PrefixSegment::GetTail(const ART &art) { - - reference segment(*this); - while (segment.get().next.IsSet()) { - segment = PrefixSegment::Get(art, segment.get().next); - } - return segment.get(); -} - -} // namespace duckdb diff --git a/src/duckdb/src/execution/index/art/swizzleable_pointer.cpp b/src/duckdb/src/execution/index/art/swizzleable_pointer.cpp deleted file mode 100644 index 626b5dd76..000000000 --- a/src/duckdb/src/execution/index/art/swizzleable_pointer.cpp +++ /dev/null @@ -1,22 +0,0 @@ -#include "duckdb/execution/index/art/swizzleable_pointer.hpp" - -#include "duckdb/storage/meta_block_reader.hpp" - -namespace duckdb { - -SwizzleablePointer::SwizzleablePointer(MetaBlockReader &reader) { - - idx_t block_id = reader.Read(); - offset = reader.Read(); - type = 0; - - if (block_id == DConstants::INVALID_INDEX) { - swizzle_flag = 0; - return; - } - - buffer_id = (uint32_t)block_id; - swizzle_flag = 1; -} - -} // namespace duckdb diff --git a/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp b/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp deleted file mode 100644 index 073e11df5..000000000 --- 
a/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +++ /dev/null @@ -1,695 +0,0 @@ -#include "duckdb/execution/operator/persistent/base_csv_reader.hpp" -#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp" -#include "duckdb/common/file_system.hpp" -#include "duckdb/common/string_util.hpp" -#include "duckdb/common/to_string.hpp" -#include "duckdb/common/types/cast_helpers.hpp" -#include "duckdb/common/operator/cast_operators.hpp" -#include "duckdb/common/operator/decimal_cast_operators.hpp" -#include "duckdb/common/vector_operations/unary_executor.hpp" -#include "duckdb/common/vector_operations/vector_operations.hpp" -#include "duckdb/function/scalar/strftime_format.hpp" -#include "duckdb/main/appender.hpp" -#include "duckdb/main/database.hpp" -#include "duckdb/parser/column_definition.hpp" -#include "duckdb/storage/data_table.hpp" -#include "utf8proc_wrapper.hpp" -#include "utf8proc.hpp" -#include "duckdb/parser/keyword_helper.hpp" -#include "duckdb/main/error_manager.hpp" -#include "duckdb/execution/operator/persistent/parallel_csv_reader.hpp" -#include "duckdb/execution/operator/persistent/csv_rejects_table.hpp" -#include "duckdb/main/client_data.hpp" -#include -#include -#include -#include - -namespace duckdb { - -string BaseCSVReader::GetLineNumberStr(idx_t line_error, bool is_line_estimated, idx_t buffer_idx) { - // If an error happens during auto-detect it is an estimated line - string estimated = (is_line_estimated ? string(" (estimated)") : string("")); - return to_string(GetLineError(line_error, buffer_idx)) + estimated; -} - -BaseCSVReader::BaseCSVReader(ClientContext &context_p, BufferedCSVReaderOptions options_p, - const vector &requested_types) - : context(context_p), fs(FileSystem::GetFileSystem(context)), allocator(BufferAllocator::Get(context)), - options(std::move(options_p)) { -} - -BaseCSVReader::~BaseCSVReader() { -} - -unique_ptr BaseCSVReader::OpenCSV(const BufferedCSVReaderOptions &options_p) { - return CSVFileHandle::OpenFile(fs, allocator, options_p.file_path, options_p.compression, true); -} - -void BaseCSVReader::InitParseChunk(idx_t num_cols) { - // adapt not null info - if (options.force_not_null.size() != num_cols) { - options.force_not_null.resize(num_cols, false); - } - if (num_cols == parse_chunk.ColumnCount()) { - parse_chunk.Reset(); - } else { - parse_chunk.Destroy(); - - // initialize the parse_chunk with a set of VARCHAR types - vector varchar_types(num_cols, LogicalType::VARCHAR); - parse_chunk.Initialize(allocator, varchar_types); - } -} - -void BaseCSVReader::InitializeProjection() { - for (idx_t i = 0; i < GetTypes().size(); i++) { - reader_data.column_ids.push_back(i); - reader_data.column_mapping.push_back(i); - } -} - -void BaseCSVReader::SetDateFormat(const string &format_specifier, const LogicalTypeId &sql_type) { - options.has_format[sql_type] = true; - auto &date_format = options.date_format[sql_type]; - date_format.format_specifier = format_specifier; - StrTimeFormat::ParseFormatSpecifier(date_format.format_specifier, date_format); -} - -struct TryCastDecimalOperator { - template - static bool Operation(string_t input, uint8_t width, uint8_t scale) { - T result; - string error_message; - return OP::Operation(input, result, &error_message, width, scale); - } -}; - -struct TryCastFloatingOperator { - template - static bool Operation(string_t input) { - T result; - string error_message; - return OP::Operation(input, result, &error_message); - } -}; - -bool TryCastDecimalValueCommaSeparated(const string_t &value_str, 
const LogicalType &sql_type) { - auto width = DecimalType::GetWidth(sql_type); - auto scale = DecimalType::GetScale(sql_type); - switch (sql_type.InternalType()) { - case PhysicalType::INT16: - return TryCastDecimalOperator::Operation(value_str, width, scale); - case PhysicalType::INT32: - return TryCastDecimalOperator::Operation(value_str, width, scale); - case PhysicalType::INT64: - return TryCastDecimalOperator::Operation(value_str, width, scale); - case PhysicalType::INT128: - return TryCastDecimalOperator::Operation(value_str, width, scale); - default: - throw InternalException("Unimplemented physical type for decimal"); - } -} - -bool TryCastFloatingValueCommaSeparated(const string_t &value_str, const LogicalType &sql_type) { - switch (sql_type.InternalType()) { - case PhysicalType::DOUBLE: - return TryCastFloatingOperator::Operation(value_str); - case PhysicalType::FLOAT: - return TryCastFloatingOperator::Operation(value_str); - default: - throw InternalException("Unimplemented physical type for floating"); - } -} - -bool BaseCSVReader::TryCastValue(const Value &value, const LogicalType &sql_type) { - if (value.IsNull()) { - return true; - } - if (options.has_format[LogicalTypeId::DATE] && sql_type.id() == LogicalTypeId::DATE) { - date_t result; - string error_message; - return options.date_format[LogicalTypeId::DATE].TryParseDate(string_t(StringValue::Get(value)), result, - error_message); - } else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type.id() == LogicalTypeId::TIMESTAMP) { - timestamp_t result; - string error_message; - return options.date_format[LogicalTypeId::TIMESTAMP].TryParseTimestamp(string_t(StringValue::Get(value)), - result, error_message); - } else if (options.decimal_separator != "." && sql_type.id() == LogicalTypeId::DECIMAL) { - return TryCastDecimalValueCommaSeparated(string_t(StringValue::Get(value)), sql_type); - } else if (options.decimal_separator != "." 
&& - ((sql_type.id() == LogicalTypeId::FLOAT) || (sql_type.id() == LogicalTypeId::DOUBLE))) { - return TryCastFloatingValueCommaSeparated(string_t(StringValue::Get(value)), sql_type); - } else { - Value new_value; - string error_message; - return value.TryCastAs(context, sql_type, new_value, &error_message, true); - } -} - -struct TryCastDateOperator { - static bool Operation(BufferedCSVReaderOptions &options, string_t input, date_t &result, string &error_message) { - return options.date_format[LogicalTypeId::DATE].TryParseDate(input, result, error_message); - } -}; - -struct TryCastTimestampOperator { - static bool Operation(BufferedCSVReaderOptions &options, string_t input, timestamp_t &result, - string &error_message) { - return options.date_format[LogicalTypeId::TIMESTAMP].TryParseTimestamp(input, result, error_message); - } -}; - -template -static bool TemplatedTryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, - idx_t count, string &error_message, idx_t &line_error) { - D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR); - bool all_converted = true; - idx_t cur_line = 0; - UnaryExecutor::Execute(input_vector, result_vector, count, [&](string_t input) { - T result; - if (!OP::Operation(options, input, result, error_message)) { - line_error = cur_line; - all_converted = false; - } - cur_line++; - return result; - }); - return all_converted; -} - -bool TryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count, - string &error_message, idx_t &line_error) { - return TemplatedTryCastDateVector(options, input_vector, result_vector, count, - error_message, line_error); -} - -bool TryCastTimestampVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count, - string &error_message) { - idx_t line_error; - return TemplatedTryCastDateVector(options, input_vector, result_vector, - count, error_message, line_error); -} - -template -bool TemplatedTryCastFloatingVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, - idx_t count, string &error_message, idx_t &line_error) { - D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR); - bool all_converted = true; - idx_t row = 0; - UnaryExecutor::Execute(input_vector, result_vector, count, [&](string_t input) { - T result; - if (!OP::Operation(input, result, &error_message)) { - line_error = row; - all_converted = false; - } else { - row++; - } - return result; - }); - return all_converted; -} - -template -bool TemplatedTryCastDecimalVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, - idx_t count, string &error_message, uint8_t width, uint8_t scale) { - D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR); - bool all_converted = true; - UnaryExecutor::Execute(input_vector, result_vector, count, [&](string_t input) { - T result; - if (!OP::Operation(input, result, &error_message, width, scale)) { - all_converted = false; - } - return result; - }); - return all_converted; -} - -bool BaseCSVReader::TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type) { - // try vector-cast from string to sql_type - Vector dummy_result(sql_type); - if (options.has_format[LogicalTypeId::DATE] && sql_type == LogicalTypeId::DATE) { - // use the date format to cast the chunk - string error_message; - idx_t line_error; - return TryCastDateVector(options, parse_chunk_col, dummy_result, size, error_message, 
line_error); - } else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type == LogicalTypeId::TIMESTAMP) { - // use the timestamp format to cast the chunk - string error_message; - return TryCastTimestampVector(options, parse_chunk_col, dummy_result, size, error_message); - } else { - // target type is not varchar: perform a cast - string error_message; - return VectorOperations::DefaultTryCast(parse_chunk_col, dummy_result, size, &error_message, true); - } -} - -void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector &escape_positions, bool has_quotes, - idx_t buffer_idx) { - auto length = str_val.GetSize(); - if (length == 0 && column == 0) { - row_empty = true; - } else { - row_empty = false; - } - if (!return_types.empty() && column == return_types.size() && length == 0) { - // skip a single trailing delimiter in last column - return; - } - if (mode == ParserMode::SNIFFING_DIALECT) { - column++; - return; - } - if (column >= return_types.size()) { - if (options.ignore_errors) { - error_column_overflow = true; - return; - } else { - throw InvalidInputException( - "Error in file \"%s\", on line %s: expected %lld values per row, but got more. (%s)", options.file_path, - GetLineNumberStr(linenr, linenr_estimated, buffer_idx).c_str(), return_types.size(), - options.ToString()); - } - } - - // insert the line number into the chunk - idx_t row_entry = parse_chunk.size(); - - // test against null string, but only if the value was not quoted - if ((!(has_quotes && !options.allow_quoted_nulls) || return_types[column].id() != LogicalTypeId::VARCHAR) && - !options.force_not_null[column] && Equals::Operation(str_val, string_t(options.null_str))) { - FlatVector::SetNull(parse_chunk.data[column], row_entry, true); - } else { - auto &v = parse_chunk.data[column]; - auto parse_data = FlatVector::GetData(v); - if (!escape_positions.empty()) { - // remove escape characters (if any) - string old_val = str_val.GetString(); - string new_val = ""; - idx_t prev_pos = 0; - for (idx_t i = 0; i < escape_positions.size(); i++) { - idx_t next_pos = escape_positions[i]; - new_val += old_val.substr(prev_pos, next_pos - prev_pos); - - if (options.escape.empty() || options.escape == options.quote) { - prev_pos = next_pos + options.quote.size(); - } else { - prev_pos = next_pos + options.escape.size(); - } - } - new_val += old_val.substr(prev_pos, old_val.size() - prev_pos); - escape_positions.clear(); - parse_data[row_entry] = StringVector::AddStringOrBlob(v, string_t(new_val)); - } else { - parse_data[row_entry] = str_val; - } - } - - // move to the next column - column++; -} - -bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error_message, idx_t buffer_idx) { - linenr++; - - if (row_empty) { - row_empty = false; - if (return_types.size() != 1) { - if (mode == ParserMode::PARSING) { - FlatVector::SetNull(parse_chunk.data[0], parse_chunk.size(), false); - } - column = 0; - return false; - } - } - - // Error forwarded by 'ignore_errors' - originally encountered in 'AddValue' - if (error_column_overflow) { - D_ASSERT(options.ignore_errors); - error_column_overflow = false; - column = 0; - return false; - } - - if (column < return_types.size() && mode != ParserMode::SNIFFING_DIALECT) { - if (options.null_padding) { - for (; column < return_types.size(); column++) { - FlatVector::SetNull(parse_chunk.data[column], parse_chunk.size(), true); - } - } else if (options.ignore_errors) { - column = 0; - return false; - } else { - if (mode == ParserMode::SNIFFING_DATATYPES) { - 
error_message = "Error when adding line"; - return false; - } else { - throw InvalidInputException( - "Error in file \"%s\" on line %s: expected %lld values per row, but got %d.\nParser options:\n%s", - options.file_path, GetLineNumberStr(linenr, linenr_estimated, buffer_idx).c_str(), - return_types.size(), column, options.ToString()); - } - } - } - - if (mode == ParserMode::SNIFFING_DIALECT) { - sniffed_column_counts.push_back(column); - - if (sniffed_column_counts.size() == options.sample_chunk_size) { - return true; - } - } else { - parse_chunk.SetCardinality(parse_chunk.size() + 1); - } - - if (mode == ParserMode::PARSING_HEADER) { - return true; - } - - if (mode == ParserMode::SNIFFING_DATATYPES && parse_chunk.size() == options.sample_chunk_size) { - return true; - } - - if (mode == ParserMode::PARSING && parse_chunk.size() == STANDARD_VECTOR_SIZE) { - Flush(insert_chunk, buffer_idx); - return true; - } - - column = 0; - return false; -} - -void BaseCSVReader::VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset) { - D_ASSERT(col_idx < chunk.data.size()); - D_ASSERT(row_idx < chunk.size()); - auto &v = chunk.data[col_idx]; - if (FlatVector::IsNull(v, row_idx)) { - return; - } - - auto parse_data = FlatVector::GetData(chunk.data[col_idx]); - auto s = parse_data[row_idx]; - auto utf_type = Utf8Proc::Analyze(s.GetData(), s.GetSize()); - if (utf_type == UnicodeType::INVALID) { - string col_name = to_string(col_idx); - if (col_idx < names.size()) { - col_name = "\"" + names[col_idx] + "\""; - } - int64_t error_line = linenr - (chunk.size() - row_idx) + 1 + offset; - D_ASSERT(error_line >= 0); - throw InvalidInputException("Error in file \"%s\" at line %llu in column \"%s\": " - "%s. Parser options:\n%s", - options.file_path, error_line, col_name, - ErrorManager::InvalidUnicodeError(s.GetString(), "CSV file"), options.ToString()); - } -} - -void BaseCSVReader::VerifyUTF8(idx_t col_idx) { - D_ASSERT(col_idx < parse_chunk.data.size()); - for (idx_t i = 0; i < parse_chunk.size(); i++) { - VerifyUTF8(col_idx, i, parse_chunk); - } -} - -bool TryCastDecimalVectorCommaSeparated(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, - idx_t count, string &error_message, const LogicalType &result_type) { - auto width = DecimalType::GetWidth(result_type); - auto scale = DecimalType::GetScale(result_type); - switch (result_type.InternalType()) { - case PhysicalType::INT16: - return TemplatedTryCastDecimalVector( - options, input_vector, result_vector, count, error_message, width, scale); - case PhysicalType::INT32: - return TemplatedTryCastDecimalVector( - options, input_vector, result_vector, count, error_message, width, scale); - case PhysicalType::INT64: - return TemplatedTryCastDecimalVector( - options, input_vector, result_vector, count, error_message, width, scale); - case PhysicalType::INT128: - return TemplatedTryCastDecimalVector( - options, input_vector, result_vector, count, error_message, width, scale); - default: - throw InternalException("Unimplemented physical type for decimal"); - } -} - -bool TryCastFloatingVectorCommaSeparated(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, - idx_t count, string &error_message, const LogicalType &result_type, - idx_t &line_error) { - switch (result_type.InternalType()) { - case PhysicalType::DOUBLE: - return TemplatedTryCastFloatingVector( - options, input_vector, result_vector, count, error_message, line_error); - case PhysicalType::FLOAT: - return 
TemplatedTryCastFloatingVector( - options, input_vector, result_vector, count, error_message, line_error); - default: - throw InternalException("Unimplemented physical type for floating"); - } -} - -// Location of erroneous value in the current parse chunk -struct ErrorLocation { - idx_t row_idx; - idx_t col_idx; - idx_t row_line; - - ErrorLocation(idx_t row_idx, idx_t col_idx, idx_t row_line) - : row_idx(row_idx), col_idx(col_idx), row_line(row_line) { - } -}; - -bool BaseCSVReader::Flush(DataChunk &insert_chunk, idx_t buffer_idx, bool try_add_line) { - if (parse_chunk.size() == 0) { - return true; - } - - bool conversion_error_ignored = false; - - // convert the columns in the parsed chunk to the types of the table - insert_chunk.SetCardinality(parse_chunk); - if (reader_data.column_ids.empty() && !reader_data.empty_columns) { - throw InternalException("BaseCSVReader::Flush called on a CSV reader that was not correctly initialized. Call " - "MultiFileReader::InitializeReader or InitializeProjection"); - } - D_ASSERT(reader_data.column_ids.size() == reader_data.column_mapping.size()); - for (idx_t c = 0; c < reader_data.column_ids.size(); c++) { - auto col_idx = reader_data.column_ids[c]; - auto result_idx = reader_data.column_mapping[c]; - auto &parse_vector = parse_chunk.data[col_idx]; - auto &result_vector = insert_chunk.data[result_idx]; - auto &type = result_vector.GetType(); - if (type.id() == LogicalTypeId::VARCHAR) { - // target type is varchar: no need to convert - // just test that all strings are valid utf-8 strings - VerifyUTF8(col_idx); - // reinterpret rather than reference so we can deal with user-defined types - result_vector.Reinterpret(parse_vector); - } else { - string error_message; - bool success; - idx_t line_error = 0; - bool target_type_not_varchar = false; - if (options.has_format[LogicalTypeId::DATE] && type.id() == LogicalTypeId::DATE) { - // use the date format to cast the chunk - success = TryCastDateVector(options, parse_vector, result_vector, parse_chunk.size(), error_message, - line_error); - } else if (options.has_format[LogicalTypeId::TIMESTAMP] && type.id() == LogicalTypeId::TIMESTAMP) { - // use the date format to cast the chunk - success = - TryCastTimestampVector(options, parse_vector, result_vector, parse_chunk.size(), error_message); - } else if (options.decimal_separator != "." && - (type.id() == LogicalTypeId::FLOAT || type.id() == LogicalTypeId::DOUBLE)) { - success = TryCastFloatingVectorCommaSeparated(options, parse_vector, result_vector, parse_chunk.size(), - error_message, type, line_error); - } else if (options.decimal_separator != "." 
&& type.id() == LogicalTypeId::DECIMAL) { - success = TryCastDecimalVectorCommaSeparated(options, parse_vector, result_vector, parse_chunk.size(), - error_message, type); - } else { - // target type is not varchar: perform a cast - target_type_not_varchar = true; - success = - VectorOperations::TryCast(context, parse_vector, result_vector, parse_chunk.size(), &error_message); - } - if (success) { - continue; - } - if (try_add_line) { - return false; - } - - string col_name = to_string(col_idx); - if (col_idx < names.size()) { - col_name = "\"" + names[col_idx] + "\""; - } - - // figure out the exact line number - if (target_type_not_varchar) { - UnifiedVectorFormat inserted_column_data; - result_vector.ToUnifiedFormat(parse_chunk.size(), inserted_column_data); - for (; line_error < parse_chunk.size(); line_error++) { - if (!inserted_column_data.validity.RowIsValid(line_error) && - !FlatVector::IsNull(parse_vector, line_error)) { - break; - } - } - } - - // The line_error must be summed with linenr (All lines emmited from this batch) - // But subtracted from the parse_chunk - D_ASSERT(line_error + linenr >= parse_chunk.size()); - line_error += linenr; - line_error -= parse_chunk.size(); - - auto error_line = GetLineError(line_error, buffer_idx); - - if (options.ignore_errors) { - conversion_error_ignored = true; - - } else if (options.auto_detect) { - throw InvalidInputException("%s in column %s, at line %llu.\n\nParser " - "options:\n%s.\n\nConsider either increasing the sample size " - "(SAMPLE_SIZE=X [X rows] or SAMPLE_SIZE=-1 [all rows]), " - "or skipping column conversion (ALL_VARCHAR=1)", - error_message, col_name, error_line, options.ToString()); - } else { - throw InvalidInputException("%s at line %llu in column %s. Parser options:\n%s ", error_message, - error_line, col_name, options.ToString()); - } - } - } - if (conversion_error_ignored) { - D_ASSERT(options.ignore_errors); - - SelectionVector succesful_rows(parse_chunk.size()); - idx_t sel_size = 0; - - // Keep track of failed cells - vector failed_cells; - - for (idx_t row_idx = 0; row_idx < parse_chunk.size(); row_idx++) { - - auto global_row_idx = row_idx + linenr - parse_chunk.size(); - auto row_line = GetLineError(global_row_idx, buffer_idx, false); - - bool row_failed = false; - for (idx_t c = 0; c < reader_data.column_ids.size(); c++) { - auto col_idx = reader_data.column_ids[c]; - auto result_idx = reader_data.column_mapping[c]; - - auto &parse_vector = parse_chunk.data[col_idx]; - auto &result_vector = insert_chunk.data[result_idx]; - - bool was_already_null = FlatVector::IsNull(parse_vector, row_idx); - if (!was_already_null && FlatVector::IsNull(result_vector, row_idx)) { - row_failed = true; - failed_cells.emplace_back(row_idx, col_idx, row_line); - } - } - if (!row_failed) { - succesful_rows.set_index(sel_size++, row_idx); - } - } - - // Now do a second pass to produce the reject table entries - if (!failed_cells.empty() && !options.rejects_table_name.empty()) { - auto limit = options.rejects_limit; - - auto rejects = CSVRejectsTable::GetOrCreate(context, options.rejects_table_name); - lock_guard lock(rejects->write_lock); - - // short circuit if we already have too many rejects - if (limit == 0 || rejects->count < limit) { - auto &table = rejects->GetTable(context); - InternalAppender appender(context, table); - auto file_name = GetFileName(); - - for (auto &cell : failed_cells) { - if (limit != 0 && rejects->count >= limit) { - break; - } - rejects->count++; - - auto row_idx = cell.row_idx; - auto col_idx = 
cell.col_idx; - auto row_line = cell.row_line; - - auto col_name = to_string(col_idx); - if (col_idx < names.size()) { - col_name = "\"" + names[col_idx] + "\""; - } - - auto &parse_vector = parse_chunk.data[col_idx]; - auto parsed_str = FlatVector::GetData(parse_vector)[row_idx]; - auto &type = insert_chunk.data[col_idx].GetType(); - auto row_error_msg = StringUtil::Format("Could not convert string '%s' to '%s'", - parsed_str.GetString(), type.ToString()); - - // Add the row to the rejects table - appender.BeginRow(); - appender.Append(string_t(file_name)); - appender.Append(row_line); - appender.Append(col_idx); - appender.Append(string_t(col_name)); - appender.Append(parsed_str); - - if (!options.rejects_recovery_columns.empty()) { - child_list_t recovery_key; - for (auto &key_idx : options.rejects_recovery_column_ids) { - // Figure out if the recovery key is valid. - // If not, error out for real. - auto &component_vector = parse_chunk.data[key_idx]; - if (FlatVector::IsNull(component_vector, row_idx)) { - throw InvalidInputException("%s at line %llu in column %s. Parser options:\n%s ", - "Could not parse recovery column", row_line, col_name, - options.ToString()); - } - auto component = Value(FlatVector::GetData(component_vector)[row_idx]); - recovery_key.emplace_back(names[key_idx], component); - } - appender.Append(Value::STRUCT(recovery_key)); - } - - appender.Append(string_t(row_error_msg)); - appender.EndRow(); - } - appender.Close(); - } - } - - // Now slice the insert chunk to only include the succesful rows - insert_chunk.Slice(succesful_rows, sel_size); - } - parse_chunk.Reset(); - return true; -} - -void BaseCSVReader::SetNewLineDelimiter(bool carry, bool carry_followed_by_nl) { - if ((mode == ParserMode::SNIFFING_DIALECT && !options.has_newline) || - options.new_line == NewLineIdentifier::NOT_SET) { - if (options.new_line == NewLineIdentifier::MIX) { - return; - } - NewLineIdentifier this_line_identifier; - if (carry) { - if (carry_followed_by_nl) { - this_line_identifier = NewLineIdentifier::CARRY_ON; - } else { - this_line_identifier = NewLineIdentifier::SINGLE; - } - } else { - this_line_identifier = NewLineIdentifier::SINGLE; - } - if (options.new_line == NewLineIdentifier::NOT_SET) { - options.new_line = this_line_identifier; - return; - } - if (options.new_line != this_line_identifier) { - options.new_line = NewLineIdentifier::MIX; - return; - } - options.new_line = this_line_identifier; - } -} -} // namespace duckdb diff --git a/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp b/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp deleted file mode 100644 index d8ac2b972..000000000 --- a/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +++ /dev/null @@ -1,1487 +0,0 @@ -#include "duckdb/execution/operator/persistent/buffered_csv_reader.hpp" - -#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp" -#include "duckdb/common/file_system.hpp" -#include "duckdb/common/string_util.hpp" -#include "duckdb/common/to_string.hpp" -#include "duckdb/common/types/cast_helpers.hpp" -#include "duckdb/common/vector_operations/unary_executor.hpp" -#include "duckdb/common/vector_operations/vector_operations.hpp" -#include "duckdb/function/scalar/strftime_format.hpp" -#include "duckdb/main/database.hpp" -#include "duckdb/parser/column_definition.hpp" -#include "duckdb/storage/data_table.hpp" -#include "utf8proc_wrapper.hpp" -#include "utf8proc.hpp" -#include "duckdb/parser/keyword_helper.hpp" -#include 
"duckdb/main/error_manager.hpp" -#include "duckdb/main/client_data.hpp" - -#include -#include -#include -#include - -namespace duckdb { - -BufferedCSVReader::BufferedCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p, - const vector &requested_types) - : BaseCSVReader(context, std::move(options_p), requested_types), buffer_size(0), position(0), start(0) { - file_handle = OpenCSV(options); - Initialize(requested_types); -} - -BufferedCSVReader::BufferedCSVReader(ClientContext &context, string filename, BufferedCSVReaderOptions options_p, - const vector &requested_types) - : BaseCSVReader(context, std::move(options_p), requested_types), buffer_size(0), position(0), start(0) { - options.file_path = std::move(filename); - file_handle = OpenCSV(options); - Initialize(requested_types); -} - -enum class QuoteRule : uint8_t { QUOTES_RFC = 0, QUOTES_OTHER = 1, NO_QUOTES = 2 }; - -static bool StartsWithNumericDate(string &separator, const string &value) { - auto begin = value.c_str(); - auto end = begin + value.size(); - - // StrpTimeFormat::Parse will skip whitespace, so we can too - auto field1 = std::find_if_not(begin, end, StringUtil::CharacterIsSpace); - if (field1 == end) { - return false; - } - - // first numeric field must start immediately - if (!StringUtil::CharacterIsDigit(*field1)) { - return false; - } - auto literal1 = std::find_if_not(field1, end, StringUtil::CharacterIsDigit); - if (literal1 == end) { - return false; - } - - // second numeric field must exist - auto field2 = std::find_if(literal1, end, StringUtil::CharacterIsDigit); - if (field2 == end) { - return false; - } - auto literal2 = std::find_if_not(field2, end, StringUtil::CharacterIsDigit); - if (literal2 == end) { - return false; - } - - // third numeric field must exist - auto field3 = std::find_if(literal2, end, StringUtil::CharacterIsDigit); - if (field3 == end) { - return false; - } - - // second literal must match first - if (((field3 - literal2) != (field2 - literal1)) || strncmp(literal1, literal2, (field2 - literal1)) != 0) { - return false; - } - - // copy the literal as the separator, escaping percent signs - separator.clear(); - while (literal1 < field2) { - const auto literal_char = *literal1++; - if (literal_char == '%') { - separator.push_back(literal_char); - } - separator.push_back(literal_char); - } - - return true; -} - -string GenerateDateFormat(const string &separator, const char *format_template) { - string format_specifier = format_template; - auto amount_of_dashes = std::count(format_specifier.begin(), format_specifier.end(), '-'); - if (!amount_of_dashes) { - return format_specifier; - } - string result; - result.reserve(format_specifier.size() - amount_of_dashes + (amount_of_dashes * separator.size())); - for (auto &character : format_specifier) { - if (character == '-') { - result += separator; - } else { - result += character; - } - } - return result; -} - -TextSearchShiftArray::TextSearchShiftArray() { -} - -TextSearchShiftArray::TextSearchShiftArray(string search_term) : length(search_term.size()) { - if (length > 255) { - throw InvalidInputException("Size of delimiter/quote/escape in CSV reader is limited to 255 bytes"); - } - // initialize the shifts array - shifts = unique_ptr(new uint8_t[length * 255]); - memset(shifts.get(), 0, length * 255 * sizeof(uint8_t)); - // iterate over each of the characters in the array - for (idx_t main_idx = 0; main_idx < length; main_idx++) { - uint8_t current_char = (uint8_t)search_term[main_idx]; - // now move over all the remaining 
positions - for (idx_t i = main_idx; i < length; i++) { - bool is_match = true; - // check if the prefix matches at this position - // if it does, we move to this position after encountering the current character - for (idx_t j = 0; j < main_idx; j++) { - if (search_term[i - main_idx + j] != search_term[j]) { - is_match = false; - } - } - if (!is_match) { - continue; - } - shifts[i * 255 + current_char] = main_idx + 1; - } - } -} - -// Helper function to generate column names -static string GenerateColumnName(const idx_t total_cols, const idx_t col_number, const string &prefix = "column") { - int max_digits = NumericHelper::UnsignedLength(total_cols - 1); - int digits = NumericHelper::UnsignedLength(col_number); - string leading_zeros = string(max_digits - digits, '0'); - string value = to_string(col_number); - return string(prefix + leading_zeros + value); -} - -// Helper function for UTF-8 aware space trimming -static string TrimWhitespace(const string &col_name) { - utf8proc_int32_t codepoint; - auto str = reinterpret_cast(col_name.c_str()); - idx_t size = col_name.size(); - // Find the first character that is not left trimmed - idx_t begin = 0; - while (begin < size) { - auto bytes = utf8proc_iterate(str + begin, size - begin, &codepoint); - D_ASSERT(bytes > 0); - if (utf8proc_category(codepoint) != UTF8PROC_CATEGORY_ZS) { - break; - } - begin += bytes; - } - - // Find the last character that is not right trimmed - idx_t end; - end = begin; - for (auto next = begin; next < col_name.size();) { - auto bytes = utf8proc_iterate(str + next, size - next, &codepoint); - D_ASSERT(bytes > 0); - next += bytes; - if (utf8proc_category(codepoint) != UTF8PROC_CATEGORY_ZS) { - end = next; - } - } - - // return the trimmed string - return col_name.substr(begin, end - begin); -} - -static string NormalizeColumnName(const string &col_name) { - // normalize UTF8 characters to NFKD - auto nfkd = utf8proc_NFKD(reinterpret_cast(col_name.c_str()), col_name.size()); - const string col_name_nfkd = string(const_char_ptr_cast(nfkd), strlen(const_char_ptr_cast(nfkd))); - free(nfkd); - - // only keep ASCII characters 0-9 a-z A-Z and replace spaces with regular whitespace - string col_name_ascii = ""; - for (idx_t i = 0; i < col_name_nfkd.size(); i++) { - if (col_name_nfkd[i] == '_' || (col_name_nfkd[i] >= '0' && col_name_nfkd[i] <= '9') || - (col_name_nfkd[i] >= 'A' && col_name_nfkd[i] <= 'Z') || - (col_name_nfkd[i] >= 'a' && col_name_nfkd[i] <= 'z')) { - col_name_ascii += col_name_nfkd[i]; - } else if (StringUtil::CharacterIsSpace(col_name_nfkd[i])) { - col_name_ascii += " "; - } - } - - // trim whitespace and replace remaining whitespace by _ - string col_name_trimmed = TrimWhitespace(col_name_ascii); - string col_name_cleaned = ""; - bool in_whitespace = false; - for (idx_t i = 0; i < col_name_trimmed.size(); i++) { - if (col_name_trimmed[i] == ' ') { - if (!in_whitespace) { - col_name_cleaned += "_"; - in_whitespace = true; - } - } else { - col_name_cleaned += col_name_trimmed[i]; - in_whitespace = false; - } - } - - // don't leave string empty; if not empty, make lowercase - if (col_name_cleaned.empty()) { - col_name_cleaned = "_"; - } else { - col_name_cleaned = StringUtil::Lower(col_name_cleaned); - } - - // prepend _ if name starts with a digit or is a reserved keyword - if (KeywordHelper::IsKeyword(col_name_cleaned) || (col_name_cleaned[0] >= '0' && col_name_cleaned[0] <= '9')) { - col_name_cleaned = "_" + col_name_cleaned; - } - return col_name_cleaned; -} - -void BufferedCSVReader::Initialize(const 
vector &requested_types) { - PrepareComplexParser(); - if (options.auto_detect) { - return_types = SniffCSV(requested_types); - if (return_types.empty()) { - throw InvalidInputException("Failed to detect column types from CSV: is the file a valid CSV file?"); - } - JumpToBeginning(options.skip_rows, options.header); - } else { - return_types = requested_types; - ResetBuffer(); - SkipRowsAndReadHeader(options.skip_rows, options.header); - } - InitParseChunk(return_types.size()); -} - -void BufferedCSVReader::ResetBuffer() { - buffer.reset(); - buffer_size = 0; - position = 0; - start = 0; - cached_buffers.clear(); -} - -void BufferedCSVReader::ResetStream() { - file_handle->Reset(); - linenr = 0; - linenr_estimated = false; - bytes_per_line_avg = 0; - sample_chunk_idx = 0; - jumping_samples = false; -} - -void BufferedCSVReader::JumpToBeginning(idx_t skip_rows = 0, bool skip_header = false) { - ResetBuffer(); - ResetStream(); - sample_chunk_idx = 0; - bytes_in_chunk = 0; - end_of_file_reached = false; - bom_checked = false; - SkipRowsAndReadHeader(skip_rows, skip_header); -} - -void BufferedCSVReader::SkipRowsAndReadHeader(idx_t skip_rows, bool skip_header) { - for (idx_t i = 0; i < skip_rows; i++) { - // ignore skip rows - string read_line = file_handle->ReadLine(); - linenr++; - } - - if (skip_header) { - // ignore the first line as a header line - InitParseChunk(return_types.size()); - ParseCSV(ParserMode::PARSING_HEADER); - } -} - -void BufferedCSVReader::PrepareComplexParser() { - delimiter_search = TextSearchShiftArray(options.delimiter); - escape_search = TextSearchShiftArray(options.escape); - quote_search = TextSearchShiftArray(options.quote); -} - -bool BufferedCSVReader::JumpToNextSample() { - // get bytes contained in the previously read chunk - idx_t remaining_bytes_in_buffer = buffer_size - start; - bytes_in_chunk -= remaining_bytes_in_buffer; - if (remaining_bytes_in_buffer == 0) { - return false; - } - - // assess if it makes sense to jump, based on size of the first chunk relative to size of the entire file - if (sample_chunk_idx == 0) { - idx_t bytes_first_chunk = bytes_in_chunk; - double chunks_fit = (file_handle->FileSize() / (double)bytes_first_chunk); - jumping_samples = chunks_fit >= options.sample_chunks; - - // jump back to the beginning - JumpToBeginning(options.skip_rows, options.header); - sample_chunk_idx++; - return true; - } - - if (end_of_file_reached || sample_chunk_idx >= options.sample_chunks) { - return false; - } - - // if we deal with any other sources than plaintext files, jumping_samples can be tricky. In that case - // we just read x continuous chunks from the stream TODO: make jumps possible for zipfiles. 
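// Illustrative sketch (not part of the original reader): the jump below amounts to
// splitting the file into `sample_chunks` equally sized partitions and seeking to the
// start of the next one. The helper and its parameters are hypothetical; they mirror
// the members used by JumpToNextSample (file size, bytes consumed by the current
// chunk, bytes still sitting in the buffer, current seek position).
static idx_t NextSamplePosition(idx_t file_size, idx_t sample_chunks, idx_t bytes_in_chunk,
                                idx_t remaining_bytes_in_buffer, idx_t current_pos) {
	// approximate size of one partition (the original uses round() rather than integer division)
	idx_t partition_size = file_size / sample_chunks;
	// seek forward to the end of the current partition, discounting what was already read
	int64_t offset = int64_t(partition_size) - int64_t(bytes_in_chunk) - int64_t(remaining_bytes_in_buffer);
	if (int64_t(current_pos) + offset < int64_t(file_size)) {
		return idx_t(int64_t(current_pos) + offset);
	}
	// past the end of the file: fall back to sampling the tail instead
	return file_size - bytes_in_chunk;
}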
- if (!file_handle->OnDiskFile() || !jumping_samples) { - sample_chunk_idx++; - return true; - } - - // update average bytes per line - double bytes_per_line = bytes_in_chunk / (double)options.sample_chunk_size; - bytes_per_line_avg = ((bytes_per_line_avg * (sample_chunk_idx)) + bytes_per_line) / (sample_chunk_idx + 1); - - // if none of the previous conditions were met, we can jump - idx_t partition_size = (idx_t)round(file_handle->FileSize() / (double)options.sample_chunks); - - // calculate offset to end of the current partition - int64_t offset = partition_size - bytes_in_chunk - remaining_bytes_in_buffer; - auto current_pos = file_handle->SeekPosition(); - - if (current_pos + offset < file_handle->FileSize()) { - // set position in stream and clear failure bits - file_handle->Seek(current_pos + offset); - - // estimate linenr - linenr += (idx_t)round((offset + remaining_bytes_in_buffer) / bytes_per_line_avg); - linenr_estimated = true; - } else { - // seek backwards from the end in last chunk and hope to catch the end of the file - // TODO: actually it would be good to make sure that the end of file is being reached, because - // messy end-lines are quite common. For this case, however, we first need a skip_end detection anyways. - file_handle->Seek(file_handle->FileSize() - bytes_in_chunk); - - // estimate linenr - linenr = (idx_t)round((file_handle->FileSize() - bytes_in_chunk) / bytes_per_line_avg); - linenr_estimated = true; - } - - // reset buffers and parse chunk - ResetBuffer(); - - // seek beginning of next line - // FIXME: if this jump ends up in a quoted linebreak, we will have a problem - string read_line = file_handle->ReadLine(); - linenr++; - - sample_chunk_idx++; - - return true; -} - -void BufferedCSVReader::DetectDialect(const vector &requested_types, - BufferedCSVReaderOptions &original_options, - vector &info_candidates, idx_t &best_num_cols) { - // set up the candidates we consider for delimiter and quote rules based on user input - vector delim_candidates; - vector quoterule_candidates; - vector> quote_candidates_map; - vector> escape_candidates_map = {{""}, {"\\"}, {""}}; - - if (options.has_delimiter) { - // user provided a delimiter: use that delimiter - delim_candidates = {options.delimiter}; - } else { - // no delimiter provided: try standard/common delimiters - delim_candidates = {",", "|", ";", "\t"}; - } - if (options.has_quote) { - // user provided quote: use that quote rule - quote_candidates_map = {{options.quote}, {options.quote}, {options.quote}}; - } else { - // no quote rule provided: use standard/common quotes - quote_candidates_map = {{"\""}, {"\"", "'"}, {""}}; - } - if (options.has_escape) { - // user provided escape: use that escape rule - if (options.escape.empty()) { - quoterule_candidates = {QuoteRule::QUOTES_RFC}; - } else { - quoterule_candidates = {QuoteRule::QUOTES_OTHER}; - } - escape_candidates_map[static_cast(quoterule_candidates[0])] = {options.escape}; - } else { - // no escape provided: try standard/common escapes - quoterule_candidates = {QuoteRule::QUOTES_RFC, QuoteRule::QUOTES_OTHER, QuoteRule::NO_QUOTES}; - } - - idx_t best_consistent_rows = 0; - idx_t prev_padding_count = 0; - for (auto quoterule : quoterule_candidates) { - const auto "e_candidates = quote_candidates_map[static_cast(quoterule)]; - for (const auto "e : quote_candidates) { - for (const auto &delim : delim_candidates) { - const auto &escape_candidates = escape_candidates_map[static_cast(quoterule)]; - for (const auto &escape : escape_candidates) { - 
BufferedCSVReaderOptions sniff_info = original_options; - sniff_info.delimiter = delim; - sniff_info.quote = quote; - sniff_info.escape = escape; - - options = sniff_info; - PrepareComplexParser(); - - JumpToBeginning(original_options.skip_rows); - sniffed_column_counts.clear(); - if (!TryParseCSV(ParserMode::SNIFFING_DIALECT)) { - continue; - } - - idx_t start_row = original_options.skip_rows; - idx_t consistent_rows = 0; - idx_t num_cols = sniffed_column_counts.empty() ? 0 : sniffed_column_counts[0]; - idx_t padding_count = 0; - bool allow_padding = original_options.null_padding; - for (idx_t row = 0; row < sniffed_column_counts.size(); row++) { - if (sniffed_column_counts[row] == num_cols) { - consistent_rows++; - } else if (num_cols < sniffed_column_counts[row] && !original_options.skip_rows_set) { - // we use the maximum amount of num_cols that we find - num_cols = sniffed_column_counts[row]; - start_row = row + original_options.skip_rows; - consistent_rows = 1; - padding_count = 0; - } else if (num_cols >= sniffed_column_counts[row] && allow_padding) { - // we are missing some columns, we can parse this as long as we add padding - padding_count++; - } - } - - // some logic - consistent_rows += padding_count; - bool more_values = (consistent_rows > best_consistent_rows && num_cols >= best_num_cols); - bool require_more_padding = padding_count > prev_padding_count; - bool require_less_padding = padding_count < prev_padding_count; - bool single_column_before = best_num_cols < 2 && num_cols > best_num_cols; - bool rows_consistent = - start_row + consistent_rows - original_options.skip_rows == sniffed_column_counts.size(); - bool more_than_one_row = (consistent_rows > 1); - bool more_than_one_column = (num_cols > 1); - bool start_good = !info_candidates.empty() && (start_row <= info_candidates.front().skip_rows); - - if (!requested_types.empty() && requested_types.size() != num_cols) { - continue; - } else if (rows_consistent && (single_column_before || (more_values && !require_more_padding) || - (more_than_one_column && require_less_padding))) { - sniff_info.skip_rows = start_row; - sniff_info.num_cols = num_cols; - sniff_info.new_line = options.new_line; - best_consistent_rows = consistent_rows; - best_num_cols = num_cols; - prev_padding_count = padding_count; - - info_candidates.clear(); - info_candidates.push_back(sniff_info); - } else if (more_than_one_row && more_than_one_column && start_good && rows_consistent && - !require_more_padding) { - bool same_quote_is_candidate = false; - for (auto &info_candidate : info_candidates) { - if (quote.compare(info_candidate.quote) == 0) { - same_quote_is_candidate = true; - } - } - if (!same_quote_is_candidate) { - sniff_info.skip_rows = start_row; - sniff_info.num_cols = num_cols; - sniff_info.new_line = options.new_line; - info_candidates.push_back(sniff_info); - } - } - } - } - } - } -} - -void BufferedCSVReader::DetectCandidateTypes(const vector &type_candidates, - const map> &format_template_candidates, - const vector &info_candidates, - BufferedCSVReaderOptions &original_options, idx_t best_num_cols, - vector> &best_sql_types_candidates, - std::map> &best_format_candidates, - DataChunk &best_header_row) { - BufferedCSVReaderOptions best_options; - idx_t min_varchar_cols = best_num_cols + 1; - - // check which info candidate leads to minimum amount of non-varchar columns... 
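// Illustrative sketch (hypothetical helper, not in the original source; assumes <functional>
// for std::function): during type detection each column carries a stack of candidate types
// ordered from most general (VARCHAR, at the front) to most specific (at the back). For every
// sampled value the most specific remaining candidate is tried first and popped on failure,
// which is the narrowing loop used below; `try_cast` stands in for TryCastValue.
static void NarrowTypeCandidates(vector<LogicalType> &col_type_candidates, const Value &sampled_value,
                                 const std::function<bool(const Value &, const LogicalType &)> &try_cast) {
	while (col_type_candidates.size() > 1) {
		const auto &sql_type = col_type_candidates.back();
		if (try_cast(sampled_value, sql_type)) {
			break; // value fits the most specific remaining type - keep it
		}
		col_type_candidates.pop_back(); // fall back to the next, more general candidate
	}
}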
- for (const auto &t : format_template_candidates) { - best_format_candidates[t.first].clear(); - } - for (auto &info_candidate : info_candidates) { - options = info_candidate; - vector> info_sql_types_candidates(options.num_cols, type_candidates); - std::map has_format_candidates; - std::map> format_candidates; - for (const auto &t : format_template_candidates) { - has_format_candidates[t.first] = false; - format_candidates[t.first].clear(); - } - - // set all return_types to VARCHAR so we can do datatype detection based on VARCHAR values - return_types.clear(); - return_types.assign(options.num_cols, LogicalType::VARCHAR); - - // jump to beginning and skip potential header - JumpToBeginning(options.skip_rows, true); - DataChunk header_row; - header_row.Initialize(allocator, return_types); - parse_chunk.Copy(header_row); - - if (header_row.size() == 0) { - continue; - } - - // init parse chunk and read csv with info candidate - InitParseChunk(return_types.size()); - if (!TryParseCSV(ParserMode::SNIFFING_DATATYPES)) { - continue; - } - for (idx_t row_idx = 0; row_idx <= parse_chunk.size(); row_idx++) { - bool is_header_row = row_idx == 0; - idx_t row = row_idx - 1; - for (idx_t col = 0; col < parse_chunk.ColumnCount(); col++) { - auto &col_type_candidates = info_sql_types_candidates[col]; - while (col_type_candidates.size() > 1) { - const auto &sql_type = col_type_candidates.back(); - // try cast from string to sql_type - Value dummy_val; - if (is_header_row) { - VerifyUTF8(col, 0, header_row, -int64_t(parse_chunk.size())); - dummy_val = header_row.GetValue(col, 0); - } else { - VerifyUTF8(col, row, parse_chunk); - dummy_val = parse_chunk.GetValue(col, row); - } - // try formatting for date types if the user did not specify one and it starts with numeric values. - string separator; - if (has_format_candidates.count(sql_type.id()) && !original_options.has_format[sql_type.id()] && - !dummy_val.IsNull() && StartsWithNumericDate(separator, StringValue::Get(dummy_val))) { - // generate date format candidates the first time through - auto &type_format_candidates = format_candidates[sql_type.id()]; - const auto had_format_candidates = has_format_candidates[sql_type.id()]; - if (!has_format_candidates[sql_type.id()]) { - has_format_candidates[sql_type.id()] = true; - // order by preference - auto entry = format_template_candidates.find(sql_type.id()); - if (entry != format_template_candidates.end()) { - const auto &format_template_list = entry->second; - for (const auto &t : format_template_list) { - const auto format_string = GenerateDateFormat(separator, t); - // don't parse ISO 8601 - if (format_string.find("%Y-%m-%d") == string::npos) { - type_format_candidates.emplace_back(format_string); - } - } - } - // initialise the first candidate - options.has_format[sql_type.id()] = true; - // all formats are constructed to be valid - SetDateFormat(type_format_candidates.back(), sql_type.id()); - } - // check all formats and keep the first one that works - StrpTimeFormat::ParseResult result; - auto save_format_candidates = type_format_candidates; - while (!type_format_candidates.empty()) { - // avoid using exceptions for flow control... 
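// Illustrative sketch (hypothetical helper, assumes <functional>): the loop below walks the
// date/timestamp format candidates from the back (most preferred) and keeps the first one
// that parses the sampled string; failed candidates are popped so they are never retried.
// The `parses` callback stands in for StrpTimeFormat::Parse, and returning false instead of
// throwing is what makes this cheap enough to use for flow control.
static bool KeepFirstWorkingFormat(vector<string> &type_format_candidates, const string &sample,
                                   const std::function<bool(const string &format, const string &value)> &parses) {
	while (!type_format_candidates.empty()) {
		if (parses(type_format_candidates.back(), sample)) {
			return true; // this format stays active for subsequent rows
		}
		type_format_candidates.pop_back(); // discard and try the next candidate
	}
	return false; // no candidate matched: the value is not of this temporal type
}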
- auto ¤t_format = options.date_format[sql_type.id()]; - if (current_format.Parse(StringValue::Get(dummy_val), result)) { - break; - } - // doesn't work - move to the next one - type_format_candidates.pop_back(); - options.has_format[sql_type.id()] = (!type_format_candidates.empty()); - if (!type_format_candidates.empty()) { - SetDateFormat(type_format_candidates.back(), sql_type.id()); - } - } - // if none match, then this is not a value of type sql_type, - if (type_format_candidates.empty()) { - // so restore the candidates that did work. - // or throw them out if they were generated by this value. - if (had_format_candidates) { - type_format_candidates.swap(save_format_candidates); - if (!type_format_candidates.empty()) { - SetDateFormat(type_format_candidates.back(), sql_type.id()); - } - } else { - has_format_candidates[sql_type.id()] = false; - } - } - } - // try cast from string to sql_type - if (TryCastValue(dummy_val, sql_type)) { - break; - } else { - col_type_candidates.pop_back(); - } - } - } - // reset type detection, because first row could be header, - // but only do it if csv has more than one line (including header) - if (parse_chunk.size() > 0 && is_header_row) { - info_sql_types_candidates = vector>(options.num_cols, type_candidates); - for (auto &f : format_candidates) { - f.second.clear(); - } - for (auto &h : has_format_candidates) { - h.second = false; - } - } - } - - idx_t varchar_cols = 0; - for (idx_t col = 0; col < parse_chunk.ColumnCount(); col++) { - auto &col_type_candidates = info_sql_types_candidates[col]; - // check number of varchar columns - const auto &col_type = col_type_candidates.back(); - if (col_type == LogicalType::VARCHAR) { - varchar_cols++; - } - } - - // it's good if the dialect creates more non-varchar columns, but only if we sacrifice < 30% of best_num_cols. 
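// Illustrative sketch (hypothetical predicate): a dialect/type candidate only replaces the
// current best one when it yields fewer VARCHAR columns while still keeping more than 70%
// of the best column count, so a candidate cannot "win" by collapsing most of the row into
// a single untyped column. This mirrors the condition evaluated directly below.
static bool IsBetterTypedCandidate(idx_t varchar_cols, idx_t min_varchar_cols,
                                   idx_t candidate_num_cols, idx_t best_num_cols) {
	return varchar_cols < min_varchar_cols && candidate_num_cols > best_num_cols * 0.7;
}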
- if (varchar_cols < min_varchar_cols && parse_chunk.ColumnCount() > (best_num_cols * 0.7)) { - // we have a new best_options candidate - best_options = info_candidate; - min_varchar_cols = varchar_cols; - best_sql_types_candidates = info_sql_types_candidates; - best_format_candidates = format_candidates; - best_header_row.Destroy(); - auto header_row_types = header_row.GetTypes(); - best_header_row.Initialize(allocator, header_row_types); - header_row.Copy(best_header_row); - } - } - - options = best_options; - for (const auto &best : best_format_candidates) { - if (!best.second.empty()) { - SetDateFormat(best.second.back(), best.first); - } - } -} - -void BufferedCSVReader::DetectHeader(const vector> &best_sql_types_candidates, - const DataChunk &best_header_row) { - // information for header detection - bool first_row_consistent = true; - bool first_row_nulls = false; - - // check if header row is all null and/or consistent with detected column data types - first_row_nulls = true; - for (idx_t col = 0; col < best_sql_types_candidates.size(); col++) { - auto dummy_val = best_header_row.GetValue(col, 0); - if (!dummy_val.IsNull()) { - first_row_nulls = false; - } - - // try cast to sql_type of column - const auto &sql_type = best_sql_types_candidates[col].back(); - if (!TryCastValue(dummy_val, sql_type)) { - first_row_consistent = false; - } - } - - // update parser info, and read, generate & set col_names based on previous findings - if (((!first_row_consistent || first_row_nulls) && !options.has_header) || (options.has_header && options.header)) { - options.header = true; - case_insensitive_map_t name_collision_count; - // get header names from CSV - for (idx_t col = 0; col < options.num_cols; col++) { - const auto &val = best_header_row.GetValue(col, 0); - string col_name = val.ToString(); - - // generate name if field is empty - if (col_name.empty() || val.IsNull()) { - col_name = GenerateColumnName(options.num_cols, col); - } - - // normalize names or at least trim whitespace - if (options.normalize_names) { - col_name = NormalizeColumnName(col_name); - } else { - col_name = TrimWhitespace(col_name); - } - - // avoid duplicate header names - const string col_name_raw = col_name; - while (name_collision_count.find(col_name) != name_collision_count.end()) { - name_collision_count[col_name] += 1; - col_name = col_name + "_" + to_string(name_collision_count[col_name]); - } - - names.push_back(col_name); - name_collision_count[col_name] = 0; - } - - } else { - options.header = false; - for (idx_t col = 0; col < options.num_cols; col++) { - string column_name = GenerateColumnName(options.num_cols, col); - names.push_back(column_name); - } - } - for (idx_t i = 0; i < MinValue(names.size(), options.name_list.size()); i++) { - names[i] = options.name_list[i]; - } -} - -vector BufferedCSVReader::RefineTypeDetection(const vector &type_candidates, - const vector &requested_types, - vector> &best_sql_types_candidates, - map> &best_format_candidates) { - // for the type refine we set the SQL types to VARCHAR for all columns - return_types.clear(); - return_types.assign(options.num_cols, LogicalType::VARCHAR); - - vector detected_types; - - // if data types were provided, exit here if number of columns does not match - if (!requested_types.empty()) { - if (requested_types.size() != options.num_cols) { - throw InvalidInputException( - "Error while determining column types: found %lld columns but expected %d. 
(%s)", options.num_cols, - requested_types.size(), options.ToString()); - } else { - detected_types = requested_types; - } - } else if (options.all_varchar) { - // return all types varchar - detected_types = return_types; - } else { - // jump through the rest of the file and continue to refine the sql type guess - while (JumpToNextSample()) { - InitParseChunk(return_types.size()); - // if jump ends up a bad line, we just skip this chunk - if (!TryParseCSV(ParserMode::SNIFFING_DATATYPES)) { - continue; - } - for (idx_t col = 0; col < parse_chunk.ColumnCount(); col++) { - vector &col_type_candidates = best_sql_types_candidates[col]; - while (col_type_candidates.size() > 1) { - const auto &sql_type = col_type_candidates.back(); - // narrow down the date formats - if (best_format_candidates.count(sql_type.id())) { - auto &best_type_format_candidates = best_format_candidates[sql_type.id()]; - auto save_format_candidates = best_type_format_candidates; - while (!best_type_format_candidates.empty()) { - if (TryCastVector(parse_chunk.data[col], parse_chunk.size(), sql_type)) { - break; - } - // doesn't work - move to the next one - best_type_format_candidates.pop_back(); - options.has_format[sql_type.id()] = (!best_type_format_candidates.empty()); - if (!best_type_format_candidates.empty()) { - SetDateFormat(best_type_format_candidates.back(), sql_type.id()); - } - } - // if none match, then this is not a column of type sql_type, - if (best_type_format_candidates.empty()) { - // so restore the candidates that did work. - best_type_format_candidates.swap(save_format_candidates); - if (!best_type_format_candidates.empty()) { - SetDateFormat(best_type_format_candidates.back(), sql_type.id()); - } - } - } - - if (TryCastVector(parse_chunk.data[col], parse_chunk.size(), sql_type)) { - break; - } else { - col_type_candidates.pop_back(); - } - } - } - } - - // set sql types - for (auto &best_sql_types_candidate : best_sql_types_candidates) { - LogicalType d_type = best_sql_types_candidate.back(); - if (best_sql_types_candidate.size() == type_candidates.size()) { - d_type = LogicalType::VARCHAR; - } - detected_types.push_back(d_type); - } - } - - return detected_types; -} - -string BufferedCSVReader::ColumnTypesError(case_insensitive_map_t sql_types_per_column, - const vector &names) { - for (idx_t i = 0; i < names.size(); i++) { - auto it = sql_types_per_column.find(names[i]); - if (it != sql_types_per_column.end()) { - sql_types_per_column.erase(names[i]); - continue; - } - } - if (sql_types_per_column.empty()) { - return string(); - } - string exception = "COLUMN_TYPES error: Columns with names: "; - for (auto &col : sql_types_per_column) { - exception += "\"" + col.first + "\","; - } - exception.pop_back(); - exception += " do not exist in the CSV File"; - return exception; -} - -vector BufferedCSVReader::SniffCSV(const vector &requested_types) { - for (auto &type : requested_types) { - // auto detect for blobs not supported: there may be invalid UTF-8 in the file - if (type.id() == LogicalTypeId::BLOB) { - return requested_types; - } - } - - // ####### - // ### dialect detection - // ####### - BufferedCSVReaderOptions original_options = options; - vector info_candidates; - idx_t best_num_cols = 0; - - DetectDialect(requested_types, original_options, info_candidates, best_num_cols); - - // if no dialect candidate was found, then file was most likely empty and we throw an exception - if (info_candidates.empty()) { - throw InvalidInputException( - "Error in file \"%s\": CSV options could not be 
auto-detected. Consider setting parser options manually.", - options.file_path); - } - - // ####### - // ### type detection (initial) - // ####### - - // format template candidates, ordered by descending specificity (~ from high to low) - std::map> format_template_candidates = { - {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}}, - {LogicalTypeId::TIMESTAMP, - {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S", - "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S"}}, - }; - vector> best_sql_types_candidates; - map> best_format_candidates; - DataChunk best_header_row; - DetectCandidateTypes(options.auto_type_candidates, format_template_candidates, info_candidates, original_options, - best_num_cols, best_sql_types_candidates, best_format_candidates, best_header_row); - - if (best_format_candidates.empty() || best_header_row.size() == 0) { - throw InvalidInputException( - "Error in file \"%s\": CSV options could not be auto-detected. Consider setting parser options manually.", - original_options.file_path); - } - - // ####### - // ### header detection - // ####### - options.num_cols = best_num_cols; - DetectHeader(best_sql_types_candidates, best_header_row); - if (!options.sql_type_list.empty()) { - // user-defined types were supplied for certain columns - // override the types - if (!options.sql_types_per_column.empty()) { - // types supplied as name -> value map - idx_t found = 0; - for (idx_t i = 0; i < names.size(); i++) { - auto it = options.sql_types_per_column.find(names[i]); - if (it != options.sql_types_per_column.end()) { - best_sql_types_candidates[i] = {options.sql_type_list[it->second]}; - found++; - continue; - } - } - if (!options.file_options.union_by_name && found < options.sql_types_per_column.size()) { - string exception = ColumnTypesError(options.sql_types_per_column, names); - if (!exception.empty()) { - throw BinderException(exception); - } - } - } else { - // types supplied as list - if (names.size() < options.sql_type_list.size()) { - throw BinderException("read_csv: %d types were provided, but CSV file only has %d columns", - options.sql_type_list.size(), names.size()); - } - for (idx_t i = 0; i < options.sql_type_list.size(); i++) { - best_sql_types_candidates[i] = {options.sql_type_list[i]}; - } - } - } - - // ####### - // ### type detection (refining) - // ####### - return RefineTypeDetection(options.auto_type_candidates, requested_types, best_sql_types_candidates, - best_format_candidates); -} - -bool BufferedCSVReader::TryParseComplexCSV(DataChunk &insert_chunk, string &error_message) { - // used for parsing algorithm - bool finished_chunk = false; - idx_t column = 0; - vector escape_positions; - bool has_quotes = false; - uint8_t delimiter_pos = 0, escape_pos = 0, quote_pos = 0; - idx_t offset = 0; - idx_t line_start = 0; - // read values into the buffer (if any) - if (position >= buffer_size) { - if (!ReadBuffer(start, line_start)) { - return true; - } - } - // start parsing the first value - start = position; - goto value_start; -value_start: - /* state: value_start */ - // this state parses the first characters of a value - offset = 0; - delimiter_pos = 0; - quote_pos = 0; - do { - idx_t count = 0; - for (; position < buffer_size; position++) { - quote_search.Match(quote_pos, buffer[position]); - delimiter_search.Match(delimiter_pos, buffer[position]); - count++; - if (delimiter_pos == options.delimiter.size()) { - // found a delimiter, add the value - offset = 
options.delimiter.size() - 1; - goto add_value; - } else if (StringUtil::CharacterIsNewline(buffer[position])) { - // found a newline, add the row - goto add_row; - } - if (count > quote_pos) { - // did not find a quote directly at the start of the value, stop looking for the quote now - goto normal; - } - if (quote_pos == options.quote.size()) { - // found a quote, go to quoted loop and skip the initial quote - start += options.quote.size(); - goto in_quotes; - } - } - } while (ReadBuffer(start, line_start)); - // file ends while scanning for quote/delimiter, go to final state - goto final_state; -normal: - /* state: normal parsing state */ - // this state parses the remainder of a non-quoted value until we reach a delimiter or newline - position++; - do { - for (; position < buffer_size; position++) { - delimiter_search.Match(delimiter_pos, buffer[position]); - if (delimiter_pos == options.delimiter.size()) { - offset = options.delimiter.size() - 1; - goto add_value; - } else if (StringUtil::CharacterIsNewline(buffer[position])) { - goto add_row; - } - } - } while (ReadBuffer(start, line_start)); - goto final_state; -add_value: - AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes); - // increase position by 1 and move start to the new position - offset = 0; - has_quotes = false; - start = ++position; - if (position >= buffer_size && !ReadBuffer(start, line_start)) { - // file ends right after delimiter, go to final state - goto final_state; - } - goto value_start; -add_row : { - // check type of newline (\r or \n) - bool carriage_return = buffer[position] == '\r'; - AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes); - finished_chunk = AddRow(insert_chunk, column, error_message); - - if (!error_message.empty()) { - return false; - } - // increase position by 1 and move start to the new position - offset = 0; - has_quotes = false; - position++; - SkipEmptyLines(); - start = position; - if (position >= buffer_size && !ReadBuffer(start, line_start)) { - // file ends right after newline, go to final state - goto final_state; - } - if (carriage_return) { - // \r newline, go to special state that parses an optional \n afterwards - goto carriage_return; - } else { - // \n newline, move to value start - if (finished_chunk) { - return true; - } - goto value_start; - } -} -in_quotes: - /* state: in_quotes */ - // this state parses the remainder of a quoted value - quote_pos = 0; - escape_pos = 0; - has_quotes = true; - position++; - do { - for (; position < buffer_size; position++) { - quote_search.Match(quote_pos, buffer[position]); - escape_search.Match(escape_pos, buffer[position]); - if (quote_pos == options.quote.size()) { - goto unquote; - } else if (escape_pos == options.escape.size()) { - escape_positions.push_back(position - start - (options.escape.size() - 1)); - goto handle_escape; - } - } - } while (ReadBuffer(start, line_start)); - // still in quoted state at the end of the file, error: - error_message = StringUtil::Format("Error in file \"%s\" on line %s: unterminated quotes. 
(%s)", options.file_path, - GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString()); - return false; -unquote: - /* state: unquote */ - // this state handles the state directly after we unquote - // in this state we expect either another quote (entering the quoted state again, and escaping the quote) - // or a delimiter/newline, ending the current value and moving on to the next value - delimiter_pos = 0; - quote_pos = 0; - position++; - if (position >= buffer_size && !ReadBuffer(start, line_start)) { - // file ends right after unquote, go to final state - offset = options.quote.size(); - goto final_state; - } - if (StringUtil::CharacterIsNewline(buffer[position])) { - // quote followed by newline, add row - offset = options.quote.size(); - goto add_row; - } - do { - idx_t count = 0; - for (; position < buffer_size; position++) { - quote_search.Match(quote_pos, buffer[position]); - delimiter_search.Match(delimiter_pos, buffer[position]); - count++; - if (count > delimiter_pos && count > quote_pos) { - error_message = StringUtil::Format( - "Error in file \"%s\" on line %s: quote should be followed by end of value, end " - "of row or another quote. (%s)", - options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString()); - return false; - } - if (delimiter_pos == options.delimiter.size()) { - // quote followed by delimiter, add value - offset = options.quote.size() + options.delimiter.size() - 1; - goto add_value; - } else if (quote_pos == options.quote.size() && - (options.escape.empty() || options.escape == options.quote)) { - // quote followed by quote, go back to quoted state and add to escape - escape_positions.push_back(position - start - (options.quote.size() - 1)); - goto in_quotes; - } - } - } while (ReadBuffer(start, line_start)); - error_message = StringUtil::Format( - "Error in file \"%s\" on line %s: quote should be followed by end of value, end of row or another quote. (%s)", - options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString()); - return false; -handle_escape: - escape_pos = 0; - quote_pos = 0; - position++; - do { - idx_t count = 0; - for (; position < buffer_size; position++) { - quote_search.Match(quote_pos, buffer[position]); - escape_search.Match(escape_pos, buffer[position]); - count++; - if (count > escape_pos && count > quote_pos) { - error_message = StringUtil::Format( - "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", - options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString()); - return false; - } - if (quote_pos == options.quote.size() || escape_pos == options.escape.size()) { - // found quote or escape: move back to quoted state - goto in_quotes; - } - } - } while (ReadBuffer(start, line_start)); - error_message = - StringUtil::Format("Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. 
(%s)", - options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString()); - return false; -carriage_return: - /* state: carriage_return */ - // this stage optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line - if (buffer[position] == '\n') { - // newline after carriage return: skip - start = ++position; - if (position >= buffer_size && !ReadBuffer(start, line_start)) { - // file ends right after newline, go to final state - goto final_state; - } - } - if (finished_chunk) { - return true; - } - goto value_start; -final_state: - if (finished_chunk) { - return true; - } - if (column > 0 || position > start) { - // remaining values to be added to the chunk - AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes); - finished_chunk = AddRow(insert_chunk, column, error_message); - SkipEmptyLines(); - if (!error_message.empty()) { - return false; - } - } - // final stage, only reached after parsing the file is finished - // flush the parsed chunk and finalize parsing - if (mode == ParserMode::PARSING) { - Flush(insert_chunk); - } - - end_of_file_reached = true; - return true; -} - -void BufferedCSVReader::SkipEmptyLines() { - if (parse_chunk.data.size() == 1) { - // Empty lines are null data. - return; - } - for (; position < buffer_size; position++) { - if (!StringUtil::CharacterIsNewline(buffer[position])) { - return; - } - } -} - -void UpdateMaxLineLength(ClientContext &context, idx_t line_length) { - if (!context.client_data->debug_set_max_line_length) { - return; - } - if (line_length < context.client_data->debug_max_line_length) { - return; - } - context.client_data->debug_max_line_length = line_length; -} - -bool BufferedCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message) { - // used for parsing algorithm - bool finished_chunk = false; - idx_t column = 0; - idx_t offset = 0; - bool has_quotes = false; - vector escape_positions; - - idx_t line_start = position; - // read values into the buffer (if any) - if (position >= buffer_size) { - if (!ReadBuffer(start, line_start)) { - return true; - } - } - - // start parsing the first value - goto value_start; -value_start: - offset = 0; - /* state: value_start */ - // this state parses the first character of a value - if (buffer[position] == options.quote[0]) { - // quote: actual value starts in the next position - // move to in_quotes state - start = position + 1; - goto in_quotes; - } else { - // no quote, move to normal parsing state - start = position; - goto normal; - } -normal: - /* state: normal parsing state */ - // this state parses the remainder of a non-quoted value until we reach a delimiter or newline - do { - for (; position < buffer_size; position++) { - if (buffer[position] == options.delimiter[0]) { - // delimiter: end the value and add it to the chunk - goto add_value; - } else if (StringUtil::CharacterIsNewline(buffer[position])) { - // newline: add row - goto add_row; - } - } - } while (ReadBuffer(start, line_start)); - // file ends during normal scan: go to end state - goto final_state; -add_value: - AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes); - // increase position by 1 and move start to the new position - offset = 0; - has_quotes = false; - start = ++position; - if (position >= buffer_size && !ReadBuffer(start, line_start)) { - // file ends right after delimiter, go to final state - goto final_state; - } - goto 
value_start; -add_row : { - // check type of newline (\r or \n) - bool carriage_return = buffer[position] == '\r'; - AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes); - if (!error_message.empty()) { - return false; - } - finished_chunk = AddRow(insert_chunk, column, error_message); - UpdateMaxLineLength(context, position - line_start); - if (!error_message.empty()) { - return false; - } - // increase position by 1 and move start to the new position - offset = 0; - has_quotes = false; - position++; - start = position; - line_start = position; - if (position >= buffer_size && !ReadBuffer(start, line_start)) { - // file ends right after delimiter, go to final state - goto final_state; - } - if (carriage_return) { - // \r newline, go to special state that parses an optional \n afterwards - goto carriage_return; - } else { - SetNewLineDelimiter(); - SkipEmptyLines(); - - start = position; - line_start = position; - if (position >= buffer_size && !ReadBuffer(start, line_start)) { - // file ends right after delimiter, go to final state - goto final_state; - } - // \n newline, move to value start - if (finished_chunk) { - return true; - } - goto value_start; - } -} -in_quotes: - /* state: in_quotes */ - // this state parses the remainder of a quoted value - has_quotes = true; - position++; - do { - for (; position < buffer_size; position++) { - if (buffer[position] == options.quote[0]) { - // quote: move to unquoted state - goto unquote; - } else if (buffer[position] == options.escape[0]) { - // escape: store the escaped position and move to handle_escape state - escape_positions.push_back(position - start); - goto handle_escape; - } - } - } while (ReadBuffer(start, line_start)); - // still in quoted state at the end of the file, error: - throw InvalidInputException("Error in file \"%s\" on line %s: unterminated quotes. (%s)", options.file_path, - GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString()); -unquote: - /* state: unquote */ - // this state handles the state directly after we unquote - // in this state we expect either another quote (entering the quoted state again, and escaping the quote) - // or a delimiter/newline, ending the current value and moving on to the next value - position++; - if (position >= buffer_size && !ReadBuffer(start, line_start)) { - // file ends right after unquote, go to final state - offset = 1; - goto final_state; - } - if (buffer[position] == options.quote[0] && (options.escape.empty() || options.escape[0] == options.quote[0])) { - // escaped quote, return to quoted state and store escape position - escape_positions.push_back(position - start); - goto in_quotes; - } else if (buffer[position] == options.delimiter[0]) { - // delimiter, add value - offset = 1; - goto add_value; - } else if (StringUtil::CharacterIsNewline(buffer[position])) { - offset = 1; - goto add_row; - } else { - error_message = StringUtil::Format( - "Error in file \"%s\" on line %s: quote should be followed by end of value, end of " - "row or another quote. (%s)", - options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString()); - return false; - } -handle_escape: - /* state: handle_escape */ - // escape should be followed by a quote or another escape character - position++; - if (position >= buffer_size && !ReadBuffer(start, line_start)) { - error_message = StringUtil::Format( - "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. 
(%s)", options.file_path, - GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString()); - return false; - } - if (buffer[position] != options.quote[0] && buffer[position] != options.escape[0]) { - error_message = StringUtil::Format( - "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path, - GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString()); - return false; - } - // escape was followed by quote or escape, go back to quoted state - goto in_quotes; -carriage_return: - /* state: carriage_return */ - // this stage optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line - if (buffer[position] == '\n') { - SetNewLineDelimiter(true, true); - // newline after carriage return: skip - // increase position by 1 and move start to the new position - start = ++position; - if (position >= buffer_size && !ReadBuffer(start, line_start)) { - // file ends right after delimiter, go to final state - goto final_state; - } - } else { - SetNewLineDelimiter(true, false); - } - if (finished_chunk) { - return true; - } - SkipEmptyLines(); - start = position; - line_start = position; - if (position >= buffer_size && !ReadBuffer(start, line_start)) { - // file ends right after delimiter, go to final state - goto final_state; - } - - goto value_start; -final_state: - if (finished_chunk) { - return true; - } - - if (column > 0 || position > start) { - // remaining values to be added to the chunk - AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes); - finished_chunk = AddRow(insert_chunk, column, error_message); - SkipEmptyLines(); - UpdateMaxLineLength(context, position - line_start); - if (!error_message.empty()) { - return false; - } - } - - // final stage, only reached after parsing the file is finished - // flush the parsed chunk and finalize parsing - if (mode == ParserMode::PARSING) { - Flush(insert_chunk); - } - - end_of_file_reached = true; - return true; -} - -bool BufferedCSVReader::ReadBuffer(idx_t &start, idx_t &line_start) { - if (start > buffer_size) { - return false; - } - auto old_buffer = std::move(buffer); - - // the remaining part of the last buffer - idx_t remaining = buffer_size - start; - - bool large_buffers = mode == ParserMode::PARSING && !file_handle->OnDiskFile() && file_handle->CanSeek(); - idx_t buffer_read_size = large_buffers ? 
INITIAL_BUFFER_SIZE_LARGE : INITIAL_BUFFER_SIZE; - - while (remaining > buffer_read_size) { - buffer_read_size *= 2; - } - - // Check line length - if (remaining > options.maximum_line_size) { - throw InvalidInputException("Maximum line size of %llu bytes exceeded on line %s!", options.maximum_line_size, - GetLineNumberStr(linenr, linenr_estimated)); - } - - buffer = make_unsafe_uniq_array(buffer_read_size + remaining + 1); - buffer_size = remaining + buffer_read_size; - if (remaining > 0) { - // remaining from last buffer: copy it here - memcpy(buffer.get(), old_buffer.get() + start, remaining); - } - idx_t read_count = file_handle->Read(buffer.get() + remaining, buffer_read_size); - - bytes_in_chunk += read_count; - buffer_size = remaining + read_count; - buffer[buffer_size] = '\0'; - if (old_buffer) { - cached_buffers.push_back(std::move(old_buffer)); - } - start = 0; - position = remaining; - if (!bom_checked) { - bom_checked = true; - if (read_count >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') { - start += 3; - position += 3; - } - } - line_start = start; - - return read_count > 0; -} - -void BufferedCSVReader::ParseCSV(DataChunk &insert_chunk) { - string error_message; - if (!TryParseCSV(ParserMode::PARSING, insert_chunk, error_message)) { - throw InvalidInputException(error_message); - } -} - -bool BufferedCSVReader::TryParseCSV(ParserMode mode) { - DataChunk dummy_chunk; - string error_message; - return TryParseCSV(mode, dummy_chunk, error_message); -} - -void BufferedCSVReader::ParseCSV(ParserMode mode) { - DataChunk dummy_chunk; - string error_message; - if (!TryParseCSV(mode, dummy_chunk, error_message)) { - throw InvalidInputException(error_message); - } -} - -bool BufferedCSVReader::TryParseCSV(ParserMode parser_mode, DataChunk &insert_chunk, string &error_message) { - mode = parser_mode; - - if (options.quote.size() <= 1 && options.escape.size() <= 1 && options.delimiter.size() == 1) { - return TryParseSimpleCSV(insert_chunk, error_message); - } else { - return TryParseComplexCSV(insert_chunk, error_message); - } -} - -} // namespace duckdb diff --git a/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp b/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp deleted file mode 100644 index e24662894..000000000 --- a/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +++ /dev/null @@ -1,72 +0,0 @@ -#include "duckdb/execution/operator/persistent/csv_buffer.hpp" -#include "duckdb/common/string_util.hpp" - -namespace duckdb { - -CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle, - idx_t &global_csv_current_position, idx_t file_number_p) - : context(context), first_buffer(true), file_number(file_number_p) { - this->handle = AllocateBuffer(buffer_size_p); - - auto buffer = Ptr(); - actual_size = file_handle.Read(buffer, buffer_size_p); - global_csv_start = global_csv_current_position; - global_csv_current_position += actual_size; - if (actual_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') { - start_position += 3; - } - last_buffer = file_handle.FinishedReading(); -} - -CSVBuffer::CSVBuffer(ClientContext &context, BufferHandle buffer_p, idx_t buffer_size_p, idx_t actual_size_p, - bool final_buffer, idx_t global_csv_current_position, idx_t file_number_p) - : context(context), handle(std::move(buffer_p)), actual_size(actual_size_p), last_buffer(final_buffer), - global_csv_start(global_csv_current_position), file_number(file_number_p) { -} - -unique_ptr 
CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t &global_csv_current_position, - idx_t file_number_p) { - auto next_buffer = AllocateBuffer(buffer_size); - idx_t next_buffer_actual_size = file_handle.Read(next_buffer.Ptr(), buffer_size); - if (next_buffer_actual_size == 0) { - // We are done reading - return nullptr; - } - - auto next_csv_buffer = - make_uniq(context, std::move(next_buffer), buffer_size, next_buffer_actual_size, - file_handle.FinishedReading(), global_csv_current_position, file_number_p); - global_csv_current_position += next_buffer_actual_size; - return next_csv_buffer; -} - -BufferHandle CSVBuffer::AllocateBuffer(idx_t buffer_size) { - auto &buffer_manager = BufferManager::GetBufferManager(context); - return buffer_manager.Allocate(MaxValue(Storage::BLOCK_SIZE, buffer_size)); -} - -idx_t CSVBuffer::GetBufferSize() { - return actual_size; -} - -idx_t CSVBuffer::GetStart() { - return start_position; -} - -bool CSVBuffer::IsCSVFileLastBuffer() { - return last_buffer; -} - -bool CSVBuffer::IsCSVFileFirstBuffer() { - return first_buffer; -} - -idx_t CSVBuffer::GetCSVGlobalStart() { - return global_csv_start; -} - -idx_t CSVBuffer::GetFileNumber() { - return file_number; -} - -} // namespace duckdb diff --git a/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp b/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp deleted file mode 100644 index 03ef4ede5..000000000 --- a/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +++ /dev/null @@ -1,158 +0,0 @@ -#include "duckdb/execution/operator/persistent/csv_file_handle.hpp" - -namespace duckdb { - -CSVFileHandle::CSVFileHandle(FileSystem &fs, Allocator &allocator, unique_ptr file_handle_p, - const string &path_p, FileCompressionType compression, bool enable_reset) - : fs(fs), allocator(allocator), file_handle(std::move(file_handle_p)), path(path_p), compression(compression), - reset_enabled(enable_reset) { - can_seek = file_handle->CanSeek(); - on_disk_file = file_handle->OnDiskFile(); - file_size = file_handle->GetFileSize(); -} - -unique_ptr CSVFileHandle::OpenFileHandle(FileSystem &fs, Allocator &allocator, const string &path, - FileCompressionType compression) { - auto file_handle = fs.OpenFile(path, FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK, compression); - if (file_handle->CanSeek()) { - file_handle->Reset(); - } - return file_handle; -} - -unique_ptr CSVFileHandle::OpenFile(FileSystem &fs, Allocator &allocator, const string &path, - FileCompressionType compression, bool enable_reset) { - auto file_handle = CSVFileHandle::OpenFileHandle(fs, allocator, path, compression); - return make_uniq(fs, allocator, std::move(file_handle), path, compression, enable_reset); -} - -bool CSVFileHandle::CanSeek() { - return can_seek; -} - -void CSVFileHandle::Seek(idx_t position) { - if (!can_seek) { - throw InternalException("Cannot seek in this file"); - } - file_handle->Seek(position); -} - -idx_t CSVFileHandle::SeekPosition() { - if (!can_seek) { - throw InternalException("Cannot seek in this file"); - } - return file_handle->SeekPosition(); -} - -void CSVFileHandle::Reset() { - requested_bytes = 0; - read_position = 0; - if (can_seek) { - // we can seek - reset the file handle - file_handle->Reset(); - } else if (on_disk_file) { - // we cannot seek but it is an on-disk file - re-open the file - file_handle = CSVFileHandle::OpenFileHandle(fs, allocator, path, compression); - } else { - if (!reset_enabled) { - throw InternalException("Reset called but reset is not 
enabled for this CSV Handle"); - } - read_position = 0; - } -} -bool CSVFileHandle::OnDiskFile() { - return on_disk_file; -} - -idx_t CSVFileHandle::FileSize() { - return file_size; -} - -bool CSVFileHandle::FinishedReading() { - return requested_bytes >= file_size; -} - -idx_t CSVFileHandle::Read(void *buffer, idx_t nr_bytes) { - requested_bytes += nr_bytes; - if (on_disk_file || can_seek) { - // if this is a plain file source OR we can seek we are not caching anything - return file_handle->Read(buffer, nr_bytes); - } - // not a plain file source: we need to do some bookkeeping around the reset functionality - idx_t result_offset = 0; - if (read_position < buffer_size) { - // we need to read from our cached buffer - auto buffer_read_count = MinValue(nr_bytes, buffer_size - read_position); - memcpy(buffer, cached_buffer.get() + read_position, buffer_read_count); - result_offset += buffer_read_count; - read_position += buffer_read_count; - if (result_offset == nr_bytes) { - return nr_bytes; - } - } else if (!reset_enabled && cached_buffer.IsSet()) { - // reset is disabled, but we still have cached data - // we can remove any cached data - cached_buffer.Reset(); - buffer_size = 0; - buffer_capacity = 0; - read_position = 0; - } - // we have data left to read from the file - // read directly into the buffer - auto bytes_read = file_handle->Read(char_ptr_cast(buffer) + result_offset, nr_bytes - result_offset); - file_size = file_handle->GetFileSize(); - read_position += bytes_read; - if (reset_enabled) { - // if reset caching is enabled, we need to cache the bytes that we have read - if (buffer_size + bytes_read >= buffer_capacity) { - // no space; first enlarge the buffer - buffer_capacity = MaxValue(NextPowerOfTwo(buffer_size + bytes_read), buffer_capacity * 2); - - auto new_buffer = allocator.Allocate(buffer_capacity); - if (buffer_size > 0) { - memcpy(new_buffer.get(), cached_buffer.get(), buffer_size); - } - cached_buffer = std::move(new_buffer); - } - memcpy(cached_buffer.get() + buffer_size, char_ptr_cast(buffer) + result_offset, bytes_read); - buffer_size += bytes_read; - } - - return result_offset + bytes_read; -} - -string CSVFileHandle::ReadLine() { - bool carriage_return = false; - string result; - char buffer[1]; - while (true) { - idx_t bytes_read = Read(buffer, 1); - if (bytes_read == 0) { - return result; - } - if (carriage_return) { - if (buffer[0] != '\n') { - if (!file_handle->CanSeek()) { - throw BinderException( - "Carriage return newlines not supported when reading CSV files in which we cannot seek"); - } - file_handle->Seek(file_handle->SeekPosition() - 1); - return result; - } - } - if (buffer[0] == '\n') { - return result; - } - if (buffer[0] != '\r') { - result += buffer[0]; - } else { - carriage_return = true; - } - } -} - -void CSVFileHandle::DisableReset() { - this->reset_enabled = false; -} - -} // namespace duckdb diff --git a/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp b/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp deleted file mode 100644 index 84df782ae..000000000 --- a/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +++ /dev/null @@ -1,280 +0,0 @@ -#include "duckdb/execution/operator/persistent/csv_reader_options.hpp" -#include "duckdb/common/bind_helpers.hpp" -#include "duckdb/common/vector_size.hpp" -#include "duckdb/common/string_util.hpp" - -namespace duckdb { - -static bool ParseBoolean(const Value &value, const string &loption); - -static bool ParseBoolean(const vector &set, const string 
&loption) { - if (set.empty()) { - // no option specified: default to true - return true; - } - if (set.size() > 1) { - throw BinderException("\"%s\" expects a single argument as a boolean value (e.g. TRUE or 1)", loption); - } - return ParseBoolean(set[0], loption); -} - -static bool ParseBoolean(const Value &value, const string &loption) { - - if (value.type().id() == LogicalTypeId::LIST) { - auto &children = ListValue::GetChildren(value); - return ParseBoolean(children, loption); - } - if (value.type() == LogicalType::FLOAT || value.type() == LogicalType::DOUBLE || - value.type().id() == LogicalTypeId::DECIMAL) { - throw BinderException("\"%s\" expects a boolean value (e.g. TRUE or 1)", loption); - } - return BooleanValue::Get(value.DefaultCastAs(LogicalType::BOOLEAN)); -} - -static string ParseString(const Value &value, const string &loption) { - if (value.IsNull()) { - return string(); - } - if (value.type().id() == LogicalTypeId::LIST) { - auto &children = ListValue::GetChildren(value); - if (children.size() != 1) { - throw BinderException("\"%s\" expects a single argument as a string value", loption); - } - return ParseString(children[0], loption); - } - if (value.type().id() != LogicalTypeId::VARCHAR) { - throw BinderException("\"%s\" expects a string argument!", loption); - } - return value.GetValue(); -} - -static int64_t ParseInteger(const Value &value, const string &loption) { - if (value.type().id() == LogicalTypeId::LIST) { - auto &children = ListValue::GetChildren(value); - if (children.size() != 1) { - // no option specified or multiple options specified - throw BinderException("\"%s\" expects a single argument as an integer value", loption); - } - return ParseInteger(children[0], loption); - } - return value.GetValue(); -} - -void BufferedCSVReaderOptions::SetHeader(bool input) { - this->header = input; - this->has_header = true; -} - -void BufferedCSVReaderOptions::SetCompression(const string &compression_p) { - this->compression = FileCompressionTypeFromString(compression_p); -} - -void BufferedCSVReaderOptions::SetEscape(const string &input) { - this->escape = input; - this->has_escape = true; -} - -void BufferedCSVReaderOptions::SetDelimiter(const string &input) { - this->delimiter = StringUtil::Replace(input, "\\t", "\t"); - this->has_delimiter = true; - if (input.empty()) { - this->delimiter = string("\0", 1); - } -} - -void BufferedCSVReaderOptions::SetQuote(const string "e_p) { - this->quote = quote_p; - this->has_quote = true; -} - -void BufferedCSVReaderOptions::SetNewline(const string &input) { - if (input == "\\n" || input == "\\r") { - new_line = NewLineIdentifier::SINGLE; - } else if (input == "\\r\\n") { - new_line = NewLineIdentifier::CARRY_ON; - } else { - throw InvalidInputException("This is not accepted as a newline: " + input); - } - has_newline = true; -} - -void BufferedCSVReaderOptions::SetDateFormat(LogicalTypeId type, const string &format, bool read_format) { - string error; - if (read_format) { - error = StrTimeFormat::ParseFormatSpecifier(format, date_format[type]); - date_format[type].format_specifier = format; - } else { - error = StrTimeFormat::ParseFormatSpecifier(format, write_date_format[type]); - } - if (!error.empty()) { - throw InvalidInputException("Could not parse DATEFORMAT: %s", error.c_str()); - } - has_format[type] = true; -} - -void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value &value, - vector &expected_names) { - if (SetBaseOption(loption, value)) { - return; - } - if (loption == "auto_detect") { - 
auto_detect = ParseBoolean(value, loption); - } else if (loption == "sample_size") { - int64_t sample_size = ParseInteger(value, loption); - if (sample_size < 1 && sample_size != -1) { - throw BinderException("Unsupported parameter for SAMPLE_SIZE: cannot be smaller than 1"); - } - if (sample_size == -1) { - sample_chunks = std::numeric_limits::max(); - sample_chunk_size = STANDARD_VECTOR_SIZE; - } else if (sample_size <= STANDARD_VECTOR_SIZE) { - sample_chunk_size = sample_size; - sample_chunks = 1; - } else { - sample_chunk_size = STANDARD_VECTOR_SIZE; - sample_chunks = sample_size / STANDARD_VECTOR_SIZE + 1; - } - } else if (loption == "skip") { - skip_rows = ParseInteger(value, loption); - skip_rows_set = true; - } else if (loption == "max_line_size" || loption == "maximum_line_size") { - maximum_line_size = ParseInteger(value, loption); - } else if (loption == "sample_chunk_size") { - sample_chunk_size = ParseInteger(value, loption); - if (sample_chunk_size > STANDARD_VECTOR_SIZE) { - throw BinderException( - "Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be bigger than STANDARD_VECTOR_SIZE %d", - STANDARD_VECTOR_SIZE); - } else if (sample_chunk_size < 1) { - throw BinderException("Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be smaller than 1"); - } - } else if (loption == "sample_chunks") { - sample_chunks = ParseInteger(value, loption); - if (sample_chunks < 1) { - throw BinderException("Unsupported parameter for SAMPLE_CHUNKS: cannot be smaller than 1"); - } - } else if (loption == "force_not_null") { - force_not_null = ParseColumnList(value, expected_names, loption); - } else if (loption == "date_format" || loption == "dateformat") { - string format = ParseString(value, loption); - SetDateFormat(LogicalTypeId::DATE, format, true); - } else if (loption == "timestamp_format" || loption == "timestampformat") { - string format = ParseString(value, loption); - SetDateFormat(LogicalTypeId::TIMESTAMP, format, true); - } else if (loption == "ignore_errors") { - ignore_errors = ParseBoolean(value, loption); - } else if (loption == "buffer_size") { - buffer_size = ParseInteger(value, loption); - if (buffer_size == 0) { - throw InvalidInputException("Buffer Size option must be higher than 0"); - } - } else if (loption == "decimal_separator") { - decimal_separator = ParseString(value, loption); - if (decimal_separator != "." && decimal_separator != ",") { - throw BinderException("Unsupported parameter for DECIMAL_SEPARATOR: should be '.' or ','"); - } - } else if (loption == "null_padding") { - null_padding = ParseBoolean(value, loption); - } else if (loption == "allow_quoted_nulls") { - allow_quoted_nulls = ParseBoolean(value, loption); - } else if (loption == "parallel") { - parallel_mode = ParseBoolean(value, loption) ? 
ParallelMode::PARALLEL : ParallelMode::SINGLE_THREADED; - } else if (loption == "rejects_table") { - // skip, handled in SetRejectsOptions - auto table_name = ParseString(value, loption); - if (table_name.empty()) { - throw BinderException("REJECTS_TABLE option cannot be empty"); - } - rejects_table_name = table_name; - } else if (loption == "rejects_recovery_columns") { - // Get the list of columns to use as a recovery key - auto &children = ListValue::GetChildren(value); - for (auto &child : children) { - auto col_name = child.GetValue(); - rejects_recovery_columns.push_back(col_name); - } - } else if (loption == "rejects_limit") { - int64_t limit = ParseInteger(value, loption); - if (limit < 0) { - throw BinderException("Unsupported parameter for REJECTS_LIMIT: cannot be negative"); - } - rejects_limit = limit; - } else { - throw BinderException("Unrecognized option for CSV reader \"%s\"", loption); - } -} - -void BufferedCSVReaderOptions::SetWriteOption(const string &loption, const Value &value) { - if (loption == "new_line") { - // Steal this from SetBaseOption so we can write different newlines (e.g., format JSON ARRAY) - write_newline = ParseString(value, loption); - return; - } - - if (SetBaseOption(loption, value)) { - return; - } - - if (loption == "force_quote") { - force_quote = ParseColumnList(value, name_list, loption); - } else if (loption == "date_format" || loption == "dateformat") { - string format = ParseString(value, loption); - SetDateFormat(LogicalTypeId::DATE, format, false); - } else if (loption == "timestamp_format" || loption == "timestampformat") { - string format = ParseString(value, loption); - if (StringUtil::Lower(format) == "iso") { - format = "%Y-%m-%dT%H:%M:%S.%fZ"; - } - SetDateFormat(LogicalTypeId::TIMESTAMP, format, false); - SetDateFormat(LogicalTypeId::TIMESTAMP_TZ, format, false); - } else if (loption == "prefix") { - prefix = ParseString(value, loption); - } else if (loption == "suffix") { - suffix = ParseString(value, loption); - } else { - throw BinderException("Unrecognized option CSV writer \"%s\"", loption); - } -} - -bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value &value) { - // Make sure this function was only called after the option was turned into lowercase - D_ASSERT(!std::any_of(loption.begin(), loption.end(), ::isupper)); - - if (StringUtil::StartsWith(loption, "delim") || StringUtil::StartsWith(loption, "sep")) { - SetDelimiter(ParseString(value, loption)); - } else if (loption == "quote") { - SetQuote(ParseString(value, loption)); - } else if (loption == "new_line") { - SetNewline(ParseString(value, loption)); - } else if (loption == "escape") { - SetEscape(ParseString(value, loption)); - } else if (loption == "header") { - SetHeader(ParseBoolean(value, loption)); - } else if (loption == "null" || loption == "nullstr") { - null_str = ParseString(value, loption); - } else if (loption == "encoding") { - auto encoding = StringUtil::Lower(ParseString(value, loption)); - if (encoding != "utf8" && encoding != "utf-8") { - throw BinderException("Copy is only supported for UTF-8 encoded files, ENCODING 'UTF-8'"); - } - } else if (loption == "compression") { - SetCompression(ParseString(value, loption)); - } else { - // unrecognized option in base CSV - return false; - } - return true; -} - -std::string BufferedCSVReaderOptions::ToString() const { - return " file=" + file_path + "\n delimiter='" + delimiter + - (has_delimiter ? "'" : (auto_detect ? 
"' (auto detected)" : "' (default)")) + "\n quote='" + quote + - (has_quote ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + "\n escape='" + escape + - (has_escape ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + - "\n header=" + std::to_string(header) + - (has_header ? "" : (auto_detect ? " (auto detected)" : "' (default)")) + - "\n sample_size=" + std::to_string(sample_chunk_size * sample_chunks) + - "\n ignore_errors=" + std::to_string(ignore_errors) + "\n all_varchar=" + std::to_string(all_varchar); -} - -} // namespace duckdb diff --git a/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp b/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp deleted file mode 100644 index 6b2a58312..000000000 --- a/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +++ /dev/null @@ -1,666 +0,0 @@ -#include "duckdb/execution/operator/persistent/parallel_csv_reader.hpp" - -#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp" -#include "duckdb/common/file_system.hpp" -#include "duckdb/common/string_util.hpp" -#include "duckdb/common/to_string.hpp" -#include "duckdb/common/types/cast_helpers.hpp" -#include "duckdb/common/vector_operations/unary_executor.hpp" -#include "duckdb/common/vector_operations/vector_operations.hpp" -#include "duckdb/function/scalar/strftime_format.hpp" -#include "duckdb/main/database.hpp" -#include "duckdb/parser/column_definition.hpp" -#include "duckdb/storage/data_table.hpp" -#include "utf8proc_wrapper.hpp" -#include "utf8proc.hpp" -#include "duckdb/parser/keyword_helper.hpp" -#include "duckdb/function/table/read_csv.hpp" -#include "duckdb/execution/operator/persistent/csv_line_info.hpp" - -#include -#include -#include -#include - -namespace duckdb { - -ParallelCSVReader::ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p, - unique_ptr buffer_p, idx_t first_pos_first_buffer_p, - const vector &requested_types, idx_t file_idx_p) - : BaseCSVReader(context, std::move(options_p), requested_types), file_idx(file_idx_p), - first_pos_first_buffer(first_pos_first_buffer_p) { - Initialize(requested_types); - SetBufferRead(std::move(buffer_p)); - if (options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1) { - throw InternalException("Parallel CSV reader cannot handle CSVs with multi-byte delimiters/escapes/quotes"); - } -} - -void ParallelCSVReader::Initialize(const vector &requested_types) { - return_types = requested_types; - InitParseChunk(return_types.size()); -} - -bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl, bool first_char) { - // Set the delimiter if not set yet. - SetNewLineDelimiter(carry, carry_followed_by_nl); - D_ASSERT(options.new_line == NewLineIdentifier::SINGLE || options.new_line == NewLineIdentifier::CARRY_ON); - if (options.new_line == NewLineIdentifier::SINGLE) { - return (!carry) || (carry && !carry_followed_by_nl); - } - return (carry && carry_followed_by_nl) || (!carry && first_char); -} - -void ParallelCSVReader::SkipEmptyLines() { - idx_t new_pos_buffer = position_buffer; - if (parse_chunk.data.size() == 1) { - // Empty lines are null data. 
- return; - } - for (; new_pos_buffer < end_buffer; new_pos_buffer++) { - if (StringUtil::CharacterIsNewline((*buffer)[new_pos_buffer])) { - bool carrier_return = (*buffer)[new_pos_buffer] == '\r'; - new_pos_buffer++; - if (carrier_return && new_pos_buffer < buffer_size && (*buffer)[new_pos_buffer] == '\n') { - position_buffer++; - } - if (new_pos_buffer > end_buffer) { - return; - } - position_buffer = new_pos_buffer; - } else if ((*buffer)[new_pos_buffer] != ' ') { - return; - } - } -} - -bool ParallelCSVReader::SetPosition() { - if (buffer->buffer->IsCSVFileFirstBuffer() && start_buffer == position_buffer && - start_buffer == first_pos_first_buffer) { - start_buffer = buffer->buffer->GetStart(); - position_buffer = start_buffer; - verification_positions.beginning_of_first_line = position_buffer; - verification_positions.end_of_last_line = position_buffer; - // First buffer doesn't need any setting - - if (options.header) { - for (; position_buffer < end_buffer; position_buffer++) { - if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) { - bool carrier_return = (*buffer)[position_buffer] == '\r'; - position_buffer++; - if (carrier_return && position_buffer < buffer_size && (*buffer)[position_buffer] == '\n') { - position_buffer++; - } - if (position_buffer > end_buffer) { - return false; - } - SkipEmptyLines(); - if (verification_positions.beginning_of_first_line == 0) { - verification_positions.beginning_of_first_line = position_buffer; - } - - verification_positions.end_of_last_line = position_buffer; - return true; - } - } - return false; - } - SkipEmptyLines(); - if (verification_positions.beginning_of_first_line == 0) { - verification_positions.beginning_of_first_line = position_buffer; - } - - verification_positions.end_of_last_line = position_buffer; - return true; - } - - // We have to move position up to next new line - idx_t end_buffer_real = end_buffer; - // Check if we already start in a valid line - string error_message; - bool successfully_read_first_line = false; - while (!successfully_read_first_line) { - DataChunk first_line_chunk; - first_line_chunk.Initialize(allocator, return_types); - // Ensure that parse_chunk has no gunk when trying to figure new line - parse_chunk.Reset(); - for (; position_buffer < end_buffer; position_buffer++) { - if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) { - bool carriage_return = (*buffer)[position_buffer] == '\r'; - bool carriage_return_followed = false; - position_buffer++; - if (position_buffer < end_buffer) { - if (carriage_return && (*buffer)[position_buffer] == '\n') { - carriage_return_followed = true; - position_buffer++; - } - } - if (NewLineDelimiter(carriage_return, carriage_return_followed, position_buffer - 1 == start_buffer)) { - break; - } - } - } - SkipEmptyLines(); - - if (position_buffer > buffer_size) { - break; - } - - if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) { - break; - } - - if (position_buffer > end_buffer && options.new_line == NewLineIdentifier::CARRY_ON && - (*buffer)[position_buffer - 1] == '\n') { - break; - } - idx_t position_set = position_buffer; - start_buffer = position_buffer; - // We check if we can add this line - // disable the projection pushdown while reading the first line - // otherwise the first line parsing can be influenced by which columns we are reading - auto column_ids = std::move(reader_data.column_ids); - auto column_mapping = std::move(reader_data.column_mapping); - InitializeProjection(); - try { 
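-			// probing a candidate first line can throw (for example a cast error mid-line); the catch(...) below treats any failure as "not a valid line start" so the scan simply advances to the next newline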
- successfully_read_first_line = TryParseSimpleCSV(first_line_chunk, error_message, true); - } catch (...) { - successfully_read_first_line = false; - } - // restore the projection pushdown - reader_data.column_ids = std::move(column_ids); - reader_data.column_mapping = std::move(column_mapping); - end_buffer = end_buffer_real; - start_buffer = position_set; - if (position_buffer >= end_buffer) { - if (successfully_read_first_line) { - position_buffer = position_set; - } - break; - } - position_buffer = position_set; - } - if (verification_positions.beginning_of_first_line == 0) { - verification_positions.beginning_of_first_line = position_buffer; - } - // Ensure that parse_chunk has no gunk when trying to figure new line - parse_chunk.Reset(); - - verification_positions.end_of_last_line = position_buffer; - finished = false; - return successfully_read_first_line; -} - -void ParallelCSVReader::SetBufferRead(unique_ptr buffer_read_p) { - if (!buffer_read_p->buffer) { - throw InternalException("ParallelCSVReader::SetBufferRead - CSVBufferRead does not have a buffer to read"); - } - position_buffer = buffer_read_p->buffer_start; - start_buffer = buffer_read_p->buffer_start; - end_buffer = buffer_read_p->buffer_end; - if (buffer_read_p->next_buffer) { - buffer_size = buffer_read_p->buffer->GetBufferSize() + buffer_read_p->next_buffer->GetBufferSize(); - } else { - buffer_size = buffer_read_p->buffer->GetBufferSize(); - } - buffer = std::move(buffer_read_p); - - reached_remainder_state = false; - verification_positions.beginning_of_first_line = 0; - verification_positions.end_of_last_line = 0; - finished = false; - D_ASSERT(end_buffer <= buffer_size); -} - -VerificationPositions ParallelCSVReader::GetVerificationPositions() { - verification_positions.beginning_of_first_line += buffer->buffer->GetCSVGlobalStart(); - verification_positions.end_of_last_line += buffer->buffer->GetCSVGlobalStart(); - return verification_positions; -} - -// If BufferRemainder returns false, it means we are done scanning this buffer and should go to the end_state -bool ParallelCSVReader::BufferRemainder() { - if (position_buffer >= end_buffer && !reached_remainder_state) { - // First time we finish the buffer piece we should scan here, we set the variables - // to allow this piece to be scanned up to the end of the buffer or the next new line - reached_remainder_state = true; - // end_buffer is allowed to go to buffer size to finish its last line - end_buffer = buffer_size; - } - if (position_buffer >= end_buffer) { - // buffer ends, return false - return false; - } - // we can still scan stuff, return true - return true; -} - -void ParallelCSVReader::VerifyLineLength(idx_t line_size) { - if (line_size > options.maximum_line_size) { - throw InvalidInputException("Error in file \"%s\" on line %s: Maximum line size of %llu bytes exceeded!", - options.file_path, - GetLineNumberStr(parse_chunk.size(), linenr_estimated, buffer->batch_index).c_str(), - options.maximum_line_size); - } -} - -bool AllNewLine(string_t value, idx_t column_amount) { - auto value_str = value.GetString(); - if (value_str.empty() && column_amount == 1) { - // This is a one column (empty) - return false; - } - for (idx_t i = 0; i < value.GetSize(); i++) { - if (!StringUtil::CharacterIsNewline(value_str[i])) { - return false; - } - } - return true; -} - -bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line) { - // If line is not set, we have to figure it out, we assume whatever is in the first 
line - if (options.new_line == NewLineIdentifier::NOT_SET) { - idx_t cur_pos = position_buffer; - // we can start in the middle of a new line, so move a bit forward. - while (cur_pos < end_buffer) { - if (StringUtil::CharacterIsNewline((*buffer)[cur_pos])) { - cur_pos++; - } else { - break; - } - } - for (; cur_pos < end_buffer; cur_pos++) { - if (StringUtil::CharacterIsNewline((*buffer)[cur_pos])) { - bool carriage_return = (*buffer)[cur_pos] == '\r'; - bool carriage_return_followed = false; - cur_pos++; - if (cur_pos < end_buffer) { - if (carriage_return && (*buffer)[cur_pos] == '\n') { - carriage_return_followed = true; - cur_pos++; - } - } - SetNewLineDelimiter(carriage_return, carriage_return_followed); - break; - } - } - } - // used for parsing algorithm - if (start_buffer == buffer_size) { - // Nothing to read - finished = true; - return true; - } - D_ASSERT(end_buffer <= buffer_size); - bool finished_chunk = false; - idx_t column = 0; - idx_t offset = 0; - bool has_quotes = false; - - vector escape_positions; - if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) { - // First time reading this buffer piece - if (!SetPosition()) { - finished = true; - return true; - } - } - if (position_buffer == buffer_size) { - // Nothing to read - finished = true; - return true; - } - // Keep track of line size - idx_t line_start = position_buffer; - // start parsing the first value - goto value_start; - -value_start : { - /* state: value_start */ - if (!BufferRemainder()) { - goto final_state; - } - offset = 0; - - // this state parses the first character of a value - if ((*buffer)[position_buffer] == options.quote[0]) { - // quote: actual value starts in the next position - // move to in_quotes state - start_buffer = position_buffer + 1; - goto in_quotes; - } else { - // no quote, move to normal parsing state - start_buffer = position_buffer; - goto normal; - } -}; - -normal : { - /* state: normal parsing state */ - // this state parses the remainder of a non-quoted value until we reach a delimiter or newline - for (; position_buffer < end_buffer; position_buffer++) { - auto c = (*buffer)[position_buffer]; - if (c == options.delimiter[0]) { - // delimiter: end the value and add it to the chunk - goto add_value; - } else if (c == options.quote[0] && try_add_line) { - return false; - } else if (StringUtil::CharacterIsNewline(c)) { - // newline: add row - if (column > 0 || try_add_line || parse_chunk.data.size() == 1) { - goto add_row; - } - if (column == 0 && position_buffer == start_buffer) { - start_buffer++; - } - } - } - if (!BufferRemainder()) { - goto final_state; - } else { - goto normal; - } -}; - -add_value : { - /* state: Add value to string vector */ - AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes, - buffer->local_batch_index); - // increase position by 1 and move start to the new position - offset = 0; - has_quotes = false; - start_buffer = ++position_buffer; - if (!BufferRemainder()) { - goto final_state; - } - goto value_start; -}; - -add_row : { - /* state: Add Row to Parse chunk */ - // check type of newline (\r or \n) - bool carriage_return = (*buffer)[position_buffer] == '\r'; - - AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes, - buffer->local_batch_index); - if (try_add_line) { - bool success = column == insert_chunk.ColumnCount(); - if (success) { - idx_t cur_linenr = linenr; - AddRow(insert_chunk, column, error_message, 
buffer->local_batch_index); - success = Flush(insert_chunk, buffer->local_batch_index, true); - linenr = cur_linenr; - } - reached_remainder_state = false; - parse_chunk.Reset(); - return success; - } else { - VerifyLineLength(position_buffer - line_start); - line_start = position_buffer; - finished_chunk = AddRow(insert_chunk, column, error_message, buffer->local_batch_index); - } - // increase position by 1 and move start to the new position - offset = 0; - has_quotes = false; - position_buffer++; - start_buffer = position_buffer; - verification_positions.end_of_last_line = position_buffer; - if (carriage_return) { - // \r newline, go to special state that parses an optional \n afterwards - // optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line - if (!BufferRemainder()) { - goto final_state; - } - if ((*buffer)[position_buffer] == '\n') { - if (options.new_line == NewLineIdentifier::SINGLE) { - error_message = "Wrong NewLine Identifier. Expecting \\r\\n"; - return false; - } - // newline after carriage return: skip - // increase position by 1 and move start to the new position - start_buffer = ++position_buffer; - - SkipEmptyLines(); - verification_positions.end_of_last_line = position_buffer; - start_buffer = position_buffer; - if (reached_remainder_state) { - goto final_state; - } - } else { - if (options.new_line == NewLineIdentifier::CARRY_ON) { - error_message = "Wrong NewLine Identifier. Expecting \\r or \\n"; - return false; - } - } - if (!BufferRemainder()) { - goto final_state; - } - if (reached_remainder_state || finished_chunk) { - goto final_state; - } - goto value_start; - } else { - if (options.new_line == NewLineIdentifier::CARRY_ON) { - error_message = "Wrong NewLine Identifier. Expecting \\r or \\n"; - return false; - } - if (reached_remainder_state) { - goto final_state; - } - if (!BufferRemainder()) { - goto final_state; - } - SkipEmptyLines(); - verification_positions.end_of_last_line = position_buffer; - start_buffer = position_buffer; - // \n newline, move to value start - if (finished_chunk) { - goto final_state; - } - goto value_start; - } -} -in_quotes: - /* state: in_quotes this state parses the remainder of a quoted value*/ - has_quotes = true; - position_buffer++; - for (; position_buffer < end_buffer; position_buffer++) { - auto c = (*buffer)[position_buffer]; - if (c == options.quote[0]) { - // quote: move to unquoted state - goto unquote; - } else if (c == options.escape[0]) { - // escape: store the escaped position and move to handle_escape state - escape_positions.push_back(position_buffer - start_buffer); - goto handle_escape; - } - } - if (!BufferRemainder()) { - if (buffer->buffer->IsCSVFileLastBuffer()) { - if (try_add_line) { - return false; - } - // still in quoted state at the end of the file or at the end of a buffer when running multithreaded, error: - throw InvalidInputException("Error in file \"%s\" on line %s: unterminated quotes. 
(%s)", options.file_path, - GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(), - options.ToString()); - } else { - goto final_state; - } - } else { - position_buffer--; - goto in_quotes; - } - -unquote : { - /* state: unquote: this state handles the state directly after we unquote*/ - // - // in this state we expect either another quote (entering the quoted state again, and escaping the quote) - // or a delimiter/newline, ending the current value and moving on to the next value - position_buffer++; - if (!BufferRemainder()) { - offset = 1; - goto final_state; - } - auto c = (*buffer)[position_buffer]; - if (c == options.quote[0] && (options.escape.empty() || options.escape[0] == options.quote[0])) { - // escaped quote, return to quoted state and store escape position - escape_positions.push_back(position_buffer - start_buffer); - goto in_quotes; - } else if (c == options.delimiter[0]) { - // delimiter, add value - offset = 1; - goto add_value; - } else if (StringUtil::CharacterIsNewline(c)) { - offset = 1; - // FIXME: should this be an assertion? - D_ASSERT(try_add_line || (!try_add_line && column == parse_chunk.ColumnCount() - 1)); - goto add_row; - } else if (position_buffer >= end_buffer) { - // reached end of buffer - offset = 1; - goto final_state; - } else { - error_message = StringUtil::Format( - "Error in file \"%s\" on line %s: quote should be followed by end of value, end of " - "row or another quote. (%s). ", - options.file_path, GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(), - options.ToString()); - return false; - } -} -handle_escape : { - /* state: handle_escape */ - // escape should be followed by a quote or another escape character - position_buffer++; - if (!BufferRemainder()) { - goto final_state; - } - if (position_buffer >= buffer_size && buffer->buffer->IsCSVFileLastBuffer()) { - error_message = StringUtil::Format( - "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path, - GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(), options.ToString()); - return false; - } - if ((*buffer)[position_buffer] != options.quote[0] && (*buffer)[position_buffer] != options.escape[0]) { - error_message = StringUtil::Format( - "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. 
(%s)", options.file_path, - GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(), options.ToString()); - return false; - } - // escape was followed by quote or escape, go back to quoted state - goto in_quotes; -} -final_state : { - /* state: final_stage reached after we finished reading the end_buffer of the csv buffer */ - // reset end buffer - end_buffer = buffer->buffer_end; - if (position_buffer == end_buffer) { - reached_remainder_state = false; - } - if (finished_chunk) { - if (position_buffer >= end_buffer) { - if (position_buffer == end_buffer && StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1]) && - position_buffer < buffer_size) { - // last position is a new line, we still have to go through one more line of this buffer - finished = false; - } else { - finished = true; - } - } - buffer->lines_read += insert_chunk.size(); - return true; - } - // If this is the last buffer, we have to read the last value - if (buffer->buffer->IsCSVFileLastBuffer() || (buffer->next_buffer && buffer->next_buffer->IsCSVFileLastBuffer())) { - if (column > 0 || start_buffer != position_buffer || try_add_line || - (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) { - // remaining values to be added to the chunk - auto str_value = buffer->GetValue(start_buffer, position_buffer, offset); - if (!AllNewLine(str_value, insert_chunk.data.size()) || offset == 0) { - AddValue(str_value, column, escape_positions, has_quotes, buffer->local_batch_index); - if (try_add_line) { - bool success = column == return_types.size(); - if (success) { - auto cur_linenr = linenr; - AddRow(insert_chunk, column, error_message, buffer->local_batch_index); - success = Flush(insert_chunk, buffer->local_batch_index); - linenr = cur_linenr; - } - parse_chunk.Reset(); - reached_remainder_state = false; - return success; - } else { - VerifyLineLength(position_buffer - line_start); - line_start = position_buffer; - AddRow(insert_chunk, column, error_message, buffer->local_batch_index); - verification_positions.end_of_last_line = position_buffer; - } - } - } - } - // flush the parsed chunk and finalize parsing - if (mode == ParserMode::PARSING) { - Flush(insert_chunk, buffer->local_batch_index); - buffer->lines_read += insert_chunk.size(); - } - if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) { - error_message = "Line does not fit in one buffer. 
Increase the buffer size."; - return false; - } - end_buffer = buffer_size; - SkipEmptyLines(); - end_buffer = buffer->buffer_end; - verification_positions.end_of_last_line = position_buffer; - if (position_buffer >= end_buffer) { - if (position_buffer >= end_buffer) { - if (position_buffer == end_buffer && StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1]) && - position_buffer < buffer_size) { - // last position is a new line, we still have to go through one more line of this buffer - finished = false; - } else { - finished = true; - } - } - } - return true; -}; -} - -void ParallelCSVReader::ParseCSV(DataChunk &insert_chunk) { - string error_message; - if (!TryParseCSV(ParserMode::PARSING, insert_chunk, error_message)) { - throw InvalidInputException(error_message); - } -} - -idx_t ParallelCSVReader::GetLineError(idx_t line_error, idx_t buffer_idx, bool stop_at_first) { - while (true) { - if (buffer->line_info->CanItGetLine(file_idx, buffer_idx)) { - auto cur_start = verification_positions.beginning_of_first_line + buffer->buffer->GetCSVGlobalStart(); - return buffer->line_info->GetLine(buffer_idx, line_error, file_idx, cur_start, false, stop_at_first); - } - } -} - -bool ParallelCSVReader::TryParseCSV(ParserMode mode) { - DataChunk dummy_chunk; - string error_message; - return TryParseCSV(mode, dummy_chunk, error_message); -} - -void ParallelCSVReader::ParseCSV(ParserMode mode) { - DataChunk dummy_chunk; - string error_message; - if (!TryParseCSV(mode, dummy_chunk, error_message)) { - throw InvalidInputException(error_message); - } -} - -bool ParallelCSVReader::TryParseCSV(ParserMode parser_mode, DataChunk &insert_chunk, string &error_message) { - mode = parser_mode; - return TryParseSimpleCSV(insert_chunk, error_message); -} - -} // namespace duckdb diff --git a/src/duckdb/src/execution/operator/schema/physical_create_index.cpp b/src/duckdb/src/execution/operator/schema/physical_create_index.cpp deleted file mode 100644 index f1e3b533a..000000000 --- a/src/duckdb/src/execution/operator/schema/physical_create_index.cpp +++ /dev/null @@ -1,193 +0,0 @@ -#include "duckdb/execution/operator/schema/physical_create_index.hpp" - -#include "duckdb/catalog/catalog_entry/duck_table_entry.hpp" -#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp" -#include "duckdb/catalog/catalog_entry/duck_index_entry.hpp" -#include "duckdb/main/client_context.hpp" -#include "duckdb/storage/storage_manager.hpp" -#include "duckdb/main/database_manager.hpp" -#include "duckdb/execution/index/art/art_key.hpp" -#include "duckdb/execution/index/art/node.hpp" -#include "duckdb/execution/index/art/leaf.hpp" - -namespace duckdb { - -PhysicalCreateIndex::PhysicalCreateIndex(LogicalOperator &op, TableCatalogEntry &table_p, - const vector &column_ids, unique_ptr info, - vector> unbound_expressions, - idx_t estimated_cardinality) - : PhysicalOperator(PhysicalOperatorType::CREATE_INDEX, op.types, estimated_cardinality), - table(table_p.Cast()), info(std::move(info)), - unbound_expressions(std::move(unbound_expressions)) { - // convert virtual column ids to storage column ids - for (auto &column_id : column_ids) { - storage_ids.push_back(table.GetColumns().LogicalToPhysical(LogicalIndex(column_id)).index); - } -} - -//===--------------------------------------------------------------------===// -// Sink -//===--------------------------------------------------------------------===// - -class CreateIndexGlobalSinkState : public GlobalSinkState { -public: - //! 
Global index to be added to the table - unique_ptr global_index; -}; - -class CreateIndexLocalSinkState : public LocalSinkState { -public: - explicit CreateIndexLocalSinkState(ClientContext &context) : arena_allocator(Allocator::Get(context)) {}; - - unique_ptr local_index; - ArenaAllocator arena_allocator; - vector keys; - DataChunk key_chunk; - vector key_column_ids; -}; - -unique_ptr PhysicalCreateIndex::GetGlobalSinkState(ClientContext &context) const { - auto state = make_uniq(); - - // create the global index - switch (info->index_type) { - case IndexType::ART: { - auto &storage = table.GetStorage(); - state->global_index = make_uniq(storage_ids, TableIOManager::Get(storage), unbound_expressions, - info->constraint_type, storage.db); - break; - } - default: - throw InternalException("Unimplemented index type"); - } - return (std::move(state)); -} - -unique_ptr PhysicalCreateIndex::GetLocalSinkState(ExecutionContext &context) const { - auto state = make_uniq(context.client); - - // create the local index - switch (info->index_type) { - case IndexType::ART: { - auto &storage = table.GetStorage(); - state->local_index = make_uniq(storage_ids, TableIOManager::Get(storage), unbound_expressions, - info->constraint_type, storage.db); - break; - } - default: - throw InternalException("Unimplemented index type"); - } - state->keys = vector(STANDARD_VECTOR_SIZE); - state->key_chunk.Initialize(Allocator::Get(context.client), state->local_index->logical_types); - - for (idx_t i = 0; i < state->key_chunk.ColumnCount(); i++) { - state->key_column_ids.push_back(i); - } - return std::move(state); -} - -SinkResultType PhysicalCreateIndex::Sink(ExecutionContext &context, DataChunk &chunk, OperatorSinkInput &input) const { - - D_ASSERT(chunk.ColumnCount() >= 2); - auto &lstate = input.local_state.Cast(); - auto &row_identifiers = chunk.data[chunk.ColumnCount() - 1]; - - // generate the keys for the given input - lstate.key_chunk.ReferenceColumns(chunk, lstate.key_column_ids); - lstate.arena_allocator.Reset(); - ART::GenerateKeys(lstate.arena_allocator, lstate.key_chunk, lstate.keys); - - auto &storage = table.GetStorage(); - auto art = make_uniq(lstate.local_index->column_ids, lstate.local_index->table_io_manager, - lstate.local_index->unbound_expressions, lstate.local_index->constraint_type, storage.db); - if (!art->ConstructFromSorted(lstate.key_chunk.size(), lstate.keys, row_identifiers)) { - throw ConstraintException("Data contains duplicates on indexed column(s)"); - } - - // merge into the local ART - if (!lstate.local_index->MergeIndexes(*art)) { - throw ConstraintException("Data contains duplicates on indexed column(s)"); - } - -#ifdef DEBUG - // ensure that all row IDs of this chunk exist in the ART - auto row_ids = FlatVector::GetData(row_identifiers); - for (idx_t i = 0; i < lstate.key_chunk.size(); i++) { - auto leaf_node = - lstate.local_index->Cast().Lookup(*lstate.local_index->Cast().tree, lstate.keys[i], 0); - D_ASSERT(leaf_node.IsSet()); - auto &leaf = Leaf::Get(lstate.local_index->Cast(), leaf_node); - - if (leaf.IsInlined()) { - D_ASSERT(row_ids[i] == leaf.row_ids.inlined); - continue; - } - - D_ASSERT(leaf.row_ids.ptr.IsSet()); - Node leaf_segment = leaf.row_ids.ptr; - auto position = leaf.FindRowId(lstate.local_index->Cast(), leaf_segment, row_ids[i]); - D_ASSERT(position != (uint32_t)DConstants::INVALID_INDEX); - } -#endif - - return SinkResultType::NEED_MORE_INPUT; -} - -void PhysicalCreateIndex::Combine(ExecutionContext &context, GlobalSinkState &gstate_p, - LocalSinkState 
&lstate_p) const { - - auto &gstate = gstate_p.Cast(); - auto &lstate = lstate_p.Cast(); - - // merge the local index into the global index - if (!gstate.global_index->MergeIndexes(*lstate.local_index)) { - throw ConstraintException("Data contains duplicates on indexed column(s)"); - } - - // vacuum excess memory - gstate.global_index->Vacuum(); -} - -SinkFinalizeType PhysicalCreateIndex::Finalize(Pipeline &pipeline, Event &event, ClientContext &context, - GlobalSinkState &gstate_p) const { - - // here, we just set the resulting global index as the newly created index of the table - - auto &state = gstate_p.Cast(); - D_ASSERT(!state.global_index->VerifyAndToString(true).empty()); - - auto &storage = table.GetStorage(); - if (!storage.IsRoot()) { - throw TransactionException("Transaction conflict: cannot add an index to a table that has been altered!"); - } - - auto &schema = table.schema; - auto index_entry = schema.CreateIndex(context, *info, table).get(); - if (!index_entry) { - D_ASSERT(info->on_conflict == OnCreateConflict::IGNORE_ON_CONFLICT); - // index already exists, but error ignored because of IF NOT EXISTS - return SinkFinalizeType::READY; - } - auto &index = index_entry->Cast(); - - index.index = state.global_index.get(); - index.info = storage.info; - for (auto &parsed_expr : info->parsed_expressions) { - index.parsed_expressions.push_back(parsed_expr->Copy()); - } - - // add index to storage - storage.info->indexes.AddIndex(std::move(state.global_index)); - return SinkFinalizeType::READY; -} - -//===--------------------------------------------------------------------===// -// Source -//===--------------------------------------------------------------------===// - -SourceResultType PhysicalCreateIndex::GetData(ExecutionContext &context, DataChunk &chunk, - OperatorSourceInput &input) const { - return SourceResultType::FINISHED; -} - -} // namespace duckdb diff --git a/src/duckdb/src/execution/partitionable_hashtable.cpp b/src/duckdb/src/execution/partitionable_hashtable.cpp deleted file mode 100644 index 6042932ea..000000000 --- a/src/duckdb/src/execution/partitionable_hashtable.cpp +++ /dev/null @@ -1,207 +0,0 @@ -#include "duckdb/execution/partitionable_hashtable.hpp" - -#include "duckdb/common/radix_partitioning.hpp" - -namespace duckdb { - -RadixPartitionInfo::RadixPartitionInfo(const idx_t n_partitions_upper_bound) - : n_partitions(PreviousPowerOfTwo(n_partitions_upper_bound)), - radix_bits(RadixPartitioning::RadixBits(n_partitions)), radix_mask(RadixPartitioning::Mask(radix_bits)), - radix_shift(RadixPartitioning::Shift(radix_bits)) { - - D_ASSERT(radix_bits <= RadixPartitioning::MAX_RADIX_BITS); - D_ASSERT(n_partitions > 0); - D_ASSERT(n_partitions == RadixPartitioning::NumberOfPartitions(radix_bits)); - D_ASSERT(IsPowerOfTwo(n_partitions)); -} - -PartitionableHashTable::PartitionableHashTable(ClientContext &context, Allocator &allocator, - RadixPartitionInfo &partition_info_p, vector group_types_p, - vector payload_types_p, - vector bindings_p) - : context(context), allocator(allocator), group_types(std::move(group_types_p)), - payload_types(std::move(payload_types_p)), bindings(std::move(bindings_p)), is_partitioned(false), - partition_info(partition_info_p), hashes(LogicalType::HASH), hashes_subset(LogicalType::HASH) { - - sel_vectors.resize(partition_info.n_partitions); - sel_vector_sizes.resize(partition_info.n_partitions); - group_subset.Initialize(allocator, group_types); - if (!payload_types.empty()) { - payload_subset.Initialize(allocator, payload_types); - } - 
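-	// one selection vector per radix partition, pre-allocated here so AddChunk can slice incoming chunks into per-partition subsets without reallocating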
- for (hash_t r = 0; r < partition_info.n_partitions; r++) { - sel_vectors[r].Initialize(); - } - - RowLayout layout; - layout.Initialize(group_types, AggregateObject::CreateAggregateObjects(bindings)); - tuple_size = layout.GetRowWidth(); -} - -HtEntryType PartitionableHashTable::GetHTEntrySize() { - // we need at least STANDARD_VECTOR_SIZE entries to fit in the hash table - if (GroupedAggregateHashTable::GetMaxCapacity(HtEntryType::HT_WIDTH_32, tuple_size) < STANDARD_VECTOR_SIZE) { - return HtEntryType::HT_WIDTH_64; - } - return HtEntryType::HT_WIDTH_32; -} - -bool OverMemoryLimit(ClientContext &context, const bool is_partitioned, const RadixPartitionInfo &partition_info, - const GroupedAggregateHashTable &ht) { - const auto n_partitions = is_partitioned ? partition_info.n_partitions : 1; - const auto max_memory = BufferManager::GetBufferManager(context).GetMaxMemory(); - const auto num_threads = TaskScheduler::GetScheduler(context).NumberOfThreads(); - const auto memory_per_partition = 0.6 * max_memory / num_threads / n_partitions; - return ht.TotalSize() > memory_per_partition; -} - -idx_t PartitionableHashTable::ListAddChunk(HashTableList &list, DataChunk &groups, Vector &group_hashes, - DataChunk &payload, const unsafe_vector &filter) { - // If this is false, a single AddChunk would overflow the max capacity - D_ASSERT(list.empty() || groups.size() <= list.back()->MaxCapacity()); - if (list.empty() || list.back()->Count() + groups.size() >= list.back()->MaxCapacity() || - OverMemoryLimit(context, is_partitioned, partition_info, *list.back())) { - idx_t new_capacity = GroupedAggregateHashTable::InitialCapacity(); - if (!list.empty()) { - new_capacity = list.back()->Capacity(); - // early release first part of ht and prevent adding of more data - list.back()->Finalize(); - } - list.push_back(make_uniq(context, allocator, group_types, payload_types, bindings, - GetHTEntrySize(), new_capacity)); - } - return list.back()->AddChunk(append_state, groups, group_hashes, payload, filter); -} - -idx_t PartitionableHashTable::AddChunk(DataChunk &groups, DataChunk &payload, bool do_partition, - const unsafe_vector &filter) { - groups.Hash(hashes); - - // we partition when we are asked to or when the unpartitioned ht runs out of space - if (!IsPartitioned() && do_partition) { - Partition(false); - } - - if (!IsPartitioned()) { - return ListAddChunk(unpartitioned_hts, groups, hashes, payload, filter); - } - - // makes no sense to do this with 1 partition - D_ASSERT(partition_info.n_partitions > 0); - - for (hash_t r = 0; r < partition_info.n_partitions; r++) { - sel_vector_sizes[r] = 0; - } - - hashes.Flatten(groups.size()); - auto hashes_ptr = FlatVector::GetData(hashes); - - // Determine for every partition how much data will be sinked into it - for (idx_t i = 0; i < groups.size(); i++) { - auto partition = partition_info.GetHashPartition(hashes_ptr[i]); - D_ASSERT(partition < partition_info.n_partitions); - sel_vectors[partition].set_index(sel_vector_sizes[partition]++, i); - } - -#ifdef DEBUG - // make sure we have lost no rows - idx_t total_count = 0; - for (idx_t r = 0; r < partition_info.n_partitions; r++) { - total_count += sel_vector_sizes[r]; - } - D_ASSERT(total_count == groups.size()); -#endif - idx_t group_count = 0; - for (hash_t r = 0; r < partition_info.n_partitions; r++) { - group_subset.Slice(groups, sel_vectors[r], sel_vector_sizes[r]); - if (!payload_types.empty()) { - payload_subset.Slice(payload, sel_vectors[r], sel_vector_sizes[r]); - } else { - 
payload_subset.SetCardinality(sel_vector_sizes[r]); - } - hashes_subset.Slice(hashes, sel_vectors[r], sel_vector_sizes[r]); - - group_count += ListAddChunk(radix_partitioned_hts[r], group_subset, hashes_subset, payload_subset, filter); - } - return group_count; -} - -void PartitionableHashTable::Partition(bool sink_done) { - D_ASSERT(!IsPartitioned()); - D_ASSERT(radix_partitioned_hts.empty()); - D_ASSERT(partition_info.n_partitions > 1); - - vector partition_hts(partition_info.n_partitions); - radix_partitioned_hts.resize(partition_info.n_partitions); - for (auto &unpartitioned_ht : unpartitioned_hts) { - for (idx_t r = 0; r < partition_info.n_partitions; r++) { - radix_partitioned_hts[r].push_back(make_uniq( - context, allocator, group_types, payload_types, bindings, GetHTEntrySize())); - partition_hts[r] = radix_partitioned_hts[r].back().get(); - } - unpartitioned_ht->Partition(partition_hts, partition_info.radix_bits, sink_done); - unpartitioned_ht.reset(); - } - unpartitioned_hts.clear(); - is_partitioned = true; -} - -bool PartitionableHashTable::IsPartitioned() { - return is_partitioned; -} - -HashTableList PartitionableHashTable::GetPartition(idx_t partition) { - D_ASSERT(IsPartitioned()); - D_ASSERT(partition < partition_info.n_partitions); - D_ASSERT(radix_partitioned_hts.size() > partition); - return std::move(radix_partitioned_hts[partition]); -} - -HashTableList PartitionableHashTable::GetUnpartitioned() { - D_ASSERT(!IsPartitioned()); - return std::move(unpartitioned_hts); -} - -idx_t PartitionableHashTable::GetPartitionCount(idx_t partition) const { - idx_t total_size = 0; - for (const auto &ht : radix_partitioned_hts[partition]) { - total_size += ht->Count(); - } - return total_size; -} - -idx_t PartitionableHashTable::GetPartitionSize(idx_t partition) const { - idx_t total_size = 0; - for (const auto &ht : radix_partitioned_hts[partition]) { - total_size += ht->DataSize(); - } - return total_size; -} - -void PartitionableHashTable::Finalize() { - if (IsPartitioned()) { - for (auto &ht_list : radix_partitioned_hts) { - for (auto &ht : ht_list) { - D_ASSERT(ht); - ht->Finalize(); - } - } - } else { - for (auto &ht : unpartitioned_hts) { - D_ASSERT(ht); - ht->Finalize(); - } - } -} - -void PartitionableHashTable::Append(GroupedAggregateHashTable &ht) { - if (unpartitioned_hts.empty()) { - unpartitioned_hts.push_back(make_uniq(context, allocator, group_types, payload_types, - bindings, GetHTEntrySize(), - GroupedAggregateHashTable::InitialCapacity())); - } - unpartitioned_hts.back()->Append(ht); -} - -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/common/arrow/arrow_options.hpp b/src/duckdb/src/include/duckdb/common/arrow/arrow_options.hpp deleted file mode 100644 index 9e9fee03a..000000000 --- a/src/duckdb/src/include/duckdb/common/arrow/arrow_options.hpp +++ /dev/null @@ -1,25 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/common/arrow/arrow_options.hpp -// -// -//===----------------------------------------------------------------------===// - -#pragma once - -namespace duckdb { - -enum ArrowOffsetSize { REGULAR, LARGE }; - -struct ArrowOptions { - explicit ArrowOptions(ArrowOffsetSize offset_size_p) : offset_size(offset_size_p) { - } - ArrowOptions(ArrowOffsetSize offset_size_p, string timezone_p) : offset_size(offset_size_p), time_zone(timezone_p) { - } - ArrowOptions() { - } - ArrowOffsetSize offset_size = ArrowOffsetSize::REGULAR; - string time_zone = "UTC"; -}; -} // namespace duckdb 
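Aside: the RadixPartitionInfo / PartitionableHashTable code removed above routes every group hash to one of a power-of-two number of partitions using a bit mask and shift. The following is a minimal standalone sketch of that routing idea, with generic constants and names rather than DuckDB's actual RadixPartitioning helpers:

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

using hash_t = uint64_t;

static uint64_t PreviousPowerOfTwo(uint64_t n) {
	uint64_t result = 1;
	while (result * 2 <= n) {
		result *= 2;
	}
	return result;
}

// Sketch of the bits/mask/shift bookkeeping: n_partitions is rounded down to a power of two,
// and GetHashPartition extracts radix_bits bits of the hash, starting at radix_shift.
struct RadixPartitionInfoSketch {
	explicit RadixPartitionInfoSketch(uint64_t n_partitions_upper_bound, uint64_t radix_shift_p = 0)
	    : n_partitions(PreviousPowerOfTwo(n_partitions_upper_bound)), radix_shift(radix_shift_p) {
		radix_bits = 0;
		while ((uint64_t(1) << radix_bits) < n_partitions) {
			radix_bits++;
		}
		radix_mask = (n_partitions - 1) << radix_shift;
		assert(n_partitions > 0 && (n_partitions & (n_partitions - 1)) == 0); // power of two
	}
	uint64_t GetHashPartition(hash_t hash) const {
		return (hash & radix_mask) >> radix_shift;
	}
	uint64_t n_partitions;
	uint64_t radix_bits;
	uint64_t radix_mask;
	uint64_t radix_shift;
};

int main() {
	RadixPartitionInfoSketch info(6); // an upper bound of 6 rounds down to 4 partitions (2 radix bits)
	assert(info.n_partitions == 4 && info.radix_bits == 2);
	std::vector<uint64_t> counts(info.n_partitions, 0);
	for (hash_t i = 0; i < 1000; i++) {
		counts[info.GetHashPartition(i * 0x9E3779B97F4A7C15ULL)]++; // multiplicative hash to spread values
	}
	for (uint64_t p = 0; p < info.n_partitions; p++) {
		std::cout << "partition " << p << ": " << counts[p] << " groups\n";
	}
	return 0;
}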
diff --git a/src/duckdb/src/include/duckdb/execution/index/art/leaf_segment.hpp b/src/duckdb/src/include/duckdb/execution/index/art/leaf_segment.hpp deleted file mode 100644 index 29d146f43..000000000 --- a/src/duckdb/src/include/duckdb/execution/index/art/leaf_segment.hpp +++ /dev/null @@ -1,38 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/execution/index/art/leaf_segment.hpp -// -// -//===----------------------------------------------------------------------===// -#pragma once - -#include "duckdb/execution/index/art/art.hpp" -#include "duckdb/execution/index/art/node.hpp" - -namespace duckdb { - -class LeafSegment { -public: - //! The row IDs stored in this segment - row_t row_ids[Node::LEAF_SEGMENT_SIZE]; - //! The pointer of the next segment, if the row IDs exceeds this segment - Node next; - -public: - //! Get a new leaf segment node, might cause a new buffer allocation, and initialize it - static LeafSegment &New(ART &art, Node &node); - //! Get a reference to the leaf segment - static inline LeafSegment &Get(const ART &art, const Node ptr) { - return *Node::GetAllocator(art, NType::LEAF_SEGMENT).Get(ptr); - } - //! Free the leaf segment and any subsequent ones - static void Free(ART &art, Node &node); - - //! Append a row ID to the current segment, or create a new segment containing that row ID - LeafSegment &Append(ART &art, uint32_t &count, const row_t row_id); - //! Get the tail of a list of segments - LeafSegment &GetTail(const ART &art); -}; - -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/execution/index/art/prefix_segment.hpp b/src/duckdb/src/include/duckdb/execution/index/art/prefix_segment.hpp deleted file mode 100644 index d41476abd..000000000 --- a/src/duckdb/src/include/duckdb/execution/index/art/prefix_segment.hpp +++ /dev/null @@ -1,40 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/execution/index/art/prefix_segment.hpp -// -// -//===----------------------------------------------------------------------===// -#pragma once - -#include "duckdb/execution/index/art/art.hpp" -#include "duckdb/execution/index/art/node.hpp" - -namespace duckdb { - -class PrefixSegment { -public: - //! Constructor of an empty prefix segment containing bytes. - //! NOTE: only use this constructor for temporary prefix segments - PrefixSegment() {}; - - //! The prefix bytes stored in this segment - uint8_t bytes[Node::PREFIX_SEGMENT_SIZE]; - //! The position of the next segment, if the prefix exceeds this segment - Node next; - -public: - //! Get a new prefix segment node, might cause a new buffer allocation, and initialize it - static PrefixSegment &New(ART &art, Node &node); - //! Get a reference to the prefix segment - static inline PrefixSegment &Get(const ART &art, const Node ptr) { - return *Node::GetAllocator(art, NType::PREFIX_SEGMENT).Get(ptr); - } - - //! Append a byte to the current segment, or create a new segment containing that byte - PrefixSegment &Append(ART &art, uint32_t &count, const uint8_t byte); - //! 
Get the tail of a list of segments - PrefixSegment &GetTail(const ART &art); -}; - -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/execution/index/art/swizzleable_pointer.hpp b/src/duckdb/src/include/duckdb/execution/index/art/swizzleable_pointer.hpp deleted file mode 100644 index 66297265d..000000000 --- a/src/duckdb/src/include/duckdb/execution/index/art/swizzleable_pointer.hpp +++ /dev/null @@ -1,58 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/execution/index/art/swizzleable_pointer.hpp -// -// -//===----------------------------------------------------------------------===// -#pragma once - -#include "duckdb/common/constants.hpp" - -namespace duckdb { - -// classes -class MetaBlockReader; - -// structs -struct BlockPointer; - -//! SwizzleablePointer provides functions on a (possibly) swizzled pointer. If the swizzle flag is set, then the -//! pointer points to a storage address (and has no type), otherwise the pointer has a type and stores -//! other information (e.g., a buffer location) -class SwizzleablePointer { -public: - //! Constructs an empty SwizzleablePointer - SwizzleablePointer() : swizzle_flag(0), type(0), offset(0), buffer_id(0) {}; - //! Constructs a swizzled pointer from a buffer ID and an offset - explicit SwizzleablePointer(MetaBlockReader &reader); - //! Constructs a non-swizzled pointer from a buffer ID and an offset - SwizzleablePointer(uint32_t offset, uint32_t buffer_id) - : swizzle_flag(0), type(0), offset(offset), buffer_id(buffer_id) {}; - - //! The swizzle flag, set if swizzled, not set otherwise - uint8_t swizzle_flag : 1; - //! The type of the pointer, zero if not set - uint8_t type : 7; - //! The offset of a memory location - uint32_t offset : 24; - //! The buffer ID of a memory location - uint32_t buffer_id : 32; - -public: - //! Checks if the pointer is swizzled - inline bool IsSwizzled() const { - return swizzle_flag; - } - //! Returns true, if neither the swizzle flag nor the type is set, and false otherwise - inline bool IsSet() const { - return swizzle_flag || type; - } - //! 
Reset the pointer - inline void Reset() { - swizzle_flag = 0; - type = 0; - } -}; - -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/execution/operator/persistent/base_csv_reader.hpp b/src/duckdb/src/include/duckdb/execution/operator/persistent/base_csv_reader.hpp deleted file mode 100644 index 57e7478a1..000000000 --- a/src/duckdb/src/include/duckdb/execution/operator/persistent/base_csv_reader.hpp +++ /dev/null @@ -1,119 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/execution/operator/persistent/base_csv_reader.hpp -// -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "duckdb/execution/physical_operator.hpp" -#include "duckdb/parser/parsed_data/copy_info.hpp" -#include "duckdb/function/scalar/strftime_format.hpp" -#include "duckdb/common/types/chunk_collection.hpp" -#include "duckdb/common/enums/file_compression_type.hpp" -#include "duckdb/common/map.hpp" -#include "duckdb/common/queue.hpp" -#include "duckdb/execution/operator/persistent/csv_reader_options.hpp" -#include "duckdb/common/multi_file_reader.hpp" -#include "duckdb/execution/operator/persistent/csv_line_info.hpp" - -#include - -namespace duckdb { -struct CopyInfo; -struct CSVFileHandle; -struct FileHandle; -struct StrpTimeFormat; - -class FileOpener; -class FileSystem; - -enum class ParserMode : uint8_t { PARSING = 0, SNIFFING_DIALECT = 1, SNIFFING_DATATYPES = 2, PARSING_HEADER = 3 }; - -//! Buffered CSV reader is a class that reads values from a stream and parses them as a CSV file -class BaseCSVReader { -public: - BaseCSVReader(ClientContext &context, BufferedCSVReaderOptions options, - const vector &requested_types = vector()); - virtual ~BaseCSVReader(); - - ClientContext &context; - FileSystem &fs; - Allocator &allocator; - BufferedCSVReaderOptions options; - vector return_types; - vector names; - MultiFileReaderData reader_data; - - idx_t linenr = 0; - bool linenr_estimated = false; - - bool row_empty = false; - idx_t sample_chunk_idx = 0; - bool jumping_samples = false; - bool end_of_file_reached = false; - bool bom_checked = false; - - idx_t bytes_in_chunk = 0; - double bytes_per_line_avg = 0; - - DataChunk parse_chunk; - - ParserMode mode; - -public: - const string &GetFileName() { - return options.file_path; - } - const vector &GetNames() { - return names; - } - const vector &GetTypes() { - return return_types; - } - - //! Get the 1-indexed global line number for the given local error line - virtual idx_t GetLineError(idx_t line_error, idx_t buffer_idx, bool stop_at_first = true) { - return line_error + 1; - }; - - //! Initialize projection indices to select all columns - void InitializeProjection(); - -protected: - //! Initializes the parse_chunk with varchar columns and aligns info with new number of cols - void InitParseChunk(idx_t num_cols); - //! Change the date format for the type to the string - void SetDateFormat(const string &format_specifier, const LogicalTypeId &sql_type); - //! Try to cast a string value to the specified sql type - bool TryCastValue(const Value &value, const LogicalType &sql_type); - //! Try to cast a vector of values to the specified sql type - bool TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type); - - //! Adds a value to the current row - void AddValue(string_t str_val, idx_t &column, vector &escape_positions, bool has_quotes, - idx_t buffer_idx = 0); - //! 
Adds a row to the insert_chunk, returns true if the chunk is filled as a result of this row being added - bool AddRow(DataChunk &insert_chunk, idx_t &column, string &error_message, idx_t buffer_idx = 0); - //! Finalizes a chunk, parsing all values that have been added so far and adding them to the insert_chunk - bool Flush(DataChunk &insert_chunk, idx_t buffer_idx = 0, bool try_add_line = false); - - unique_ptr OpenCSV(const BufferedCSVReaderOptions &options); - - void VerifyUTF8(idx_t col_idx); - void VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset = 0); - string GetLineNumberStr(idx_t linenr, bool linenr_estimated, idx_t buffer_idx = 0); - - //! Sets the newline delimiter - void SetNewLineDelimiter(bool carry = false, bool carry_followed_by_nl = false); - -protected: - //! Whether or not the current row's columns have overflown return_types.size() - bool error_column_overflow = false; - //! Number of sniffed columns - only used when auto-detecting - vector sniffed_column_counts; -}; - -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp b/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp deleted file mode 100644 index b2a9ba00d..000000000 --- a/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +++ /dev/null @@ -1,133 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/execution/operator/persistent/base_csv_reader.hpp -// -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "duckdb/execution/operator/persistent/base_csv_reader.hpp" - -namespace duckdb { -struct CopyInfo; -struct CSVFileHandle; -struct FileHandle; -struct StrpTimeFormat; - -class FileOpener; -class FileSystem; - -//! The shifts array allows for linear searching of multi-byte values. For each position, it determines the next -//! position given that we encounter a byte with the given value. -/*! For example, if we have a string "ABAC", the shifts array will have the following values: - * [0] --> ['A'] = 1, all others = 0 - * [1] --> ['B'] = 2, ['A'] = 1, all others = 0 - * [2] --> ['A'] = 3, all others = 0 - * [3] --> ['C'] = 4 (match), 'B' = 2, 'A' = 1, all others = 0 - * Suppose we then search in the following string "ABABAC", our progression will be as follows: - * 'A' -> [1], 'B' -> [2], 'A' -> [3], 'B' -> [2], 'A' -> [3], 'C' -> [4] (match!) - */ -struct TextSearchShiftArray { - TextSearchShiftArray(); - explicit TextSearchShiftArray(string search_term); - - inline bool Match(uint8_t &position, uint8_t byte_value) { - if (position >= length) { - return false; - } - position = shifts[position * 255 + byte_value]; - return position == length; - } - - idx_t length; - unique_ptr shifts; -}; - -//! Buffered CSV reader is a class that reads values from a stream and parses them as a CSV file -class BufferedCSVReader : public BaseCSVReader { - //! Initial buffer read size; can be extended for long lines - static constexpr idx_t INITIAL_BUFFER_SIZE = 16384; - //! 
Larger buffer size for non disk files - static constexpr idx_t INITIAL_BUFFER_SIZE_LARGE = 10000000; // 10MB - -public: - BufferedCSVReader(ClientContext &context, BufferedCSVReaderOptions options, - const vector &requested_types = vector()); - BufferedCSVReader(ClientContext &context, string filename, BufferedCSVReaderOptions options, - const vector &requested_types = vector()); - virtual ~BufferedCSVReader() { - } - - unsafe_unique_array buffer; - idx_t buffer_size; - idx_t position; - idx_t start = 0; - - vector> cached_buffers; - - unique_ptr file_handle; - - TextSearchShiftArray delimiter_search, escape_search, quote_search; - -public: - //! Extract a single DataChunk from the CSV file and stores it in insert_chunk - void ParseCSV(DataChunk &insert_chunk); - static string ColumnTypesError(case_insensitive_map_t sql_types_per_column, const vector &names); - -private: - //! Initialize Parser - void Initialize(const vector &requested_types); - //! Skips skip_rows, reads header row from input stream - void SkipRowsAndReadHeader(idx_t skip_rows, bool skip_header); - //! Jumps back to the beginning of input stream and resets necessary internal states - void JumpToBeginning(idx_t skip_rows, bool skip_header); - //! Resets the buffer - void ResetBuffer(); - //! Resets the steam - void ResetStream(); - //! Reads a new buffer from the CSV file if the current one has been exhausted - bool ReadBuffer(idx_t &start, idx_t &line_start); - //! Jumps back to the beginning of input stream and resets necessary internal states - bool JumpToNextSample(); - //! Initializes the TextSearchShiftArrays for complex parser - void PrepareComplexParser(); - //! Try to parse a single datachunk from the file. Throws an exception if anything goes wrong. - void ParseCSV(ParserMode mode); - //! Try to parse a single datachunk from the file. Returns whether or not the parsing is successful - bool TryParseCSV(ParserMode mode); - //! Extract a single DataChunk from the CSV file and stores it in insert_chunk - bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message); - - //! Parses a CSV file with a one-byte delimiter, escape and quote character - bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message); - //! Parses more complex CSV files with multi-byte delimiters, escapes or quotes - bool TryParseComplexCSV(DataChunk &insert_chunk, string &error_message); - //! Sniffs CSV dialect and determines skip rows, header row, column types and column names - vector SniffCSV(const vector &requested_types); - - //! First phase of auto detection: detect CSV dialect (i.e. delimiter, quote rules, etc) - void DetectDialect(const vector &requested_types, BufferedCSVReaderOptions &original_options, - vector &info_candidates, idx_t &best_num_cols); - //! Second phase of auto detection: detect candidate types for each column - void DetectCandidateTypes(const vector &type_candidates, - const map> &format_template_candidates, - const vector &info_candidates, - BufferedCSVReaderOptions &original_options, idx_t best_num_cols, - vector> &best_sql_types_candidates, - std::map> &best_format_candidates, - DataChunk &best_header_row); - //! Third phase of auto detection: detect header of CSV file - void DetectHeader(const vector> &best_sql_types_candidates, const DataChunk &best_header_row); - //! 
Fourth phase of auto detection: refine the types of each column and select which types to use for each column - vector RefineTypeDetection(const vector &type_candidates, - const vector &requested_types, - vector> &best_sql_types_candidates, - map> &best_format_candidates); - - //! Skip Empty lines for tables with over one column - void SkipEmptyLines(); -}; - -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp b/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp deleted file mode 100644 index 532b4c629..000000000 --- a/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +++ /dev/null @@ -1,74 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/execution/operator/persistent/csv_buffer.hpp -// -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "duckdb/common/constants.hpp" -#include "duckdb/execution/operator/persistent/csv_file_handle.hpp" -#include "duckdb/storage/buffer_manager.hpp" - -namespace duckdb { - -class CSVBuffer { -public: - //! Colossal buffer size for multi-threading - static constexpr idx_t INITIAL_BUFFER_SIZE_COLOSSAL = 32000000; // 32MB - - //! Constructor for Initial Buffer - CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle, - idx_t &global_csv_current_position, idx_t file_number); - - //! Constructor for `Next()` Buffers - CSVBuffer(ClientContext &context, BufferHandle handle, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer, - idx_t global_csv_current_position, idx_t file_number); - - //! Creates a new buffer with the next part of the CSV File - unique_ptr Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t &global_csv_current_position, - idx_t file_number); - - //! Gets the buffer actual size - idx_t GetBufferSize(); - - //! Gets the start position of the buffer, only relevant for the first time it's scanned - idx_t GetStart(); - - //! If this buffer is the last buffer of the CSV File - bool IsCSVFileLastBuffer(); - - //! If this buffer is the first buffer of the CSV File - bool IsCSVFileFirstBuffer(); - - idx_t GetCSVGlobalStart(); - - idx_t GetFileNumber(); - - BufferHandle AllocateBuffer(idx_t buffer_size); - - char *Ptr() { - return char_ptr_cast(handle.Ptr()); - } - -private: - ClientContext &context; - - BufferHandle handle; - //! Actual size can be smaller than the buffer size in case we allocate it too optimistically. - idx_t actual_size; - //! We need to check for Byte Order Mark, to define the start position of this buffer - //! https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8 - idx_t start_position = 0; - //! If this is the last buffer of the CSV File - bool last_buffer = false; - //! If this is the first buffer of the CSV File - bool first_buffer = false; - //! Global position from the CSV File where this buffer starts - idx_t global_csv_start = 0; - //! 
Number of the file that is in this buffer - idx_t file_number = 0; -}; -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_file_handle.hpp b/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_file_handle.hpp deleted file mode 100644 index d24f491be..000000000 --- a/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_file_handle.hpp +++ /dev/null @@ -1,66 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/execution/operator/persistent/csv_file_handle.hpp -// -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "duckdb/common/file_system.hpp" -#include "duckdb/common/mutex.hpp" -#include "duckdb/common/helper.hpp" -#include "duckdb/common/allocator.hpp" - -namespace duckdb { -class Allocator; -class FileSystem; - -struct CSVFileHandle { -public: - CSVFileHandle(FileSystem &fs, Allocator &allocator, unique_ptr file_handle_p, const string &path_p, - FileCompressionType compression, bool enable_reset = true); - - mutex main_mutex; - -public: - bool CanSeek(); - void Seek(idx_t position); - idx_t SeekPosition(); - void Reset(); - bool OnDiskFile(); - - idx_t FileSize(); - - bool FinishedReading(); - - idx_t Read(void *buffer, idx_t nr_bytes); - - string ReadLine(); - void DisableReset(); - - static unique_ptr OpenFileHandle(FileSystem &fs, Allocator &allocator, const string &path, - FileCompressionType compression); - static unique_ptr OpenFile(FileSystem &fs, Allocator &allocator, const string &path, - FileCompressionType compression, bool enable_reset); - -private: - FileSystem &fs; - Allocator &allocator; - unique_ptr file_handle; - string path; - FileCompressionType compression; - bool reset_enabled = true; - bool can_seek = false; - bool on_disk_file = false; - idx_t file_size = 0; - // reset support - AllocatedData cached_buffer; - idx_t read_position = 0; - idx_t buffer_size = 0; - idx_t buffer_capacity = 0; - idx_t requested_bytes = 0; -}; - -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_line_info.hpp b/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_line_info.hpp deleted file mode 100644 index 7b4805bcf..000000000 --- a/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_line_info.hpp +++ /dev/null @@ -1,42 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/execution/operator/persistent/csv_line_info.hpp -// -// -//===----------------------------------------------------------------------===// - -#pragma once - -namespace duckdb { -struct LineInfo { -public: - explicit LineInfo(mutex &main_mutex_p, vector> &batch_to_tuple_end_p, - vector> &tuple_start_p, vector> &tuple_end_p) - : main_mutex(main_mutex_p), batch_to_tuple_end(batch_to_tuple_end_p), tuple_start(tuple_start_p), - tuple_end(tuple_end_p) {}; - bool CanItGetLine(idx_t file_idx, idx_t batch_idx); - - //! Return the 1-indexed line number - idx_t GetLine(idx_t batch_idx, idx_t line_error = 0, idx_t file_idx = 0, idx_t cur_start = 0, bool verify = true, - bool stop_at_first = true); - //! Verify if the CSV File was read correctly from [0,batch_idx] batches. - void Verify(idx_t file_idx, idx_t batch_idx, idx_t cur_first_pos); - //! Lines read per batch, > - vector> lines_read; - //! Set of batches that have been initialized but are not yet finished. - vector> current_batches; - //! 
Pointer to CSV Reader Mutex - mutex &main_mutex; - //! Pointer Batch to Tuple End - vector> &batch_to_tuple_end; - //! Pointer Batch to Tuple Start - vector> &tuple_start; - //! Pointer Batch to Tuple End - vector> &tuple_end; - //! If we already threw an exception on a previous thread. - bool done = false; - idx_t first_line = 0; -}; - -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp b/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp deleted file mode 100644 index e5f56809c..000000000 --- a/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +++ /dev/null @@ -1,173 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/execution/operator/persistent/csv_reader_options.hpp -// -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "duckdb/execution/operator/persistent/csv_buffer.hpp" -#include "duckdb/common/map.hpp" -#include "duckdb/function/scalar/strftime_format.hpp" -#include "duckdb/common/types/value.hpp" -#include "duckdb/common/field_writer.hpp" -#include "duckdb/common/case_insensitive_map.hpp" -#include "duckdb/common/types.hpp" -#include "duckdb/common/multi_file_reader_options.hpp" - -namespace duckdb { - -enum class NewLineIdentifier : uint8_t { - SINGLE = 1, // Either \r or \n - CARRY_ON = 2, // \r\n - MIX = 3, // Hippie-Land, can't run it multithreaded - NOT_SET = 4 -}; - -enum class ParallelMode { AUTOMATIC = 0, PARALLEL = 1, SINGLE_THREADED = 2 }; - -struct BufferedCSVReaderOptions { - //===--------------------------------------------------------------------===// - // CommonCSVOptions - //===--------------------------------------------------------------------===// - - //! Whether or not a delimiter was defined by the user - bool has_delimiter = false; - //! Delimiter to separate columns within each line - string delimiter = ","; - //! Whether or not a new_line was defined by the user - bool has_newline = false; - //! New Line separator - NewLineIdentifier new_line = NewLineIdentifier::NOT_SET; - //! Whether or not a quote was defined by the user - bool has_quote = false; - //! Quote used for columns that contain reserved characters, e.g., delimiter - string quote = "\""; - //! Whether or not an escape character was defined by the user - bool has_escape = false; - //! Escape character to escape quote character - string escape; - //! Whether or not a header information was given by the user - bool has_header = false; - //! Whether or not the file has a header line - bool header = false; - //! Whether or not we should ignore InvalidInput errors - bool ignore_errors = false; - //! Rejects table name - string rejects_table_name; - //! Rejects table entry limit (0 = no limit) - idx_t rejects_limit = 0; - //! Columns to use as recovery key for rejected rows when reading with ignore_errors = true - vector rejects_recovery_columns; - //! Index of the recovery columns - vector rejects_recovery_column_ids; - //! Expected number of columns - idx_t num_cols = 0; - //! Number of samples to buffer - idx_t buffer_sample_size = STANDARD_VECTOR_SIZE * 50; - //! Specifies the string that represents a null value - string null_str; - //! Whether file is compressed or not, and if so which compression type - //! AUTO_DETECT (default; infer from file extension) - FileCompressionType compression = FileCompressionType::AUTO_DETECT; - //! 
Option to convert quoted values to NULL values - bool allow_quoted_nulls = true; - - //===--------------------------------------------------------------------===// - // CSVAutoOptions - //===--------------------------------------------------------------------===// - //! SQL Type list mapping of name to SQL type index in sql_type_list - case_insensitive_map_t sql_types_per_column; - //! User-defined SQL type list - vector sql_type_list; - //! User-defined name list - vector name_list; - //! Types considered as candidates for auto detection ordered by descending specificity (~ from high to low) - vector auto_type_candidates = {LogicalType::VARCHAR, LogicalType::TIMESTAMP, LogicalType::DATE, - LogicalType::TIME, LogicalType::DOUBLE, LogicalType::BIGINT, - LogicalType::BOOLEAN, LogicalType::SQLNULL}; - - //===--------------------------------------------------------------------===// - // ReadCSVOptions - //===--------------------------------------------------------------------===// - - //! How many leading rows to skip - idx_t skip_rows = 0; - //! Whether or not the skip_rows is set by the user - bool skip_rows_set = false; - //! Maximum CSV line size: specified because if we reach this amount, we likely have wrong delimiters (default: 2MB) - //! note that this is the guaranteed line length that will succeed, longer lines may be accepted if slightly above - idx_t maximum_line_size = 2097152; - //! Whether or not header names shall be normalized - bool normalize_names = false; - //! True, if column with that index must skip null check - vector force_not_null; - //! Consider all columns to be of type varchar - bool all_varchar = false; - //! Size of sample chunk used for dialect and type detection - idx_t sample_chunk_size = STANDARD_VECTOR_SIZE; - //! Number of sample chunks used for type detection - idx_t sample_chunks = 10; - //! Whether or not to automatically detect dialect and datatypes - bool auto_detect = false; - //! The file path of the CSV file to read - string file_path; - //! Multi-file reader options - MultiFileReaderOptions file_options; - //! Buffer Size (Parallel Scan) - idx_t buffer_size = CSVBuffer::INITIAL_BUFFER_SIZE_COLOSSAL; - //! Decimal separator when reading as numeric - string decimal_separator = "."; - //! Whether or not to pad rows that do not have enough columns with NULL values - bool null_padding = false; - - //! If we are running the parallel version of the CSV Reader. In general, the system should always auto-detect - //! When it can't execute a parallel run before execution. However, there are (rather specific) situations where - //! setting up this manually might be important - ParallelMode parallel_mode; - //===--------------------------------------------------------------------===// - // WriteCSVOptions - //===--------------------------------------------------------------------===// - //! True, if column with that index must be quoted - vector force_quote; - //! Prefix/suffix/custom newline the entire file once (enables writing of files as JSON arrays) - string prefix; - string suffix; - string write_newline; - - //! The date format to use (if any is specified) - std::map date_format = {{LogicalTypeId::DATE, {}}, {LogicalTypeId::TIMESTAMP, {}}}; - //! The date format to use for writing (if any is specified) - std::map write_date_format = {{LogicalTypeId::DATE, {}}, - {LogicalTypeId::TIMESTAMP, {}}}; - //! 
Whether or not a type format is specified - std::map has_format = {{LogicalTypeId::DATE, false}, {LogicalTypeId::TIMESTAMP, false}}; - - void Serialize(FieldWriter &writer) const; - void Deserialize(FieldReader &reader); - void FormatSerialize(FormatSerializer &serializer) const; - static BufferedCSVReaderOptions FormatDeserialize(FormatDeserializer &deserializer); - - void SetCompression(const string &compression); - void SetHeader(bool has_header); - void SetEscape(const string &escape); - void SetQuote(const string "e); - void SetDelimiter(const string &delimiter); - - void SetNewline(const string &input); - //! Set an option that is supported by both reading and writing functions, called by - //! the SetReadOption and SetWriteOption methods - bool SetBaseOption(const string &loption, const Value &value); - - //! loption - lowercase string - //! set - argument(s) to the option - //! expected_names - names expected if the option is "columns" - void SetReadOption(const string &loption, const Value &value, vector &expected_names); - void SetWriteOption(const string &loption, const Value &value); - void SetDateFormat(LogicalTypeId type, const string &format, bool read_format); - - std::string ToString() const; -}; -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp b/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp deleted file mode 100644 index bdcdb8fbe..000000000 --- a/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp +++ /dev/null @@ -1,172 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/execution/operator/persistent/parallel_csv_reader.hpp -// -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "duckdb/execution/operator/persistent/base_csv_reader.hpp" -#include "duckdb/execution/operator/persistent/csv_reader_options.hpp" -#include "duckdb/execution/operator/persistent/csv_file_handle.hpp" -#include "duckdb/execution/operator/persistent/csv_buffer.hpp" -#include "duckdb/execution/operator/persistent/csv_line_info.hpp" - -#include -#include - -namespace duckdb { - -struct CSVBufferRead { - CSVBufferRead(shared_ptr buffer_p, idx_t buffer_start_p, idx_t buffer_end_p, idx_t batch_index, - idx_t local_batch_index_p, optional_ptr line_info_p) - : buffer(std::move(buffer_p)), line_info(line_info_p), buffer_start(buffer_start_p), buffer_end(buffer_end_p), - batch_index(batch_index), local_batch_index(local_batch_index_p) { - if (buffer) { - if (buffer_end > buffer->GetBufferSize()) { - buffer_end = buffer->GetBufferSize(); - } - } else { - buffer_start = 0; - buffer_end = 0; - } - } - - CSVBufferRead(shared_ptr buffer_p, shared_ptr nxt_buffer_p, idx_t buffer_start_p, - idx_t buffer_end_p, idx_t batch_index, idx_t local_batch_index, optional_ptr line_info_p) - : CSVBufferRead(std::move(buffer_p), buffer_start_p, buffer_end_p, batch_index, local_batch_index, - line_info_p) { - next_buffer = std::move(nxt_buffer_p); - } - - CSVBufferRead() : buffer_start(0), buffer_end(NumericLimits::Maximum()) {}; - - const char &operator[](size_t i) const { - if (i < buffer->GetBufferSize()) { - auto buffer_ptr = buffer->Ptr(); - return buffer_ptr[i]; - } - auto next_ptr = next_buffer->Ptr(); - return next_ptr[i - buffer->GetBufferSize()]; - } - - string_t GetValue(idx_t start_buffer, idx_t position_buffer, idx_t offset) { - idx_t length = 
position_buffer - start_buffer - offset; - // 1) It's all in the current buffer - if (start_buffer + length <= buffer->GetBufferSize()) { - auto buffer_ptr = buffer->Ptr(); - return string_t(buffer_ptr + start_buffer, length); - } else if (start_buffer >= buffer->GetBufferSize()) { - // 2) It's all in the next buffer - D_ASSERT(next_buffer); - D_ASSERT(next_buffer->GetBufferSize() >= length + (start_buffer - buffer->GetBufferSize())); - auto buffer_ptr = next_buffer->Ptr(); - return string_t(buffer_ptr + (start_buffer - buffer->GetBufferSize()), length); - } else { - // 3) It starts in the current buffer and ends in the next buffer - D_ASSERT(next_buffer); - auto intersection = make_unsafe_uniq_array(length); - idx_t cur_pos = 0; - auto buffer_ptr = buffer->Ptr(); - for (idx_t i = start_buffer; i < buffer->GetBufferSize(); i++) { - intersection[cur_pos++] = buffer_ptr[i]; - } - idx_t nxt_buffer_pos = 0; - auto next_buffer_ptr = next_buffer->Ptr(); - for (; cur_pos < length; cur_pos++) { - intersection[cur_pos] = next_buffer_ptr[nxt_buffer_pos++]; - } - intersections.emplace_back(std::move(intersection)); - return string_t(intersections.back().get(), length); - } - } - - shared_ptr buffer; - shared_ptr next_buffer; - vector> intersections; - optional_ptr line_info; - - idx_t buffer_start; - idx_t buffer_end; - idx_t batch_index; - idx_t local_batch_index; - idx_t lines_read = 0; -}; - -struct VerificationPositions { - idx_t beginning_of_first_line = 0; - idx_t end_of_last_line = 0; -}; - -//! CSV Reader for Parallel Reading -class ParallelCSVReader : public BaseCSVReader { -public: - ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options, unique_ptr buffer, - idx_t first_pos_first_buffer, const vector &requested_types, idx_t file_idx_p); - virtual ~ParallelCSVReader() { - } - - //! Current Position (Relative to the Buffer) - idx_t position_buffer = 0; - - //! Start of the piece of the buffer this thread should read - idx_t start_buffer = 0; - //! End of the piece of this buffer this thread should read - idx_t end_buffer = NumericLimits::Maximum(); - //! The actual buffer size - idx_t buffer_size = 0; - - //! If this flag is set, it means we are about to try to read our last row. - bool reached_remainder_state = false; - - bool finished = false; - - unique_ptr buffer; - - idx_t file_idx; - - VerificationPositions GetVerificationPositions(); - - //! Position of the first read line and last read line for verification purposes - VerificationPositions verification_positions; - -public: - void SetBufferRead(unique_ptr buffer); - //! Extract a single DataChunk from the CSV file and stores it in insert_chunk - void ParseCSV(DataChunk &insert_chunk); - - idx_t GetLineError(idx_t line_error, idx_t buffer_idx, bool stop_at_first = true) override; - -private: - //! Initialize Parser - void Initialize(const vector &requested_types); - //! Try to parse a single datachunk from the file. Throws an exception if anything goes wrong. - void ParseCSV(ParserMode mode); - //! Try to parse a single datachunk from the file. Returns whether or not the parsing is successful - bool TryParseCSV(ParserMode mode); - //! Extract a single DataChunk from the CSV file and stores it in insert_chunk - bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message); - //! Sets Position depending on the byte_start of this thread - bool SetPosition(); - //! Called when scanning the 1st buffer, skips empty lines - void SkipEmptyLines(); - //! 
When a buffer finishes reading its piece, it still can try to scan up to the real end of the buffer - //! Up to finding a new line. This function sets the buffer_end and marks a boolean variable - //! when changing the buffer end the first time. - //! It returns FALSE if the parser should jump to the final state of parsing or not - bool BufferRemainder(); - - bool NewLineDelimiter(bool carry, bool carry_followed_by_nl, bool first_char); - - //! Parses a CSV file with a one-byte delimiter, escape and quote character - bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line = false); - //! Verifies that the line length did not go over a pre-defined limit. - void VerifyLineLength(idx_t line_size); - - //! First Position of First Buffer - idx_t first_pos_first_buffer = 0; -}; - -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/execution/operator/schema/physical_create_index.hpp b/src/duckdb/src/include/duckdb/execution/operator/schema/physical_create_index.hpp deleted file mode 100644 index 1323104c4..000000000 --- a/src/duckdb/src/include/duckdb/execution/operator/schema/physical_create_index.hpp +++ /dev/null @@ -1,67 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/execution/operator/schema/physical_create_index.hpp -// -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "duckdb/execution/physical_operator.hpp" -#include "duckdb/execution/index/art/art.hpp" -#include "duckdb/parser/parsed_data/create_index_info.hpp" - -#include "duckdb/storage/data_table.hpp" - -#include - -namespace duckdb { -class DuckTableEntry; - -//! Physical CREATE (UNIQUE) INDEX statement -class PhysicalCreateIndex : public PhysicalOperator { -public: - static constexpr const PhysicalOperatorType TYPE = PhysicalOperatorType::CREATE_INDEX; - -public: - PhysicalCreateIndex(LogicalOperator &op, TableCatalogEntry &table, const vector &column_ids, - unique_ptr info, vector> unbound_expressions, - idx_t estimated_cardinality); - - //! The table to create the index for - DuckTableEntry &table; - //! The list of column IDs required for the index - vector storage_ids; - //! Info for index creation - unique_ptr info; - //! Unbound expressions to be used in the optimizer - vector> unbound_expressions; - -public: - //! Source interface, NOP for this operator - SourceResultType GetData(ExecutionContext &context, DataChunk &chunk, OperatorSourceInput &input) const override; - - bool IsSource() const override { - return true; - } - -public: - //! Sink interface, thread-local sink states - unique_ptr GetLocalSinkState(ExecutionContext &context) const override; - //! 
Sink interface, global sink state - unique_ptr GetGlobalSinkState(ClientContext &context) const override; - - SinkResultType Sink(ExecutionContext &context, DataChunk &chunk, OperatorSinkInput &input) const override; - void Combine(ExecutionContext &context, GlobalSinkState &gstate_p, LocalSinkState &lstate_p) const override; - SinkFinalizeType Finalize(Pipeline &pipeline, Event &event, ClientContext &context, - GlobalSinkState &gstate) const override; - - bool IsSink() const override { - return true; - } - bool ParallelSink() const override { - return true; - } -}; -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp b/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp deleted file mode 100644 index 2cc296a4f..000000000 --- a/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +++ /dev/null @@ -1,73 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/execution/partitionable_hashtable.hpp -// -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "duckdb/execution/aggregate_hashtable.hpp" - -namespace duckdb { - -struct RadixPartitionInfo { - explicit RadixPartitionInfo(idx_t n_partitions_upper_bound); - const idx_t n_partitions; - const idx_t radix_bits; - const hash_t radix_mask; - const idx_t radix_shift; - - inline hash_t GetHashPartition(hash_t hash) const { - return (hash & radix_mask) >> radix_shift; - } -}; - -typedef vector> HashTableList; // NOLINT - -class PartitionableHashTable { -public: - PartitionableHashTable(ClientContext &context, Allocator &allocator, RadixPartitionInfo &partition_info_p, - vector group_types_p, vector payload_types_p, - vector bindings_p); - - idx_t AddChunk(DataChunk &groups, DataChunk &payload, bool do_partition, const unsafe_vector &filter); - void Partition(bool sink_done); - bool IsPartitioned(); - - HashTableList GetPartition(idx_t partition); - HashTableList GetUnpartitioned(); - idx_t GetPartitionCount(idx_t partition) const; - idx_t GetPartitionSize(idx_t partition) const; - - void Finalize(); - - void Append(GroupedAggregateHashTable &ht); - -private: - ClientContext &context; - Allocator &allocator; - vector group_types; - vector payload_types; - vector bindings; - - bool is_partitioned; - RadixPartitionInfo &partition_info; - vector sel_vectors; - unsafe_vector sel_vector_sizes; - DataChunk group_subset, payload_subset; - Vector hashes, hashes_subset; - AggregateHTAppendState append_state; - - HashTableList unpartitioned_hts; - vector radix_partitioned_hts; - idx_t tuple_size; - -private: - idx_t ListAddChunk(HashTableList &list, DataChunk &groups, Vector &group_hashes, DataChunk &payload, - const unsafe_vector &filter); - //! 
Returns the HT entry size used for intermediate hash tables - HtEntryType GetHTEntrySize(); -}; -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/planner/operator/logical_asof_join.hpp b/src/duckdb/src/include/duckdb/planner/operator/logical_asof_join.hpp deleted file mode 100644 index 5289d6783..000000000 --- a/src/duckdb/src/include/duckdb/planner/operator/logical_asof_join.hpp +++ /dev/null @@ -1,27 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/planner/operator/logical_asof_join.hpp -// -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "duckdb/planner/operator/logical_comparison_join.hpp" - -namespace duckdb { - -//! LogicalAsOfJoin represents a temporal-style join with one less-than inequality. -//! This inequality matches the greatest value on the right that satisfies the condition. -class LogicalAsOfJoin : public LogicalComparisonJoin { -public: - static constexpr const LogicalOperatorType TYPE = LogicalOperatorType::LOGICAL_ASOF_JOIN; - -public: - explicit LogicalAsOfJoin(JoinType type); - - static unique_ptr Deserialize(LogicalDeserializationState &state, FieldReader &reader); -}; - -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/planner/operator/logical_delim_join.hpp b/src/duckdb/src/include/duckdb/planner/operator/logical_delim_join.hpp deleted file mode 100644 index 62422d969..000000000 --- a/src/duckdb/src/include/duckdb/planner/operator/logical_delim_join.hpp +++ /dev/null @@ -1,32 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/planner/operator/logical_delim_join.hpp -// -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "duckdb/planner/operator/logical_comparison_join.hpp" - -namespace duckdb { - -//! LogicalDelimJoin represents a special "duplicate eliminated" join. This join type is only used for subquery -//! flattening, and involves performing duplicate elimination on the LEFT side which is then pushed into the RIGHT side. -class LogicalDelimJoin : public LogicalComparisonJoin { -public: - static constexpr const LogicalOperatorType TYPE = LogicalOperatorType::LOGICAL_DELIM_JOIN; - -public: - explicit LogicalDelimJoin(JoinType type); - - //! The set of columns that will be duplicate eliminated from the LHS and pushed into the RHS - vector> duplicate_eliminated_columns; - -public: - void Serialize(FieldWriter &writer) const override; - static unique_ptr Deserialize(LogicalDeserializationState &state, FieldReader &reader); -}; - -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/storage/meta_block_reader.hpp b/src/duckdb/src/include/duckdb/storage/meta_block_reader.hpp deleted file mode 100644 index df675fd14..000000000 --- a/src/duckdb/src/include/duckdb/storage/meta_block_reader.hpp +++ /dev/null @@ -1,49 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/storage/meta_block_reader.hpp -// -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "duckdb/common/common.hpp" -#include "duckdb/common/serializer.hpp" -#include "duckdb/storage/block.hpp" -#include "duckdb/storage/buffer/buffer_handle.hpp" - -namespace duckdb { -class BlockHandle; -class BlockManager; -class BufferHandle; -class DatabaseInstance; - -//! 
This struct is responsible for reading meta data from disk -class MetaBlockReader : public Deserializer { -public: - MetaBlockReader(BlockManager &block_manager, block_id_t block, bool free_blocks_on_read = true); - ~MetaBlockReader() override; - - BlockManager &block_manager; - shared_ptr block; - BufferHandle handle; - idx_t offset; - block_id_t next_block; - bool free_blocks_on_read; - -public: - //! Read content of size read_size into the buffer - void ReadData(data_ptr_t buffer, idx_t read_size) override; - - ClientContext &GetContext() override; - optional_ptr GetCatalog() override; - void SetCatalog(Catalog &catalog_p); - void SetContext(ClientContext &context_p); - -private: - void ReadNewBlock(block_id_t id); - optional_ptr context; - optional_ptr catalog; -}; -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/storage/meta_block_writer.hpp b/src/duckdb/src/include/duckdb/storage/meta_block_writer.hpp deleted file mode 100644 index f4d5cc713..000000000 --- a/src/duckdb/src/include/duckdb/storage/meta_block_writer.hpp +++ /dev/null @@ -1,50 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/storage/meta_block_writer.hpp -// -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "duckdb/common/common.hpp" -#include "duckdb/common/serializer.hpp" -#include "duckdb/storage/block.hpp" -#include "duckdb/storage/block_manager.hpp" -#include "duckdb/common/set.hpp" - -namespace duckdb { -class DatabaseInstance; - -//! This struct is responsible for writing data to disk in a stream of blocks. -class MetaBlockWriter : public Serializer { -public: - MetaBlockWriter(BlockManager &block_manager, block_id_t initial_block_id = INVALID_BLOCK); - ~MetaBlockWriter() override; - - BlockManager &block_manager; - -protected: - unique_ptr block; - set written_blocks; - idx_t offset; - -public: - BlockPointer GetBlockPointer(); - virtual void Flush(); - - void WriteData(const_data_ptr_t buffer, idx_t write_size) override; - - void MarkWrittenBlocks() { - for (auto &block_id : written_blocks) { - block_manager.MarkBlockAsModified(block_id); - } - } - -protected: - virtual block_id_t GetNextBlockId(); - void AdvanceBlock(); -}; - -} // namespace duckdb diff --git a/src/duckdb/src/optimizer/statistics/expression/propagate_and_compress.cpp b/src/duckdb/src/optimizer/statistics/expression/propagate_and_compress.cpp deleted file mode 100644 index 7b25365b4..000000000 --- a/src/duckdb/src/optimizer/statistics/expression/propagate_and_compress.cpp +++ /dev/null @@ -1,118 +0,0 @@ -#include "duckdb/function/scalar/operators.hpp" -#include "duckdb/optimizer/statistics_propagator.hpp" -#include "duckdb/planner/bound_result_modifier.hpp" -#include "duckdb/planner/expression/bound_cast_expression.hpp" -#include "duckdb/planner/expression/bound_constant_expression.hpp" -#include "duckdb/planner/expression/bound_function_expression.hpp" -#include "duckdb/storage/statistics/base_statistics.hpp" -#include "duckdb/common/operator/subtract.hpp" - -namespace duckdb { - -template -bool GetCastType(T signed_range, LogicalType &cast_type) { - auto range = static_cast::type>(signed_range); - - // Check if this range fits in a smaller type - if (range < NumericLimits::Maximum()) { - cast_type = LogicalType::UTINYINT; - } else if (sizeof(T) > sizeof(uint16_t) && range < NumericLimits::Maximum()) { - cast_type = LogicalType::USMALLINT; - } else if (sizeof(T) > sizeof(uint32_t) && range < 
NumericLimits::Maximum()) { - cast_type = LogicalType::UINTEGER; - } else { - return false; - } - return true; -} - -template <> -bool GetCastType(hugeint_t range, LogicalType &cast_type) { - if (range < NumericLimits().Maximum()) { - cast_type = LogicalType::UTINYINT; - } else if (range < NumericLimits().Maximum()) { - cast_type = LogicalType::USMALLINT; - } else if (range < NumericLimits().Maximum()) { - cast_type = LogicalType::UINTEGER; - } else if (range < NumericLimits().Maximum()) { - cast_type = LogicalType::UBIGINT; - } else { - return false; - } - return true; -} - -template -unique_ptr TemplatedCastToSmallestType(unique_ptr expr, BaseStatistics &stats) { - // Compute range - if (!NumericStats::HasMinMax(stats)) { - return expr; - } - - auto signed_min_val = NumericStats::Min(stats).GetValue(); - auto signed_max_val = NumericStats::Max(stats).GetValue(); - if (signed_max_val < signed_min_val) { - return expr; - } - - // Compute range, cast to unsigned to prevent comparing signed with unsigned - T signed_range; - if (!TrySubtractOperator::Operation(signed_max_val, signed_min_val, signed_range)) { - // overflow in subtraction: cannot do any simplification - return expr; - } - - // Check if this range fits in a smaller type - LogicalType cast_type; - if (!GetCastType(signed_range, cast_type)) { - return expr; - } - - // Create expression to map to a smaller range - auto input_type = expr->return_type; - auto minimum_expr = make_uniq(Value::CreateValue(signed_min_val)); - vector> arguments; - arguments.push_back(std::move(expr)); - arguments.push_back(std::move(minimum_expr)); - auto minus_expr = make_uniq(input_type, SubtractFun::GetFunction(input_type, input_type), - std::move(arguments), nullptr, true); - - // Cast to smaller type - return BoundCastExpression::AddDefaultCastToType(std::move(minus_expr), cast_type); -} - -unique_ptr CastToSmallestType(unique_ptr expr, BaseStatistics &num_stats) { - auto physical_type = expr->return_type.InternalType(); - switch (physical_type) { - case PhysicalType::UINT8: - case PhysicalType::INT8: - return expr; - case PhysicalType::UINT16: - return TemplatedCastToSmallestType(std::move(expr), num_stats); - case PhysicalType::INT16: - return TemplatedCastToSmallestType(std::move(expr), num_stats); - case PhysicalType::UINT32: - return TemplatedCastToSmallestType(std::move(expr), num_stats); - case PhysicalType::INT32: - return TemplatedCastToSmallestType(std::move(expr), num_stats); - case PhysicalType::UINT64: - return TemplatedCastToSmallestType(std::move(expr), num_stats); - case PhysicalType::INT64: - return TemplatedCastToSmallestType(std::move(expr), num_stats); - case PhysicalType::INT128: - return TemplatedCastToSmallestType(std::move(expr), num_stats); - default: - throw NotImplementedException("Unknown integer type!"); - } -} - -void StatisticsPropagator::PropagateAndCompress(unique_ptr &expr, unique_ptr &stats) { - stats = PropagateExpression(expr); - if (stats) { - if (expr->return_type.IsIntegral()) { - expr = CastToSmallestType(std::move(expr), *stats); - } - } -} - -} // namespace duckdb diff --git a/src/duckdb/src/parser/common_table_expression_info.cpp b/src/duckdb/src/parser/common_table_expression_info.cpp deleted file mode 100644 index 1058908a7..000000000 --- a/src/duckdb/src/parser/common_table_expression_info.cpp +++ /dev/null @@ -1,19 +0,0 @@ -#include "duckdb/parser/common_table_expression_info.hpp" -#include "duckdb/common/serializer/format_serializer.hpp" -#include "duckdb/common/serializer/format_deserializer.hpp" - 
-namespace duckdb { - -void CommonTableExpressionInfo::FormatSerialize(FormatSerializer &serializer) const { - serializer.WriteProperty("aliases", aliases); - serializer.WriteProperty("query", query); -} - -unique_ptr CommonTableExpressionInfo::FormatDeserialize(FormatDeserializer &deserializer) { - auto result = make_uniq(); - result->aliases = deserializer.ReadProperty>("aliases"); - result->query = deserializer.ReadProperty>("query"); - return result; -} - -} // namespace duckdb diff --git a/src/duckdb/src/planner/operator/logical_asof_join.cpp b/src/duckdb/src/planner/operator/logical_asof_join.cpp deleted file mode 100644 index 95cc415b3..000000000 --- a/src/duckdb/src/planner/operator/logical_asof_join.cpp +++ /dev/null @@ -1,14 +0,0 @@ -#include "duckdb/planner/operator/logical_asof_join.hpp" - -namespace duckdb { - -LogicalAsOfJoin::LogicalAsOfJoin(JoinType type) : LogicalComparisonJoin(type, LogicalOperatorType::LOGICAL_ASOF_JOIN) { -} - -unique_ptr LogicalAsOfJoin::Deserialize(LogicalDeserializationState &state, FieldReader &reader) { - auto result = make_uniq(JoinType::INVALID); - LogicalComparisonJoin::Deserialize(*result, state, reader); - return std::move(result); -} - -} // namespace duckdb diff --git a/src/duckdb/src/planner/operator/logical_delim_join.cpp b/src/duckdb/src/planner/operator/logical_delim_join.cpp deleted file mode 100644 index 33360fbcd..000000000 --- a/src/duckdb/src/planner/operator/logical_delim_join.cpp +++ /dev/null @@ -1,27 +0,0 @@ -#include "duckdb/common/field_writer.hpp" -#include "duckdb/planner/operator/logical_delim_join.hpp" - -namespace duckdb { - -LogicalDelimJoin::LogicalDelimJoin(JoinType type) - : LogicalComparisonJoin(type, LogicalOperatorType::LOGICAL_DELIM_JOIN) { -} - -void LogicalDelimJoin::Serialize(FieldWriter &writer) const { - LogicalComparisonJoin::Serialize(writer); - if (type == LogicalOperatorType::LOGICAL_COMPARISON_JOIN) { - D_ASSERT(duplicate_eliminated_columns.empty()); - // if the delim join has no delim columns anymore it is turned into a regular comparison join - return; - } - writer.WriteSerializableList(duplicate_eliminated_columns); -} - -unique_ptr LogicalDelimJoin::Deserialize(LogicalDeserializationState &state, FieldReader &reader) { - auto result = make_uniq(JoinType::INVALID); - LogicalComparisonJoin::Deserialize(*result, state, reader); - result->duplicate_eliminated_columns = reader.ReadRequiredSerializableList(state.gstate); - return std::move(result); -} - -} // namespace duckdb diff --git a/src/duckdb/src/storage/meta_block_reader.cpp b/src/duckdb/src/storage/meta_block_reader.cpp deleted file mode 100644 index d31f5f1c1..000000000 --- a/src/duckdb/src/storage/meta_block_reader.cpp +++ /dev/null @@ -1,78 +0,0 @@ -#include "duckdb/storage/meta_block_reader.hpp" -#include "duckdb/storage/buffer_manager.hpp" -#include "duckdb/main/connection_manager.hpp" -#include "duckdb/main/database.hpp" - -#include - -namespace duckdb { - -MetaBlockReader::MetaBlockReader(BlockManager &block_manager, block_id_t block_id, bool free_blocks_on_read) - : block_manager(block_manager), offset(0), next_block(-1), free_blocks_on_read(free_blocks_on_read) { - ReadNewBlock(block_id); -} - -MetaBlockReader::~MetaBlockReader() { -} - -void MetaBlockReader::ReadData(data_ptr_t buffer, idx_t read_size) { - while (offset + read_size > handle.GetFileBuffer().size) { - // cannot read entire entry from block - // first read what we can from this block - idx_t to_read = handle.GetFileBuffer().size - offset; - if (to_read > 0) { - 
memcpy(buffer, handle.Ptr() + offset, to_read); - read_size -= to_read; - buffer += to_read; - } - // then move to the next block - if (next_block == INVALID_BLOCK) { - throw IOException("Cannot read from INVALID_BLOCK."); - } - ReadNewBlock(next_block); - } - // we have enough left in this block to read from the buffer - memcpy(buffer, handle.Ptr() + offset, read_size); - offset += read_size; -} - -ClientContext &MetaBlockReader::GetContext() { - if (!context) { - throw InternalException("Meta Block Reader is missing context"); - } - return *context; -} - -optional_ptr MetaBlockReader::GetCatalog() { - return catalog; -} - -void MetaBlockReader::ReadNewBlock(block_id_t id) { - auto &buffer_manager = block_manager.buffer_manager; - - // Marking these blocks as modified will cause them to be moved to the free - // list upon the next successful checkpoint. Marking them modified here - // assumes MetaBlockReader is exclusively used for reading checkpoint data, - // and thus any blocks we're reading will be obviated by the next checkpoint. - if (free_blocks_on_read) { - block_manager.MarkBlockAsModified(id); - } - block = block_manager.RegisterBlock(id, true); - handle = buffer_manager.Pin(block); - - next_block = Load(handle.Ptr()); - D_ASSERT(next_block >= -1); - offset = sizeof(block_id_t); -} - -void MetaBlockReader::SetCatalog(Catalog &catalog_p) { - D_ASSERT(!catalog); - catalog = &catalog_p; -} - -void MetaBlockReader::SetContext(ClientContext &context_p) { - D_ASSERT(!context); - context = &context_p; -} - -} // namespace duckdb diff --git a/src/duckdb/src/storage/meta_block_writer.cpp b/src/duckdb/src/storage/meta_block_writer.cpp deleted file mode 100644 index 918b10601..000000000 --- a/src/duckdb/src/storage/meta_block_writer.cpp +++ /dev/null @@ -1,80 +0,0 @@ -#include "duckdb/storage/meta_block_writer.hpp" - -#include - -namespace duckdb { - -MetaBlockWriter::MetaBlockWriter(BlockManager &block_manager, block_id_t initial_block_id) - : block_manager(block_manager) { - if (initial_block_id == INVALID_BLOCK) { - initial_block_id = MetaBlockWriter::GetNextBlockId(); - } - block = block_manager.CreateBlock(initial_block_id, nullptr); - Store(-1, block->buffer); - offset = sizeof(block_id_t); -} - -MetaBlockWriter::~MetaBlockWriter() { - // If there's an exception during checkpoint, this can get destroyed without - // flushing the data...which is fine, because none of the unwritten data - // will be referenced. - // - // Otherwise, we should have explicitly flushed (and thereby nulled the block). 
- D_ASSERT(!block || Exception::UncaughtException()); -} - -block_id_t MetaBlockWriter::GetNextBlockId() { - return block_manager.GetFreeBlockId(); -} - -BlockPointer MetaBlockWriter::GetBlockPointer() { - BlockPointer pointer; - pointer.block_id = block->id; - pointer.offset = offset; - return pointer; -} - -void MetaBlockWriter::Flush() { - if (offset < block->size) { - // clear remaining bytes of block (if any) - memset(block->buffer + offset, 0, block->size - offset); - } - AdvanceBlock(); - block = nullptr; -} - -void MetaBlockWriter::AdvanceBlock() { - written_blocks.insert(block->id); - if (offset > sizeof(block_id_t)) { - block_manager.Write(*block); - offset = sizeof(block_id_t); - } -} - -void MetaBlockWriter::WriteData(const_data_ptr_t buffer, idx_t write_size) { - while (offset + write_size > block->size) { - // we need to make a new block - // first copy what we can - D_ASSERT(offset <= block->size); - idx_t copy_amount = block->size - offset; - if (copy_amount > 0) { - memcpy(block->buffer + offset, buffer, copy_amount); - buffer += copy_amount; - offset += copy_amount; - write_size -= copy_amount; - } - // now we need to get a new block id - block_id_t new_block_id = GetNextBlockId(); - // write the block id of the new block to the start of the current block - Store(new_block_id, block->buffer); - // first flush the old block - AdvanceBlock(); - // now update the block id of the block - block->id = new_block_id; - Store(-1, block->buffer); - } - memcpy(block->buffer + offset, buffer, write_size); - offset += write_size; -} - -} // namespace duckdb
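For reference, a minimal self-contained sketch of the chained-block write scheme implemented by the removed MetaBlockWriter: each fixed-size block reserves its first sizeof(block_id_t) bytes for the id of the next block, writes that overflow the current block spill into a freshly linked block, and the final block keeps an INVALID_BLOCK sentinel in its header. This is not DuckDB code; SketchBlockWriter, BLOCK_SIZE, and the in-memory vector of blocks are hypothetical stand-ins for the block manager.

// sketch.cpp -- illustrative only, assumes a tiny 32-byte block for demonstration
#include <cstdint>
#include <cstring>
#include <cstdio>
#include <vector>

static constexpr size_t BLOCK_SIZE = 32;        // tiny block size for demonstration
static constexpr int64_t INVALID_BLOCK = -1;    // sentinel meaning "no next block"

struct SketchBlockWriter {
    std::vector<std::vector<uint8_t>> blocks;   // stands in for blocks written to disk
    std::vector<uint8_t> current = std::vector<uint8_t>(BLOCK_SIZE, 0);
    size_t offset = sizeof(int64_t);            // first 8 bytes hold the next-block id

    void WriteData(const uint8_t *buffer, size_t write_size) {
        while (offset + write_size > BLOCK_SIZE) {
            // fill the remainder of the current block
            size_t copy_amount = BLOCK_SIZE - offset;
            std::memcpy(current.data() + offset, buffer, copy_amount);
            buffer += copy_amount;
            write_size -= copy_amount;
            // link this block to its successor by storing the next id in the header
            int64_t next_id = static_cast<int64_t>(blocks.size()) + 1;
            std::memcpy(current.data(), &next_id, sizeof(next_id));
            blocks.push_back(current);          // "flush" the full block
            // start a new block whose header is initially unlinked
            current.assign(BLOCK_SIZE, 0);
            std::memcpy(current.data(), &INVALID_BLOCK, sizeof(INVALID_BLOCK));
            offset = sizeof(int64_t);
        }
        // the rest fits in the current block
        std::memcpy(current.data() + offset, buffer, write_size);
        offset += write_size;
    }

    void Flush() {
        blocks.push_back(current);              // last block keeps INVALID_BLOCK as next
    }
};

int main() {
    SketchBlockWriter writer;
    std::vector<uint8_t> payload(100, 0xAB);    // 100 bytes spill across several 32-byte blocks
    writer.WriteData(payload.data(), payload.size());
    writer.Flush();
    std::printf("wrote %zu blocks\n", writer.blocks.size());
    return 0;
}

A reader following the chain would do the inverse, as the removed MetaBlockReader does: pin a block, read its first sizeof(block_id_t) bytes to learn the successor, consume payload from the remainder, and hop to the next block whenever a read crosses the block boundary.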