Skip to content

Commit

Permalink
Merge pull request #13 from Maxxen/main
Browse files Browse the repository at this point in the history
Port forward changes from v0.10.2
  • Loading branch information
Maxxen authored May 6, 2024
2 parents 8145f41 + a35c248 commit dbf5b74
Show file tree
Hide file tree
Showing 26 changed files with 305 additions and 201 deletions.
17 changes: 13 additions & 4 deletions .github/workflows/MainDistributionPipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,18 @@
#
name: Main Extension Distribution Pipeline
on:
push:
pull_request:
branches:
- main
paths-ignore:
- '**/README.md'
- 'doc/**'
push:
branches:
- main
paths-ignore:
- '**/README.md'
- 'doc/**'
workflow_dispatch:

concurrency:
Expand All @@ -14,7 +24,7 @@ concurrency:
jobs:
duckdb-stable-build:
name: Build extension binaries
uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v0.10.1
uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main
with:
vcpkg_commit: a42af01b72c28a8e1d7b48107b33e4f286a55ef6
duckdb_version: main
Expand All @@ -23,10 +33,9 @@ jobs:
duckdb-stable-deploy:
name: Deploy extension binaries
needs: duckdb-stable-build
uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@v0.10.1
uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@main
secrets: inherit
with:
duckdb_version: main
extension_name: vss
deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }}
deploy_versioned: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }}
41 changes: 41 additions & 0 deletions .github/workflows/StableDistributionPipeline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#
# This workflow calls the main distribution pipeline from DuckDB to build, test and (optionally) release the extension
#
name: Stable Extension Distribution Pipeline
on:
pull_request:
branches:
- v0.10.2
paths-ignore:
- '**/README.md'
- 'doc/**'
push:
branches:
- v0.10.2
paths-ignore:
- '**/README.md'
- 'doc/**'
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/v0.10.2' || github.sha }}
cancel-in-progress: true

jobs:
duckdb-stable-build:
name: Build extension binaries
uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v0.10.2
with:
vcpkg_commit: a42af01b72c28a8e1d7b48107b33e4f286a55ef6
duckdb_version: v0.10.2
extension_name: vss

duckdb-stable-deploy:
name: Deploy extension binaries
needs: duckdb-stable-build
uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@v0.10.2
secrets: inherit
with:
duckdb_version: v0.10.2
extension_name: vss
deploy_latest: ${{ startsWith(github.ref, 'refs/heads/v') || github.ref == 'refs/heads/main' }}
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@
path = duckdb
url = https://github.com/duckdb/duckdb
branch = main
[submodule "extension-ci-tools"]
path = extension-ci-tools
url = https://github.com/duckdb/extension-ci-tools.git
140 changes: 5 additions & 135 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,138 +1,8 @@
.PHONY: all clean format debug release duckdb_debug duckdb_release pull update wasm_mvp wasm_eh wasm_threads

all: release

MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
PROJ_DIR := $(dir $(MKFILE_PATH))

TEST_PATH="/test/unittest"
DUCKDB_PATH="/duckdb"

# For non-MinGW windows the path is slightly different
ifeq ($(OS),Windows_NT)
ifneq ($(CXX),g++)
TEST_PATH="/test/Release/unittest.exe"
DUCKDB_PATH="/Release/duckdb.exe"
endif
endif

#### OSX config
OSX_BUILD_FLAG=
ifneq (${OSX_BUILD_ARCH}, "")
OSX_BUILD_FLAG=-DOSX_BUILD_ARCH=${OSX_BUILD_ARCH}
endif

#### VCPKG config
VCPKG_TOOLCHAIN_PATH?=
ifneq ("${VCPKG_TOOLCHAIN_PATH}", "")
TOOLCHAIN_FLAGS:=${TOOLCHAIN_FLAGS} -DVCPKG_MANIFEST_DIR='${PROJ_DIR}' -DVCPKG_BUILD=1 -DCMAKE_TOOLCHAIN_FILE='${VCPKG_TOOLCHAIN_PATH}'
endif
ifneq ("${VCPKG_TARGET_TRIPLET}", "")
TOOLCHAIN_FLAGS:=${TOOLCHAIN_FLAGS} -DVCPKG_TARGET_TRIPLET='${VCPKG_TARGET_TRIPLET}'
endif

#### Enable Ninja as generator
ifeq ($(GEN),ninja)
GENERATOR=-G "Ninja" -DFORCE_COLORED_OUTPUT=1
endif
PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

# Configuration of extension
EXT_NAME=vss
EXT_CONFIG=${PROJ_DIR}extension_config.cmake

#### Configuration for this extension
EXTENSION_NAME=VSS
EXTENSION_FLAGS=\
-DDUCKDB_EXTENSION_NAMES="vss" \
-DDUCKDB_EXTENSION_${EXTENSION_NAME}_PATH="$(PROJ_DIR)" \
-DDUCKDB_EXTENSION_${EXTENSION_NAME}_LOAD_TESTS=1 \
-DDUCKDB_EXTENSION_${EXTENSION_NAME}_INCLUDE_PATH="$(PROJ_DIR)src/include" \
-DDUCKDB_EXTENSION_${EXTENSION_NAME}_TEST_PATH="$(PROJ_DIR)test/sql"

#### Add more of the DuckDB in-tree extensions here that you need (also feel free to remove them when not needed)
EXTRA_EXTENSIONS_FLAG=-DBUILD_EXTENSIONS="tpch"

BUILD_FLAGS=-DEXTENSION_STATIC_BUILD=1 $(EXTENSION_FLAGS) ${EXTRA_EXTENSIONS_FLAG} $(OSX_BUILD_FLAG) $(TOOLCHAIN_FLAGS) -DDUCKDB_EXPLICIT_PLATFORM='${DUCKDB_PLATFORM}'
CLIENT_FLAGS:=

#### Main build
# For regular CLI build, we link the vss extension directly into the DuckDB executable
CLIENT_FLAGS=-DDUCKDB_EXTENSION_${EXTENSION_NAME}_SHOULD_LINK=1

debug:
mkdir -p build/debug && \
cmake $(GENERATOR) $(BUILD_FLAGS) $(CLIENT_FLAGS) -DCMAKE_BUILD_TYPE=Debug -S ./duckdb/ -B build/debug && \
cmake --build build/debug --config Debug

release:
mkdir -p build/release && \
cmake $(GENERATOR) $(BUILD_FLAGS) $(CLIENT_FLAGS) -DCMAKE_BUILD_TYPE=Release -S ./duckdb/ -B build/release && \
cmake --build build/release --config Release

##### Client build
JS_BUILD_FLAGS=-DBUILD_NODE=1 -DDUCKDB_EXTENSION_${EXTENSION_NAME}_SHOULD_LINK=0
PY_BUILD_FLAGS=-DBUILD_PYTHON=1 -DDUCKDB_EXTENSION_${EXTENSION_NAME}_SHOULD_LINK=0

debug_js: CLIENT_FLAGS=$(JS_BUILD_FLAGS)
debug_js: debug
debug_python: CLIENT_FLAGS=$(PY_BUILD_FLAGS)
debug_python: debug
release_js: CLIENT_FLAGS=$(JS_BUILD_FLAGS)
release_js: release
release_python: CLIENT_FLAGS=$(PY_BUILD_FLAGS)
release_python: release

# Main tests
test: test_release
test_release: release
./build/release/$(TEST_PATH) "$(PROJ_DIR)test/*"
test_debug: debug
./build/debug/$(TEST_PATH) "$(PROJ_DIR)test/*"

#### Client tests
DEBUG_EXT_PATH='$(PROJ_DIR)build/debug/extension/vss/vss.duckdb_extension'
RELEASE_EXT_PATH='$(PROJ_DIR)build/release/extension/vss/vss.duckdb_extension'
test_js: test_debug_js
test_debug_js: debug_js
cd duckdb/tools/nodejs && ${EXTENSION_NAME}_EXTENSION_BINARY_PATH=$(DEBUG_EXT_PATH) npm run test-path -- "../../../test/nodejs/**/*.js"
test_release_js: release_js
cd duckdb/tools/nodejs && ${EXTENSION_NAME}_EXTENSION_BINARY_PATH=$(RELEASE_EXT_PATH) npm run test-path -- "../../../test/nodejs/**/*.js"
test_python: test_debug_python
test_debug_python: debug_python
cd test/python && ${EXTENSION_NAME}_EXTENSION_BINARY_PATH=$(DEBUG_EXT_PATH) python3 -m pytest
test_release_python: release_python
cd test/python && ${EXTENSION_NAME}_EXTENSION_BINARY_PATH=$(RELEASE_EXT_PATH) python3 -m pytest

#### Misc
format:
find src/ -iname *.hpp -o -iname *.cpp | xargs clang-format --sort-includes=0 -style=file -i
cmake-format -i CMakeLists.txt
update:
git submodule update --remote --merge
pull:
git submodule init
git submodule update --recursive --remote

clean:
rm -rf build
rm -rf testext
cd duckdb && make clean
cd duckdb && make clean-python

WASM_LINK_TIME_FLAGS=

wasm_mvp:
mkdir -p build/wasm_mvp
emcmake cmake $(GENERATOR) -DWASM_LOADABLE_EXTENSIONS=1 -DBUILD_EXTENSIONS_ONLY=1 -Bbuild/wasm_mvp -DCMAKE_CXX_FLAGS="-DDUCKDB_CUSTOM_PLATFORM=wasm_mvp" -DSKIP_EXTENSIONS="parquet" -S duckdb $(TOOLCHAIN_FLAGS) $(EXTENSION_FLAGS) -DVCPKG_CHAINLOAD_TOOLCHAIN_FILE=$(EMSDK)/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DDUCKDB_EXPLICIT_PLATFORM='${DUCKDB_PLATFORM}'
emmake make -j8 -Cbuild/wasm_mvp
cd build/wasm_mvp/extension/${EXT_NAME} && emcc $f -sSIDE_MODULE=1 -o ../../${EXT_NAME}.duckdb_extension.wasm -O3 ${EXT_NAME}.duckdb_extension.wasm $(WASM_LINK_TIME_FLAGS)

wasm_eh:
mkdir -p build/wasm_eh
emcmake cmake $(GENERATOR) -DWASM_LOADABLE_EXTENSIONS=1 -DBUILD_EXTENSIONS_ONLY=1 -Bbuild/wasm_eh -DCMAKE_CXX_FLAGS="-fwasm-exceptions -DWEBDB_FAST_EXCEPTIONS=1 -DDUCKDB_CUSTOM_PLATFORM=wasm_eh" -DSKIP_EXTENSIONS="parquet" -S duckdb $(TOOLCHAIN_FLAGS) $(EXTENSION_FLAGS) -DVCPKG_CHAINLOAD_TOOLCHAIN_FILE=$(EMSDK)/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DDUCKDB_EXPLICIT_PLATFORM='${DUCKDB_PLATFORM}'
emmake make -j8 -Cbuild/wasm_eh
cd build/wasm_eh/extension/${EXT_NAME} && emcc $f -sSIDE_MODULE=1 -o ../../${EXT_NAME}.duckdb_extension.wasm -O3 ${EXT_NAME}.duckdb_extension.wasm $(WASM_LINK_TIME_FLAGS)

wasm_threads:
mkdir -p ./build/wasm_threads
emcmake cmake $(GENERATOR) -DWASM_LOADABLE_EXTENSIONS=1 -DBUILD_EXTENSIONS_ONLY=1 -Bbuild/wasm_threads -DCMAKE_CXX_FLAGS="-fwasm-exceptions -DWEBDB_FAST_EXCEPTIONS=1 -DWITH_WASM_THREADS=1 -DWITH_WASM_SIMD=1 -DWITH_WASM_BULK_MEMORY=1 -DDUCKDB_CUSTOM_PLATFORM=wasm_threads" -DSKIP_EXTENSIONS="parquet" -S duckdb $(TOOLCHAIN_FLAGS) $(EXTENSION_FLAGS) -DVCPKG_CHAINLOAD_TOOLCHAIN_FILE=$(EMSDK)/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DDUCKDB_EXPLICIT_PLATFORM='${DUCKDB_PLATFORM}'
emmake make -j8 -Cbuild/wasm_threads
cd build/wasm_threads/extension/${EXT_NAME} && emcc $f -sSIDE_MODULE=1 -o ../../${EXT_NAME}.duckdb_extension.wasm -O3 ${EXT_NAME}.duckdb_extension.wasm $(WASM_LINK_TIME_FLAGS)
# Include the Makefile from extension-ci-tools
include extension-ci-tools/makefiles/duckdb_extension.Makefile
2 changes: 1 addition & 1 deletion duckdb
Submodule duckdb updated 1028 files
1 change: 1 addition & 0 deletions extension-ci-tools
Submodule extension-ci-tools added at 54facf
10 changes: 10 additions & 0 deletions extension_config.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# This file is included by DuckDB's build system. It specifies which extension to load

# Extension from this repo
duckdb_extension_load(vss
SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}
LOAD_TESTS
)

# Any extra extensions that should be built
# e.g.: duckdb_extension_load(json)
42 changes: 34 additions & 8 deletions src/hnsw/hnsw_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ HNSWIndex::HNSWIndex(const string &name, IndexConstraintType index_constraint_ty
config.connectivity_base = m0_opt->second.GetValue<int32_t>();
}

index = unum::usearch::index_dense_t::make(metric, config);
index = unum::usearch::index_dense_gt<row_t>::make(metric, config);

auto lock = rwlock.GetExclusiveLock();
// Is this a new index or an existing index?
Expand All @@ -199,7 +199,7 @@ HNSWIndex::HNSWIndex(const string &name, IndexConstraintType index_constraint_ty
linked_block_allocator->Init(info.allocator_infos[0]);

// Is there anything to deserialize? We could have an empty index
if(!info.allocator_infos[0].buffer_ids.empty()) {
if (!info.allocator_infos[0].buffer_ids.empty()) {
LinkedBlockReader reader(*linked_block_allocator, root_block_ptr);
index.load_from_stream(
[&](void *data, size_t size) { return size == reader.ReadData(static_cast<data_ptr_t>(data), size); });
Expand Down Expand Up @@ -299,18 +299,31 @@ struct HNSWIndexScanState : public IndexScanState {
unique_array<row_t> row_ids = nullptr;
};

unique_ptr<IndexScanState> HNSWIndex::InitializeScan(float *query_vector, idx_t limit) {
unique_ptr<IndexScanState> HNSWIndex::InitializeScan(float *query_vector, idx_t limit, ClientContext &context) {
auto state = make_uniq<HNSWIndexScanState>();

// Try to get the ef_search parameter from the database or use the default value
auto ef_search = index.expansion_search();

Value hnsw_ef_search_opt;
if(context.TryGetCurrentSetting("hnsw_ef_search", hnsw_ef_search_opt)) {
if(!hnsw_ef_search_opt.IsNull() && hnsw_ef_search_opt.type() == LogicalType::BIGINT) {
auto val = hnsw_ef_search_opt.GetValue<int64_t>();
if(val > 0) {
ef_search = static_cast<idx_t>(val);
}
}
}

// Acquire a shared lock to search the index
auto lock = rwlock.GetSharedLock();
auto search_result = index.search(query_vector, limit);
auto search_result = index.ef_search(query_vector, limit, ef_search);

state->current_row = 0;
state->total_rows = search_result.size();
state->row_ids = make_uniq_array<row_t>(search_result.size());

search_result.dump_to(reinterpret_cast<uint64_t *>(state->row_ids.get()));
search_result.dump_to(state->row_ids.get());
return std::move(state);
}

Expand Down Expand Up @@ -431,8 +444,16 @@ ErrorData HNSWIndex::Insert(IndexLock &lock, DataChunk &input, Vector &rowid_vec
return ErrorData {};
}

ErrorData HNSWIndex::Append(IndexLock &lock, DataChunk &entries, Vector &rowid_vec) {
Construct(entries, rowid_vec, unum::usearch::index_dense_t::any_thread());
ErrorData HNSWIndex::Append(IndexLock &lock, DataChunk &appended_data, Vector &row_identifiers) {
DataChunk expression_result;
expression_result.Initialize(Allocator::DefaultAllocator(), logical_types);

// first resolve the expressions for the index
ExecuteExpressions(appended_data, expression_result);

// now insert into the index
Construct(expression_result, row_identifiers, unum::usearch::index_dense_t::any_thread());

return ErrorData {};
}

Expand Down Expand Up @@ -480,7 +501,7 @@ IndexStorageInfo HNSWIndex::GetStorageInfo(const bool get_buffers) {
if (!get_buffers) {
// use the partial block manager to serialize all allocator data
auto &block_manager = table_io_manager.GetIndexBlockManager();
PartialBlockManager partial_block_manager(block_manager, CheckpointType::FULL_CHECKPOINT);
PartialBlockManager partial_block_manager(block_manager, PartialBlockType::FULL_CHECKPOINT);
linked_block_allocator->SerializeBuffers(partial_block_manager);
partial_block_manager.FlushPartialBlocks();
} else {
Expand Down Expand Up @@ -525,6 +546,11 @@ void HNSWModule::RegisterIndex(DatabaseInstance &db) {
return std::move(res);
};

// Register scan option
db.config.AddExtensionOption("hnsw_ef_search",
"experimental: override the ef_search parameter when scanning HNSW indexes",
LogicalType::BIGINT);

// Register the index type
db.config.GetIndexTypes().RegisterIndexType(index_type);
}
Expand Down
4 changes: 2 additions & 2 deletions src/hnsw/hnsw_index_physical_create.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,13 +139,13 @@ SinkFinalizeType PhysicalCreateHNSWIndex::Finalize(Pipeline &pipeline, Event &ev
// Get the entry as a DuckIndexEntry
auto &index = index_entry->Cast<DuckIndexEntry>();
index.initial_index_size = gstate.global_index->GetInMemorySize();
index.info = make_uniq<IndexDataTableInfo>(storage.info, index.name);
index.info = make_uniq<IndexDataTableInfo>(storage.GetDataTableInfo(), index.name);
for (auto &parsed_expr : info->parsed_expressions) {
index.parsed_expressions.push_back(parsed_expr->Copy());
}

// Finally add it to storage
storage.info->indexes.AddIndex(std::move(gstate.global_index));
storage.GetDataTableInfo()->GetIndexes().AddIndex(std::move(gstate.global_index));

return SinkFinalizeType::READY;
}
Expand Down
12 changes: 8 additions & 4 deletions src/hnsw/hnsw_index_pragmas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,9 @@ static void HNSWIndexInfoExecute(ClientContext &context, TableFunctionInput &dat
auto &storage = table_entry.GetStorage();
HNSWIndex *hnsw_index = nullptr;

storage.info->InitializeIndexes(context);
storage.info->indexes.Scan([&](Index &index) {
auto &table_info = *storage.GetDataTableInfo();
table_info.InitializeIndexes(context);
table_info.GetIndexes().Scan([&](Index &index) {
if (index.name == index_entry.name && index.index_type == HNSWIndex::TYPE_NAME) {
hnsw_index = &index.Cast<HNSWIndex>();
return true;
Expand Down Expand Up @@ -172,8 +173,11 @@ static void CompactIndexPragma(ClientContext &context, const FunctionParameters

auto &storage = table_entry.GetStorage();
bool found_index = false;
storage.info->indexes.Scan([&](Index &index_entry) {
if (index_entry.name == index_name && index_entry.index_type == HNSWIndex::TYPE_NAME) {

auto &table_info = *storage.GetDataTableInfo();
table_info.InitializeIndexes(context);
table_info.GetIndexes().Scan([&](Index &index_entry) {
if (index_entry.name == index_name && index_entry.index_type == HNSWIndex::TYPE_NAME && !index_entry.IsUnknown()) {
auto &hnsw_index = index_entry.Cast<HNSWIndex>();
hnsw_index.Compact();
found_index = true;
Expand Down
2 changes: 1 addition & 1 deletion src/hnsw/hnsw_index_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ static unique_ptr<GlobalTableFunctionState> HNSWIndexScanInitGlobal(ClientContex
local_storage.InitializeScan(bind_data.table.GetStorage(), result->local_storage_state.local_state, input.filters);

// Initialize the scan state for the index
result->index_state = bind_data.index.Cast<HNSWIndex>().InitializeScan(bind_data.query.get(), bind_data.limit);
result->index_state = bind_data.index.Cast<HNSWIndex>().InitializeScan(bind_data.query.get(), bind_data.limit, context);

return std::move(result);
}
Expand Down
Loading

0 comments on commit dbf5b74

Please sign in to comment.