update usearch, add lock, we should switch to merging instead

duckdb · Mar 19, 2024 · 58f40f7 · 58f40f7
1 parent 95ec4d9
commit 58f40f7
Show file tree

Hide file tree

Showing 5 changed files with 4,101 additions and 4,917 deletions.
diff --git a/README.md b/README.md
@@ -1,13 +1,13 @@
 # DuckDB-VSS
 
-Vector Similarity Search Extension (based on usearch)
+Vector Similarity Search for DuckDB
 
 This is an experimental extension for DuckDB that adds indexing support to accelerate Vector Similarity Search using DuckDB's new fixed-size `ARRAY` type added in version v0.10.0. 
 This extension is based on the [usearch](https://github.com/unum-cloud/usearch) library and serves as a proof of concept for providing a custom index type, in this case an HNSW index, from within an extension and exposing it to DuckDB.
 
 ## Usage
 
-To create a new HNSW index on a table, use the `CREATE INDEX` statement with the `USING HNSW` clause. For example:
+To create a new HNSW index on a table with an `ARRAY` column, use the `CREATE INDEX` statement with the `USING HNSW` clause. For example:
 ```sql
 CREATE TABLE my_vector_table (vec FLOAT[3]);
 INSERT INTO my_vector_table SELECT array_value(a,b,c) FROM range(1,10) ra(a), range(1,10) rb(b), range(1,10) rc(c);

diff --git a/src/hnsw/hnsw_index.cpp b/src/hnsw/hnsw_index.cpp
@@ -267,22 +267,42 @@ void HNSWIndex::CommitDrop(IndexLock &index_lock) {
 	root_block_ptr.Clear();
 }
 
+// Rounds `v` up to the nearest power of two that is greater than or equal to `v`.
+//
+// A value that is already a power of two is returned unchanged. Zero yields 1,
+// the smallest power of two, instead of wrapping around to 0.
+// NOTE(review): the shift sequence assumes idx_t is an unsigned 64-bit integer
+// -- confirm against the project's typedef. Under that assumption, inputs above
+// 2^63 have no representable result and still wrap around to 0.
+inline idx_t NextPowerOfTwo(idx_t v) {
+	// Without this guard the decrement below wraps to all-ones and the final
+	// increment wraps back to 0, which is not a power of two.
+	if (v == 0) {
+		return 1;
+	}
+	// Step back by one so that exact powers of two map onto themselves.
+	v--;
+	// Smear the highest set bit into every lower bit position.
+	v |= v >> 1;
+	v |= v >> 2;
+	v |= v >> 4;
+	v |= v >> 8;
+	v |= v >> 16;
+	v |= v >> 32;
+	// All bits below the top one are now set; adding one carries into the
+	// next position and leaves a single set bit.
+	v++;
+	return v;
+}
+
 void HNSWIndex::Construct(DataChunk &input, Vector &row_ids, idx_t thread_idx) {
 	D_ASSERT(row_ids.GetType().InternalType() == ROW_TYPE);
 	D_ASSERT(logical_types[0] == input.data[0].GetType());
 
 	auto count = input.size();
 	input.Flatten();
 
-	// TODO: Do we need to track this atomically globally?
-	index.reserve(index.capacity() + count);
-
 	auto &vec_vec = input.data[0];
 	auto &vec_child_vec = ArrayVector::GetEntry(vec_vec);
 	auto array_size = ArrayType::GetSize(vec_vec.GetType());
 
 	auto vec_child_data = FlatVector::GetData<float>(vec_child_vec);
 	auto rowid_data = FlatVector::GetData<row_t>(row_ids);
+
+	// lock_guard<mutex> lock(hnsw_index_mutex);
+	// TODO: Do we need to track this atomically globally?
+	// Better strategy: Create multiple small indexes and merge!
+	static mutex hnsw_index_mutex;
+	lock_guard<mutex> lock(hnsw_index_mutex);
+
+	if(!index.reserve(NextPowerOfTwo(index.size() + count))) {
+		throw InternalException("Failed to reserve space in the HNSW index");
+	}
+
 	for (idx_t out_idx = 0; out_idx < count; out_idx++) {
 		auto rowid = rowid_data[out_idx];
 		auto result = index.add(rowid, vec_child_data + (out_idx * array_size), thread_idx);