From c0faac17a6d8a97923d434dfd0fe44935a337774 Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 9 Jul 2021 23:20:11 -0700 Subject: [PATCH 01/25] Updated the CMakeLists to simply use as a fetch'd library. --- CMakeLists.txt | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 31935e0e..ec91f568 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,28 +1,27 @@ -cmake_minimum_required (VERSION 2.6) -project (hnsw_lib) +cmake_minimum_required (VERSION 3.14) +project(hnsw_lib + LANGUAGES CXX) -include_directories("${PROJECT_BINARY_DIR}") +add_library(hnswlib INTERFACE) +target_include_directories(hnswlib INTERFACE .) +if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) + set(CMAKE_CXX_STANDARD 11) + if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + SET( CMAKE_CXX_FLAGS "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize") + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" ) + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" ) + endif() -set(SOURCE_EXE main.cpp) + add_executable(test_updates examples/updates_test.cpp) + target_link_libraries(test_updates hnswlib) -set(SOURCE_LIB sift_1b.cpp) + add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp) + target_link_libraries(searchKnnCloserFirst_test hnswlib) -add_library(sift_test STATIC ${SOURCE_LIB}) - - -add_executable(main ${SOURCE_EXE}) -if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - SET( CMAKE_CXX_FLAGS "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize") -elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") -SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp 
-march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" ) -elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" ) + add_executable(main main.cpp sift_1b.cpp) + target_link_libraries(main hnswlib) endif() - -add_executable(test_updates examples/updates_test.cpp) - -add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp) - -target_link_libraries(main sift_test) From 085b16503161d8c204f753240f96629ca6184054 Mon Sep 17 00:00:00 2001 From: LTLA Date: Sat, 10 Jul 2021 00:26:36 -0700 Subject: [PATCH 02/25] Added the missing header. --- hnswlib/visited_list_pool.h | 1 + 1 file changed, 1 insertion(+) diff --git a/hnswlib/visited_list_pool.h b/hnswlib/visited_list_pool.h index 6b0f4458..5e1a4a58 100644 --- a/hnswlib/visited_list_pool.h +++ b/hnswlib/visited_list_pool.h @@ -2,6 +2,7 @@ #include #include +#include namespace hnswlib { typedef unsigned short int vl_type; From c41ee3bf1f56d326354e0b60f4d859ffa45416d0 Mon Sep 17 00:00:00 2001 From: LTLA Date: Sat, 10 Jul 2021 00:29:20 -0700 Subject: [PATCH 03/25] Made the data getter const. 
--- hnswlib/hnswalg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index f23c17d9..2bd2eb5e 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -744,7 +744,7 @@ namespace hnswlib { } template - std::vector getDataByLabel(labeltype label) + std::vector getDataByLabel(labeltype label) const { tableint label_c; auto search = label_lookup_.find(label); From ed7c92a4336331af71cfa63d0cd7d33994b2e498 Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Mon, 12 Jul 2021 19:43:38 +0300 Subject: [PATCH 04/25] Migrate from travis to github actions --- .github/workflows/build.yml | 22 +++++++++++++ .travis.yml | 63 ------------------------------------- 2 files changed, 22 insertions(+), 63 deletions(-) create mode 100644 .github/workflows/build.yml delete mode 100644 .travis.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000..7dfba102 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,22 @@ +name: HNSW CI + +on: [push, pull_request] + +jobs: + test: + runs-on: ${{matrix.os}} + strategy: + matrix: + os: [ubuntu-latest, windows-latest] + python-version: ['3.6', '3.7', '3.8', '3.9'] + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Build and install + run: python -m pip install . 
+ + - name: Test + run: python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py" diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 76f7d7d4..00000000 --- a/.travis.yml +++ /dev/null @@ -1,63 +0,0 @@ -language: python - -jobs: - include: - - name: Linux Python 3.6 - os: linux - python: 3.6 - - - name: Linux Python 3.7 - os: linux - python: 3.7 - - - name: Linux Python 3.8 - os: linux - python: 3.8 - - - name: Linux Python 3.9 - os: linux - python: 3.9 - - - name: Windows Python 3.6 - os: windows - language: shell # 'language: python' is an error on Travis CI Windows - before_install: - - choco install python --version 3.6.0 - - python -m pip install --upgrade pip - - python --version - env: PATH=/c/Python36:/c/Python36/Scripts:$PATH - - - name: Windows Python 3.7 - os: windows - language: shell # 'language: python' is an error on Travis CI Windows - before_install: - - choco install python --version 3.7.0 - - python -m pip install --upgrade pip - - python --version - env: PATH=/c/Python37:/c/Python37/Scripts:$PATH - - - name: Windows Python 3.8 - os: windows - language: shell # 'language: python' is an error on Travis CI Windows - before_install: - - choco install python --version 3.8.0 - - python -m pip install --upgrade pip - - python --version - env: PATH=/c/Python38:/c/Python38/Scripts:$PATH - - - name: Windows Python 3.9 - os: windows - language: shell # 'language: python' is an error on Travis CI Windows - before_install: - - choco install python --version 3.9.0 - - python -m pip install --upgrade pip - - python --version - env: PATH=/c/Python39:/c/Python39/Scripts:$PATH - -install: - - | - python -m pip install . 
- -script: - - | - python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py" From 4833abe5538882707ec6ecfe21f8e6a9b91b5e0f Mon Sep 17 00:00:00 2001 From: alon Date: Tue, 20 Jul 2021 19:14:28 +0300 Subject: [PATCH 05/25] Parallel indexing --- examples/searchKnnCloserFirst_test.cpp | 71 ++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 3 deletions(-) diff --git a/examples/searchKnnCloserFirst_test.cpp b/examples/searchKnnCloserFirst_test.cpp index cc1392c8..4e50b66f 100644 --- a/examples/searchKnnCloserFirst_test.cpp +++ b/examples/searchKnnCloserFirst_test.cpp @@ -9,12 +9,69 @@ #include #include +#include namespace { using idx_t = hnswlib::labeltype; + template + inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn) { + if (numThreads <= 0) { + numThreads = std::thread::hardware_concurrency(); + } + + if (numThreads == 1) { + for (size_t id = start; id < end; id++) { + fn(id, 0); + } + } else { + std::vector threads; + std::atomic current(start); + + // keep track of exceptions in threads + // https://stackoverflow.com/a/32428427/1713196 + std::exception_ptr lastException = nullptr; + std::mutex lastExceptMutex; + + for (size_t threadId = 0; threadId < numThreads; ++threadId) { + threads.push_back(std::thread([&, threadId] { + while (true) { + size_t id = current.fetch_add(1); + + if ((id >= end)) { + break; + } + + try { + fn(id, threadId); + } catch (...) { + std::unique_lock lastExcepLock(lastExceptMutex); + lastException = std::current_exception(); + /* + * This will work even when current is the largest value that + * size_t can fit, because fetch_add returns the previous value + * before the increment (what will result in overflow + * and produce 0 instead of current + 1). 
+ */ + current = end; + break; + } + } + })); + } + for (auto &thread : threads) { + thread.join(); + } + if (lastException) { + std::rethrow_exception(lastException); + } + } + + + } + void test() { int d = 4; idx_t n = 100; @@ -40,10 +97,18 @@ void test() { hnswlib::AlgorithmInterface* alg_brute = new hnswlib::BruteforceSearch(&space, 2 * n); hnswlib::AlgorithmInterface* alg_hnsw = new hnswlib::HierarchicalNSW(&space, 2 * n); - for (size_t i = 0; i < n; ++i) { - alg_brute->addPoint(data.data() + d * i, i); +// for (size_t i = 0; i < n; ++i) { +// alg_brute->addPoint(data.data() + d * i, i); +// alg_hnsw->addPoint(data.data() + d * i, i); +// } + + ParallelFor(0, n, 4, [&](size_t i, size_t threadId) { alg_hnsw->addPoint(data.data() + d * i, i); - } + }); + + ParallelFor(0, n, 4, [&](size_t i, size_t threadId) { + alg_brute->addPoint(data.data() + d * i, i); + }); // test searchKnnCloserFirst of BruteforceSearch for (size_t j = 0; j < nq; ++j) { From a6af73d98d78659a46a6d3fec4c78af1e94be079 Mon Sep 17 00:00:00 2001 From: alon Date: Wed, 21 Jul 2021 17:54:50 +0300 Subject: [PATCH 06/25] Added python bindings for BF index for recall testing --- hnswlib/bruteforce.h | 3 +- python_bindings/bindings.cpp | 172 +++++++++++++++++++++++++++++++++++ 2 files changed, 173 insertions(+), 2 deletions(-) diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index 24260400..7fbdee9a 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -68,8 +68,6 @@ namespace hnswlib { memcpy(data_ + size_per_element_ * idx, datapoint, data_size_); - - }; void removePoint(labeltype cur_external) { @@ -99,6 +97,7 @@ namespace hnswlib { dist_t lastdist = topResults.top().first; for (int i = k; i < cur_element_count; i++) { dist_t dist = fstdistfunc_(query_data, data_ + size_per_element_ * i, dist_func_param_); + if (dist <= lastdist) { topResults.push(std::pair(dist, *((labeltype *) (data_ + size_per_element_ * i + data_size_)))); diff --git a/python_bindings/bindings.cpp 
b/python_bindings/bindings.cpp index 285b5185..22029019 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -652,6 +652,170 @@ class Index { }; +template +class BFIndex { +public: + BFIndex(const std::string &space_name, const int dim) : + space_name(space_name), dim(dim) { + normalize=false; + if(space_name=="l2") { + space = new hnswlib::L2Space(dim); + } + else if(space_name=="ip") { + space = new hnswlib::InnerProductSpace(dim); + } + else if(space_name=="cosine") { + space = new hnswlib::InnerProductSpace(dim); + normalize=true; + } else { + throw new std::runtime_error("Space name must be one of l2, ip, or cosine."); + } + alg = NULL; + index_inited = false; + } + + static const int ser_version = 1; // serialization version + + std::string space_name; + int dim; + bool index_inited; + bool normalize; + + hnswlib::labeltype cur_l; + hnswlib::BruteforceSearch *alg; + hnswlib::SpaceInterface *space; + + ~BFIndex() { + delete space; + if (alg) + delete alg; + } + + void init_new_index(const size_t maxElements) { + if (alg) { + throw new std::runtime_error("The index is already initiated."); + } + cur_l = 0; + alg = new hnswlib::BruteforceSearch(space, maxElements); + index_inited = true; + } + + void normalize_vector(float *data, float *norm_array){ + float norm=0.0f; + for(int i=0;i items(input); + auto buffer = items.request(); + size_t rows, features; + + if (buffer.ndim != 2 && buffer.ndim != 1) throw std::runtime_error("data must be a 1d/2d array"); + if (buffer.ndim == 2) { + rows = buffer.shape[0]; + features = buffer.shape[1]; + } else { + rows = 1; + features = buffer.shape[0]; + } + + if (features != dim) + throw std::runtime_error("wrong dimensionality of the vectors"); + + std::vector ids; + + if (!ids_.is_none()) { + py::array_t < size_t, py::array::c_style | py::array::forcecast > items(ids_); + auto ids_numpy = items.request(); + if (ids_numpy.ndim == 1 && ids_numpy.shape[0] == rows) { + std::vector 
ids1(ids_numpy.shape[0]); + for (size_t i = 0; i < ids1.size(); i++) { + ids1[i] = items.data()[i]; + } + ids.swap(ids1); + } else if (ids_numpy.ndim == 0 && rows == 1) { + ids.push_back(*items.data()); + } else + throw std::runtime_error("wrong dimensionality of the labels"); + } + { + int start = 0; + py::gil_scoped_release l; + + std::vector norm_array(dim); + for (size_t i = start; i < rows; i++) { + alg->addPoint((void *) items.data(i), (size_t) i); + } + cur_l+=rows; + } + } + + void deletedVector(size_t label) { + alg->removePoint(label); + } + + py::object knnQuery_return_numpy(py::object input, size_t k = 1) { + + py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); + auto buffer = items.request(); + hnswlib::labeltype *data_numpy_l; + dist_t *data_numpy_d; + size_t rows, features; + { + py::gil_scoped_release l; + + if (buffer.ndim != 2 && buffer.ndim != 1) throw std::runtime_error("data must be a 1d/2d array"); + if (buffer.ndim == 2) { + rows = buffer.shape[0]; + features = buffer.shape[1]; + } else { + rows = 1; + features = buffer.shape[0]; + } + + data_numpy_l = new hnswlib::labeltype[rows * k]; + data_numpy_d = new dist_t[rows * k]; + + for (size_t row = 0; row < rows; row++) { + std::priority_queue> result = alg->searchKnn( + (void *) items.data(row), k); + for (int i = k - 1; i >= 0; i--) { + auto &result_tuple = result.top(); + data_numpy_d[row * k + i] = result_tuple.first; + data_numpy_l[row * k + i] = result_tuple.second; + result.pop(); + } + } + } + + py::capsule free_when_done_l(data_numpy_l, [](void *f) { + delete[] f; + }); + py::capsule free_when_done_d(data_numpy_d, [](void *f) { + delete[] f; + }); + + + return py::make_tuple( + py::array_t( + {rows, k}, // shape + {k * sizeof(hnswlib::labeltype), + sizeof(hnswlib::labeltype)}, // C-style contiguous strides for double + data_numpy_l, // the data pointer + free_when_done_l), + py::array_t( + {rows, k}, // shape + {k * sizeof(dist_t), sizeof(dist_t)}, // 
C-style contiguous strides for double + data_numpy_d, // the data pointer + free_when_done_d)); + + } + +}; PYBIND11_PLUGIN(hnswlib) { @@ -716,5 +880,13 @@ PYBIND11_PLUGIN(hnswlib) { return ""; }); + py::class_>(m, "BFIndex") + .def(py::init(), py::arg("space"), py::arg("dim")) + .def("init_index", &BFIndex::init_new_index, py::arg("max_elements")) + .def("knn_query", &BFIndex::knnQuery_return_numpy, py::arg("data"), py::arg("k")=1) + .def("add_items", &BFIndex::addItems, py::arg("data"), py::arg("ids") = py::none()) + .def("__repr__", [](const BFIndex &a) { + return ""; + }); return m.ptr(); } From d4c881da19589f58b318f043bbde9a01b397b6b7 Mon Sep 17 00:00:00 2001 From: alon Date: Thu, 22 Jul 2021 10:05:47 +0300 Subject: [PATCH 07/25] Add recall test for hnsw via python bindings --- examples/recall_test.py | 60 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 examples/recall_test.py diff --git a/examples/recall_test.py b/examples/recall_test.py new file mode 100644 index 00000000..feba3477 --- /dev/null +++ b/examples/recall_test.py @@ -0,0 +1,60 @@ +import hnswlib +import numpy as np + +dim = 128 +num_elements = 100000 +k = 10 +nun_queries = 10 + +# Generating sample data +data = np.float32(np.random.random((num_elements, dim))) + +# Declaring index +hnsw_index = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip +bf_index = hnswlib.BFIndex(space='l2', dim=dim) + +# Initing both hnsw and brute force indices +# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded +# during insertion of an element. +# The capacity can be increased by saving/loading the index, see below. +# +# hnsw construction params: +# ef_construction - controls index search speed/build speed tradeoff +# +# M - is tightly connected with internal dimensionality of the data. 
Strongly affects the memory consumption (~M) +# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction + +hnsw_index.init_index(max_elements=num_elements, ef_construction=10, M=6) +bf_index.init_index(max_elements=num_elements) + +# Controlling the recall for hnsw by setting ef: +# higher ef leads to better accuracy, but slower search +hnsw_index.set_ef(10) + +# Set number of threads used during batch search/construction in hnsw +# By default using all available cores +hnsw_index.set_num_threads(1) + +print("Adding batch of %d elements" % (len(data))) +hnsw_index.add_items(data) +bf_index.add_items(data) + +print("Indices built") + +# Generating query data +query_data = np.float32(np.random.random((10, dim))) + +# Query the elements and measure recall: +labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k) +labels_bf, distances_bf = bf_index.knn_query(query_data, k) + +# Measure recall +correct = 0 +for i in range(nun_queries): + for label in labels_hnsw[i]: + for correct_label in labels_bf[i]: + if label == correct_label: + correct += 1 + break + +print("recall is :", float(correct)/(k*nun_queries)) From 079c71e2d33d0ad48d16bee80dac444eb7b3d3c1 Mon Sep 17 00:00:00 2001 From: alon Date: Sun, 25 Jul 2021 16:02:03 +0300 Subject: [PATCH 08/25] Add load and store index to the bindings, update test recall --- examples/searchKnnCloserFirst_test.cpp | 71 +------------------ hnswlib/bruteforce.h | 3 +- python_bindings/__init__.py | 0 python_bindings/bindings.cpp | 33 +++++++-- .../tests/bindings_test_recall.py | 36 ++++++++-- 5 files changed, 64 insertions(+), 79 deletions(-) create mode 100644 python_bindings/__init__.py rename examples/recall_test.py => python_bindings/tests/bindings_test_recall.py (67%) diff --git a/examples/searchKnnCloserFirst_test.cpp b/examples/searchKnnCloserFirst_test.cpp index 4e50b66f..cc1392c8 100644 --- a/examples/searchKnnCloserFirst_test.cpp +++ b/examples/searchKnnCloserFirst_test.cpp @@ -9,69 +9,12 @@ #include 
#include -#include namespace { using idx_t = hnswlib::labeltype; - template - inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn) { - if (numThreads <= 0) { - numThreads = std::thread::hardware_concurrency(); - } - - if (numThreads == 1) { - for (size_t id = start; id < end; id++) { - fn(id, 0); - } - } else { - std::vector threads; - std::atomic current(start); - - // keep track of exceptions in threads - // https://stackoverflow.com/a/32428427/1713196 - std::exception_ptr lastException = nullptr; - std::mutex lastExceptMutex; - - for (size_t threadId = 0; threadId < numThreads; ++threadId) { - threads.push_back(std::thread([&, threadId] { - while (true) { - size_t id = current.fetch_add(1); - - if ((id >= end)) { - break; - } - - try { - fn(id, threadId); - } catch (...) { - std::unique_lock lastExcepLock(lastExceptMutex); - lastException = std::current_exception(); - /* - * This will work even when current is the largest value that - * size_t can fit, because fetch_add returns the previous value - * before the increment (what will result in overflow - * and produce 0 instead of current + 1). 
- */ - current = end; - break; - } - } - })); - } - for (auto &thread : threads) { - thread.join(); - } - if (lastException) { - std::rethrow_exception(lastException); - } - } - - - } - void test() { int d = 4; idx_t n = 100; @@ -97,18 +40,10 @@ void test() { hnswlib::AlgorithmInterface* alg_brute = new hnswlib::BruteforceSearch(&space, 2 * n); hnswlib::AlgorithmInterface* alg_hnsw = new hnswlib::HierarchicalNSW(&space, 2 * n); -// for (size_t i = 0; i < n; ++i) { -// alg_brute->addPoint(data.data() + d * i, i); -// alg_hnsw->addPoint(data.data() + d * i, i); -// } - - ParallelFor(0, n, 4, [&](size_t i, size_t threadId) { - alg_hnsw->addPoint(data.data() + d * i, i); - }); - - ParallelFor(0, n, 4, [&](size_t i, size_t threadId) { + for (size_t i = 0; i < n; ++i) { alg_brute->addPoint(data.data() + d * i, i); - }); + alg_hnsw->addPoint(data.data() + d * i, i); + } // test searchKnnCloserFirst of BruteforceSearch for (size_t j = 0; j < nq; ++j) { diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index 7fbdee9a..24260400 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -68,6 +68,8 @@ namespace hnswlib { memcpy(data_ + size_per_element_ * idx, datapoint, data_size_); + + }; void removePoint(labeltype cur_external) { @@ -97,7 +99,6 @@ namespace hnswlib { dist_t lastdist = topResults.top().first; for (int i = k; i < cur_element_count; i++) { dist_t dist = fstdistfunc_(query_data, data_ + size_per_element_ * i, dist_func_param_); - if (dist <= lastdist) { topResults.push(std::pair(dist, *((labeltype *) (data_ + size_per_element_ * i + data_size_)))); diff --git a/python_bindings/__init__.py b/python_bindings/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 22029019..f0761640 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -743,21 +743,39 @@ class BFIndex { throw std::runtime_error("wrong dimensionality of the labels"); } { - int 
start = 0; - py::gil_scoped_release l; - std::vector norm_array(dim); - for (size_t i = start; i < rows; i++) { - alg->addPoint((void *) items.data(i), (size_t) i); + for (size_t row = 0; row < rows; row++) { + size_t id = ids.size() ? ids.at(row) : cur_l + row; + if (!normalize) { + alg->addPoint((void *) items.data(row), (size_t) id); + } else { + float normalized_vector[dim]; + normalize_vector((float *)items.data(row), normalized_vector); + alg->addPoint((void *) normalized_vector, (size_t) id); + } } cur_l+=rows; } } - void deletedVector(size_t label) { + void deleteVector(size_t label) { alg->removePoint(label); } + void saveIndex(const std::string &path_to_index) { + alg->saveIndex(path_to_index); + } + + void loadIndex(const std::string &path_to_index, size_t max_elements) { + if (alg) { + std::cerr<<"Warning: Calling load_index for an already inited index. Old index is being deallocated."; + delete alg; + } + alg = new hnswlib::BruteforceSearch(space, path_to_index); + cur_l = alg->cur_element_count; + index_inited = true; + } + py::object knnQuery_return_numpy(py::object input, size_t k = 1) { py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); @@ -885,6 +903,9 @@ PYBIND11_PLUGIN(hnswlib) { .def("init_index", &BFIndex::init_new_index, py::arg("max_elements")) .def("knn_query", &BFIndex::knnQuery_return_numpy, py::arg("data"), py::arg("k")=1) .def("add_items", &BFIndex::addItems, py::arg("data"), py::arg("ids") = py::none()) + .def("delete_vector", &BFIndex::deleteVector, py::arg("label")) + .def("save_index", &BFIndex::saveIndex, py::arg("path_to_index")) + .def("load_index", &BFIndex::loadIndex, py::arg("path_to_index"), py::arg("max_elements")=0) .def("__repr__", [](const BFIndex &a) { return ""; }); diff --git a/examples/recall_test.py b/python_bindings/tests/bindings_test_recall.py similarity index 67% rename from examples/recall_test.py rename to python_bindings/tests/bindings_test_recall.py index feba3477..3742fcdd 100644 
--- a/examples/recall_test.py +++ b/python_bindings/tests/bindings_test_recall.py @@ -1,7 +1,7 @@ import hnswlib import numpy as np -dim = 128 +dim = 32 num_elements = 100000 k = 10 nun_queries = 10 @@ -24,12 +24,12 @@ # M - is tightly connected with internal dimensionality of the data. Strongly affects the memory consumption (~M) # Higher M leads to higher accuracy/run_time at fixed ef/efConstruction -hnsw_index.init_index(max_elements=num_elements, ef_construction=10, M=6) +hnsw_index.init_index(max_elements=num_elements, ef_construction=200, M=16) bf_index.init_index(max_elements=num_elements) # Controlling the recall for hnsw by setting ef: # higher ef leads to better accuracy, but slower search -hnsw_index.set_ef(10) +hnsw_index.set_ef(200) # Set number of threads used during batch search/construction in hnsw # By default using all available cores @@ -42,7 +42,7 @@ print("Indices built") # Generating query data -query_data = np.float32(np.random.random((10, dim))) +query_data = np.float32(np.random.random((nun_queries, dim))) # Query the elements and measure recall: labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k) @@ -58,3 +58,31 @@ break print("recall is :", float(correct)/(k*nun_queries)) + +# test serializing the brute force index +index_path = 'bf_index.bin' +print("Saving index to '%s'" % index_path) +bf_index.save_index(index_path) +del bf_index + +# Re-initiating, loading the index +bf_index = hnswlib.BFIndex(space='l2', dim=dim) + +print("\nLoading index from '%s'\n" % index_path) +bf_index.load_index(index_path) + +# Query the brute force index again to verify that we get the same results +labels_bf, distances_bf = bf_index.knn_query(query_data, k) + +# Measure recall +correct = 0 +for i in range(nun_queries): + for label in labels_hnsw[i]: + for correct_label in labels_bf[i]: + if label == correct_label: + correct += 1 + break + +print("recall after reloading is :", float(correct)/(k*nun_queries)) + + From 
9c2dc7cea7dcab1cdfe3312508787ebb249fc5ac Mon Sep 17 00:00:00 2001 From: alon Date: Sun, 25 Jul 2021 16:44:05 +0300 Subject: [PATCH 09/25] Adding documentation --- TESTING_RECALL.md | 91 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 TESTING_RECALL.md diff --git a/TESTING_RECALL.md b/TESTING_RECALL.md new file mode 100644 index 00000000..23a6f654 --- /dev/null +++ b/TESTING_RECALL.md @@ -0,0 +1,91 @@ +# Testing recall + +Selecting HNSW parameters for a specific use case highly impacts the search quality. One way to test the quality of the constructed index is to compare the HNSW search results to the actual results (i.e., the actual `k` nearest neighbors). +For that cause, the API enables creating a simple "brute-force" index in which vectors are stored as is, and searching for the `k` nearest neighbors to a query vector requires going over the entire index. +Comparing between HNSW and brute-force results may help with finding the desired HNSW parameters for achieving a satisfying recall, based on the index size and data dimension. + +### Brute force index API +`hnswlib.BFIndex(space, dim)` creates a non-initialized index in space `space` with integer dimension `dim`. + +`hnswlib.BFIndex` methods: + +`init_index(max_elements)` initializes the index with no elements. + +max_elements defines the maximum number of elements that can be stored in the structure. + +`add_items(data, ids)` inserts the data (numpy array of vectors, shape:`N*dim`) into the structure. +`ids` are optional N-size numpy array of integer labels for all elements in data. + +`delete_vector(label)` delete the element associated with the given `label` so it will be omitted from search results. + +`knn_query(data, k = 1)` make a batch query for `k `closest elements for each element of the +`data` (shape:`N*dim`). Returns a numpy array of (shape:`N*k`). 
+ +`load_index(path_to_index, max_elements = 0)` loads the index from persistence to the uninitialized index. + +`save_index(path_to_index)` saves the index from persistence. + +### measuring recall example + +``` +import hnswlib +import numpy as np + +dim = 32 +num_elements = 100000 +k = 10 +nun_queries = 10 + +# Generating sample data +data = np.float32(np.random.random((num_elements, dim))) + +# Declaring index +hnsw_index = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip +bf_index = hnswlib.BFIndex(space='l2', dim=dim) + +# Initing both hnsw and brute force indices +# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded +# during insertion of an element. +# The capacity can be increased by saving/loading the index, see below. +# +# hnsw construction params: +# ef_construction - controls index search speed/build speed tradeoff +# +# M - is tightly connected with internal dimensionality of the data. Strongly affects the memory consumption (~M) +# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction + +hnsw_index.init_index(max_elements=num_elements, ef_construction=200, M=16) +bf_index.init_index(max_elements=num_elements) + +# Controlling the recall for hnsw by setting ef: +# higher ef leads to better accuracy, but slower search +hnsw_index.set_ef(200) + +# Set number of threads used during batch search/construction in hnsw +# By default using all available cores +hnsw_index.set_num_threads(1) + +print("Adding batch of %d elements" % (len(data))) +hnsw_index.add_items(data) +bf_index.add_items(data) + +print("Indices built") + +# Generating query data +query_data = np.float32(np.random.random((nun_queries, dim))) + +# Query the elements and measure recall: +labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k) +labels_bf, distances_bf = bf_index.knn_query(query_data, k) + +# Measure recall +correct = 0 +for i in range(nun_queries): + for label in labels_hnsw[i]: + 
for correct_label in labels_bf[i]: + if label == correct_label: + correct += 1 + break + +print("recall is :", float(correct)/(k*nun_queries)) +``` From cb399cf94e5b58910ce71270f5c9932d7bd672f5 Mon Sep 17 00:00:00 2001 From: Martin Dimitrov Date: Wed, 1 Sep 2021 23:28:23 +0000 Subject: [PATCH 10/25] Added AVX512 support for space_l2 and space_ip. --- hnswlib/hnswlib.h | 6 ++++++ hnswlib/space_ip.h | 39 ++++++++++++++++++++++++++++++++++++--- hnswlib/space_l2.h | 39 +++++++++++++++++++++++++++++++++++---- 3 files changed, 77 insertions(+), 7 deletions(-) diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index 9409c388..2628d939 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -4,6 +4,8 @@ #define USE_SSE #ifdef __AVX__ #define USE_AVX +#ifdef __AVX512__ +#define USE_AVX512 #endif #endif #endif @@ -16,6 +18,10 @@ #include #endif +#if defined(USE_AVX512) +#include +#endif + #if defined(__GNUC__) #define PORTABLE_ALIGN32 __attribute__((aligned(32))) #else diff --git a/hnswlib/space_ip.h b/hnswlib/space_ip.h index d0497ff7..142b90f5 100644 --- a/hnswlib/space_ip.h +++ b/hnswlib/space_ip.h @@ -124,7 +124,40 @@ namespace hnswlib { #endif -#if defined(USE_AVX) + +#if defined(USE_AVX512) + + static float + InnerProductSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + float PORTABLE_ALIGN32 TmpRes[16]; + float *pVect1 = (float *) pVect1v; + float *pVect2 = (float *) pVect2v; + size_t qty = *((size_t *) qty_ptr); + + size_t qty16 = qty / 16; + + + const float *pEnd1 = pVect1 + 16 * qty16; + + __m512 sum512 = _mm512_set1_ps(0); + + while (pVect1 < pEnd1) { + //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0); + + __m512 v1 = _mm512_loadu_ps(pVect1); + pVect1 += 16; + __m512 v2 = _mm512_loadu_ps(pVect2); + pVect2 += 16; + sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v1, v2)); + } + + _mm512_store_ps(TmpRes, sum512); + float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7] + TmpRes[8] + 
TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] + TmpRes[13] + TmpRes[14] + TmpRes[15]; + + return 1.0f - sum; + } + +#elif defined(USE_AVX) static float InnerProductSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { @@ -211,7 +244,7 @@ namespace hnswlib { #endif -#if defined(USE_SSE) || defined(USE_AVX) +#if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512) static float InnerProductSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { size_t qty = *((size_t *) qty_ptr); @@ -249,7 +282,7 @@ namespace hnswlib { public: InnerProductSpace(size_t dim) { fstdistfunc_ = InnerProduct; - #if defined(USE_AVX) || defined(USE_SSE) + #if defined(USE_AVX) || defined(USE_SSE) || defined(USE_AVX512) if (dim % 16 == 0) fstdistfunc_ = InnerProductSIMD16Ext; else if (dim % 4 == 0) diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h index e86e13b0..1687f93f 100644 --- a/hnswlib/space_l2.h +++ b/hnswlib/space_l2.h @@ -19,7 +19,38 @@ namespace hnswlib { return (res); } -#if defined(USE_AVX) +#if defined(USE_AVX512) + + // Favor using AVX512 if available. + static float + L2SqrSIMD16Ext(float *pVect1, float *pVect2, size_t qty) { + float PORTABLE_ALIGN32 TmpRes[16]; + size_t qty16 = qty >> 4; + + const float *pEnd1 = pVect1 + (qty16 << 4); + + __m512 diff, v1, v2; + __m512 sum = _mm512_set1_ps(0); + + while (pVect1 < pEnd1) { + v1 = _mm512_loadu_ps(pVect1); + pVect1 += 16; + v2 = _mm512_loadu_ps(pVect2); + pVect2 += 16; + diff = _mm512_sub_ps(v1, v2); + // sum = _mm512_fmadd_ps(diff, diff, sum); + sum = _mm512_add_ps(sum, _mm512_mul_ps(diff, diff)); + } + + _mm512_store_ps(TmpRes, sum); + float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + + TmpRes[7] + TmpRes[8] + TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] + + TmpRes[13] + TmpRes[14] + TmpRes[15]; + + return (res); +} + +#elif defined(USE_AVX) // Favor using AVX if available. 
static float @@ -106,7 +137,7 @@ namespace hnswlib { } #endif -#if defined(USE_SSE) || defined(USE_AVX) +#if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512) static float L2SqrSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { size_t qty = *((size_t *) qty_ptr); @@ -174,7 +205,7 @@ namespace hnswlib { public: L2Space(size_t dim) { fstdistfunc_ = L2Sqr; - #if defined(USE_SSE) || defined(USE_AVX) + #if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512) if (dim % 16 == 0) fstdistfunc_ = L2SqrSIMD16Ext; else if (dim % 4 == 0) @@ -278,4 +309,4 @@ namespace hnswlib { }; -} \ No newline at end of file +} From 677700fba3e068b31989ddb8e632f8e87a44c150 Mon Sep 17 00:00:00 2001 From: Martin Dimitrov Date: Fri, 10 Sep 2021 17:50:00 +0000 Subject: [PATCH 11/25] fixed missing #endif. --- hnswlib/hnswlib.h | 1 + 1 file changed, 1 insertion(+) diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index 2628d939..9ece06d6 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -9,6 +9,7 @@ #endif #endif #endif +#endif #if defined(USE_AVX) || defined(USE_SSE) #ifdef _MSC_VER From d6c8e3a3896129a7f642bc23cdce4536e8b45a65 Mon Sep 17 00:00:00 2001 From: LTLA Date: Wed, 15 Sep 2021 23:05:10 -0700 Subject: [PATCH 12/25] Lowered minimum CMake version back to 2.6. --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ec91f568..e2f3d716 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required (VERSION 3.14) +cmake_minimum_required (VERSION 2.6) project(hnsw_lib LANGUAGES CXX) From e7935b75a01d3c6f1f2cad7a70a2ae549af10f89 Mon Sep 17 00:00:00 2001 From: Martin Dimitrov Date: Thu, 16 Sep 2021 15:45:00 +0000 Subject: [PATCH 13/25] Correctly check for the presence of AVX512F (Foundation set) at compile time. Check for the __AVX512F__ define. All the AVX512 instructions that we are using are part of the Foundation set.
--- hnswlib/hnswlib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index 9ece06d6..0f5c6210 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -4,7 +4,7 @@ #define USE_SSE #ifdef __AVX__ #define USE_AVX -#ifdef __AVX512__ +#ifdef __AVX512F__ #define USE_AVX512 #endif #endif From d7bec60e66d6ec4d8eefcf18a1ee3b4bf03a667d Mon Sep 17 00:00:00 2001 From: Martin Dimitrov Date: Thu, 16 Sep 2021 16:00:25 +0000 Subject: [PATCH 14/25] Fixed a bug where we are aligning the TmpRes[16] variable to 32 bytes, using PORTABLE_ALIGN32. However, we need to align it to 64 bytes because of the use of the aligned store instruction _mm512_store_ps. --- hnswlib/hnswlib.h | 2 ++ hnswlib/space_ip.h | 2 +- hnswlib/space_l2.h | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index 0f5c6210..d95d6f88 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -25,8 +25,10 @@ #if defined(__GNUC__) #define PORTABLE_ALIGN32 __attribute__((aligned(32))) +#define PORTABLE_ALIGN64 __attribute__((aligned(64))) #else #define PORTABLE_ALIGN32 __declspec(align(32)) +#define PORTABLE_ALIGN64 __declspec(align(64)) #endif #endif diff --git a/hnswlib/space_ip.h b/hnswlib/space_ip.h index 142b90f5..c0029bde 100644 --- a/hnswlib/space_ip.h +++ b/hnswlib/space_ip.h @@ -129,7 +129,7 @@ namespace hnswlib { static float InnerProductSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { - float PORTABLE_ALIGN32 TmpRes[16]; + float PORTABLE_ALIGN64 TmpRes[16]; float *pVect1 = (float *) pVect1v; float *pVect2 = (float *) pVect2v; size_t qty = *((size_t *) qty_ptr); diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h index 1687f93f..6ba2d41a 100644 --- a/hnswlib/space_l2.h +++ b/hnswlib/space_l2.h @@ -24,7 +24,7 @@ namespace hnswlib { // Favor using AVX512 if available.
static float L2SqrSIMD16Ext(float *pVect1, float *pVect2, size_t qty) { - float PORTABLE_ALIGN32 TmpRes[16]; + float PORTABLE_ALIGN64 TmpRes[16]; size_t qty16 = qty >> 4; const float *pEnd1 = pVect1 + (qty16 << 4); From 05de2442e08708abefe0e054d5c5684b19b1777a Mon Sep 17 00:00:00 2001 From: Martin Dimitrov Date: Wed, 29 Sep 2021 14:02:19 +0000 Subject: [PATCH 15/25] changed float * to void * in L2SqrSIMD16Ext --- hnswlib/space_l2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h index 6ba2d41a..be301ffd 100644 --- a/hnswlib/space_l2.h +++ b/hnswlib/space_l2.h @@ -23,7 +23,7 @@ namespace hnswlib { // Favor using AVX512 if available. static float - L2SqrSIMD16Ext(float *pVect1, float *pVect2, size_t qty) { + L2SqrSIMD16Ext(void *pVect1, void *pVect2, size_t qty) { float PORTABLE_ALIGN64 TmpRes[16]; size_t qty16 = qty >> 4; From 290f3e24f900ed35c3746402500c5255c2530679 Mon Sep 17 00:00:00 2001 From: Martin Dimitrov Date: Fri, 1 Oct 2021 05:02:36 +0000 Subject: [PATCH 16/25] fixed errors with const void* conversion --- hnswlib/space_l2.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h index be301ffd..3b6a49ef 100644 --- a/hnswlib/space_l2.h +++ b/hnswlib/space_l2.h @@ -23,7 +23,10 @@ namespace hnswlib { // Favor using AVX512 if available. 
static float - L2SqrSIMD16Ext(void *pVect1, void *pVect2, size_t qty) { + L2SqrSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + float *pVect1 = (float *) pVect1v; + float *pVect2 = (float *) pVect2v; + size_t qty = *((size_t *) qty_ptr); float PORTABLE_ALIGN64 TmpRes[16]; size_t qty16 = qty >> 4; From ac43973f1f27b809d6ce181130d9aa4f95050057 Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Sun, 3 Oct 2021 22:52:08 -0700 Subject: [PATCH 17/25] add a test for distance computation correctness --- python_bindings/tests/bindings_test_spaces.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 python_bindings/tests/bindings_test_spaces.py diff --git a/python_bindings/tests/bindings_test_spaces.py b/python_bindings/tests/bindings_test_spaces.py new file mode 100644 index 00000000..3af5c5c0 --- /dev/null +++ b/python_bindings/tests/bindings_test_spaces.py @@ -0,0 +1,40 @@ +import unittest + +import numpy as np + +import hnswlib + +class RandomSelfTestCase(unittest.TestCase): + def testRandomSelf(self): + + data1 = np.asarray([[1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [1, 0, 1], + [1, 1, 1], + ]) + + for space, expected_distances in [ + ('l2', [[0., 1., 2., 2., 2.]]), + ('ip', [[-2., -1., 0., 0., 0.]]), + ('cosine', [[0, 1.835e-01, 4.23e-01, 4.23e-01, 4.23e-01]])]: + + for rightdim in range(1, 128, 3): + for leftdim in range(1, 32, 5): + data2 = np.concatenate( + [np.zeros([data1.shape[0], leftdim]), data1, np.zeros([data1.shape[0], rightdim])], axis=1) + dim = data2.shape[1] + p = hnswlib.Index(space=space, dim=dim) + p.init_index(max_elements=5, ef_construction=100, M=16) + + p.set_ef(10) + + p.add_items(data2) + + # Query the elements for themselves and measure recall: + labels, distances = p.knn_query(np.asarray(data2[-1:]), k=5) + + + diff=np.mean(np.abs(distances-expected_distances)) + print(dim,space, diff) + self.assertAlmostEqual(diff, 0, delta=1e-3) From a29c3dd736b09b6d9e3a1bbac2d96d250a65c759 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?=EB=B0=95=EC=84=A0=EC=98=81?= Date: Mon, 25 Oct 2021 18:44:48 +0900 Subject: [PATCH 18/25] fix : insufficient results may occur (has marked_deleted node) If the top_candidate is insufficient, the problem can be supplemented by allowing the candidate_set to search further when there is data. --- hnswlib/hnswalg.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 2bd2eb5e..1a1fbb4c 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -182,7 +182,7 @@ namespace hnswlib { while (!candidateSet.empty()) { std::pair curr_el_pair = candidateSet.top(); - if ((-curr_el_pair.first) > lowerBound) { + if ((-curr_el_pair.first) > lowerBound and top_candidates.size() == ef_construction_) { break; } candidateSet.pop(); @@ -271,7 +271,7 @@ namespace hnswlib { std::pair current_node_pair = candidate_set.top(); - if ((-current_node_pair.first) > lowerBound) { + if ((-current_node_pair.first) > lowerBound and top_candidates.size() == ef) { break; } candidate_set.pop(); From c2bc2ad1d66f54709674c4d7762c465d3fcef056 Mon Sep 17 00:00:00 2001 From: dorosy-yeong Date: Tue, 26 Oct 2021 14:55:39 +0900 Subject: [PATCH 19/25] and => && --- hnswlib/hnswalg.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 1a1fbb4c..ce29414d 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -182,7 +182,7 @@ namespace hnswlib { while (!candidateSet.empty()) { std::pair curr_el_pair = candidateSet.top(); - if ((-curr_el_pair.first) > lowerBound and top_candidates.size() == ef_construction_) { + if ((-curr_el_pair.first) > lowerBound && top_candidates.size() == ef_construction_) { break; } candidateSet.pop(); @@ -271,7 +271,7 @@ namespace hnswlib { std::pair current_node_pair = candidate_set.top(); - if ((-current_node_pair.first) > lowerBound and top_candidates.size() == ef) { + if ((-current_node_pair.first) > lowerBound && top_candidates.size()
== ef) { break; } candidate_set.pop(); From 8c6960b421a6a4c0793597080626605d9af6118a Mon Sep 17 00:00:00 2001 From: dorosy-yeong Date: Mon, 8 Nov 2021 09:46:56 +0900 Subject: [PATCH 20/25] fix a performance degradation. (To check that there is a preprocessor flag has_deletions which allows avoiding executing the code for deletes when there are none. ) --- hnswlib/hnswalg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index ce29414d..81b208c8 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -271,7 +271,7 @@ namespace hnswlib { std::pair current_node_pair = candidate_set.top(); - if ((-current_node_pair.first) > lowerBound && top_candidates.size() == ef) { + if ((-current_node_pair.first) > lowerBound && (top_candidates.size() == ef || has_deletions == false)) { break; } candidate_set.pop(); From 342257ef5cd1828845ee5d10abe65bd7d4af34f9 Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Sat, 6 Nov 2021 21:08:04 -0700 Subject: [PATCH 21/25] make test faster, fix few bugs --- python_bindings/tests/bindings_test_labels.py | 2 +- python_bindings/tests/bindings_test_pickle.py | 17 ++++++++--------- python_bindings/tests/bindings_test_spaces.py | 3 +-- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/python_bindings/tests/bindings_test_labels.py b/python_bindings/tests/bindings_test_labels.py index 668d7694..0488e9fb 100644 --- a/python_bindings/tests/bindings_test_labels.py +++ b/python_bindings/tests/bindings_test_labels.py @@ -8,7 +8,7 @@ class RandomSelfTestCase(unittest.TestCase): def testRandomSelf(self): - for idx in range(16): + for idx in range(2): print("\n**** Index save-load test ****\n") np.random.seed(idx) diff --git a/python_bindings/tests/bindings_test_pickle.py b/python_bindings/tests/bindings_test_pickle.py index 07820b1d..1fa0e822 100644 --- a/python_bindings/tests/bindings_test_pickle.py +++ b/python_bindings/tests/bindings_test_pickle.py @@ -124,13 +124,12 @@ def 
test_space_main(self, space, dim): class PickleUnitTests(unittest.TestCase): def setUp(self): + self.ef_construction = 200 + self.M = 32 + self.ef = 400 - self.ef_construction = 725 - self.M = 64 - self.ef = 725 - - self.num_elements = 5000 - self.num_test_elements = 200 + self.num_elements = 1000 + self.num_test_elements = 100 self.num_threads = 4 self.k = 25 @@ -143,10 +142,10 @@ def setUp(self): # i.e., number of values that are (d1-d2)**2>1e-3 def test_inner_product_space(self): - test_space_main(self, 'ip', 48) + test_space_main(self, 'ip', 16) def test_l2_space(self): - test_space_main(self, 'l2', 153) + test_space_main(self, 'l2', 53) def test_cosine_space(self): - test_space_main(self, 'cosine', 512) + test_space_main(self, 'cosine', 32) diff --git a/python_bindings/tests/bindings_test_spaces.py b/python_bindings/tests/bindings_test_spaces.py index 3af5c5c0..c3cceb87 100644 --- a/python_bindings/tests/bindings_test_spaces.py +++ b/python_bindings/tests/bindings_test_spaces.py @@ -35,6 +35,5 @@ def testRandomSelf(self): labels, distances = p.knn_query(np.asarray(data2[-1:]), k=5) - diff=np.mean(np.abs(distances-expected_distances)) - print(dim,space, diff) + diff=np.mean(np.abs(distances-expected_distances)) self.assertAlmostEqual(diff, 0, delta=1e-3) From 36d00bfc0a09e898db9769fcdfd20ad031e01c5a Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Mon, 15 Nov 2021 22:17:32 -0800 Subject: [PATCH 22/25] Fix failing windows build (#346) fix failing windows build --- python_bindings/bindings.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index f0761640..c3f6746b 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -749,9 +749,9 @@ class BFIndex { if (!normalize) { alg->addPoint((void *) items.data(row), (size_t) id); } else { - float normalized_vector[dim]; - normalize_vector((float *)items.data(row), normalized_vector); - alg->addPoint((void *) 
normalized_vector, (size_t) id); + std::vector normalized_vector(dim); + normalize_vector((float *)items.data(row), normalized_vector.data()); + alg->addPoint((void *) normalized_vector.data(), (size_t) id); } } cur_l+=rows; From 9006b32350312924f78447bd8eb14aea9a0a92cc Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Thu, 29 Jul 2021 14:19:27 +0300 Subject: [PATCH 23/25] Unmark deleted --- .gitignore | 3 + hnswlib/hnswalg.h | 80 +++++++++++-------- python_bindings/bindings.cpp | 55 ++++++------- python_bindings/tests/bindings_test_labels.py | 19 +++-- 4 files changed, 88 insertions(+), 69 deletions(-) diff --git a/.gitignore b/.gitignore index d2cde965..dab30385 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,6 @@ python_bindings/tests/__pycache__/ *.pyd hnswlib.cpython*.so var/ +.idea/ +.vscode/ + diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index f23c17d9..f2a8b9dc 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -18,7 +18,6 @@ namespace hnswlib { public: static const tableint max_update_element_locks = 65536; HierarchicalNSW(SpaceInterface *s) { - } HierarchicalNSW(SpaceInterface *s, const std::string &location, bool nmslib = false, size_t max_elements=0) { @@ -29,7 +28,7 @@ namespace hnswlib { link_list_locks_(max_elements), link_list_update_locks_(max_update_element_locks), element_levels_(max_elements) { max_elements_ = max_elements; - has_deletions_=false; + num_deleted_ = 0; data_size_ = s->get_data_size(); fstdistfunc_ = s->get_dist_func(); dist_func_param_ = s->get_dist_func_param(); @@ -56,8 +55,6 @@ namespace hnswlib { visited_list_pool_ = new VisitedListPool(1, max_elements); - - //initializations for special treatment of the first node enterpoint_node_ = -1; maxlevel_ = -1; @@ -92,6 +89,7 @@ namespace hnswlib { size_t cur_element_count; size_t size_data_per_element_; size_t size_links_per_element_; + size_t num_deleted_; size_t M_; size_t maxM_; @@ -112,20 +110,15 @@ namespace hnswlib { std::vector link_list_update_locks_; 
tableint enterpoint_node_; - size_t size_links_level0_; size_t offsetData_, offsetLevel0_; - char *data_level0_memory_; char **linkLists_; std::vector element_levels_; size_t data_size_; - bool has_deletions_; - - size_t label_offset_; DISTFUNC fstdistfunc_; void *dist_func_param_; @@ -547,7 +540,7 @@ namespace hnswlib { } } - if (has_deletions_) { + if (num_deleted_) { std::priority_queue> top_candidates1=searchBaseLayerST(currObj, query_data, ef_); top_candidates.swap(top_candidates1); @@ -623,8 +616,6 @@ namespace hnswlib { } void loadIndex(const std::string &location, SpaceInterface *s, size_t max_elements_i=0) { - - std::ifstream input(location, std::ios::binary); if (!input.is_open()) @@ -639,7 +630,7 @@ namespace hnswlib { readBinaryPOD(input, max_elements_); readBinaryPOD(input, cur_element_count); - size_t max_elements=max_elements_i; + size_t max_elements = max_elements_i; if(max_elements < cur_element_count) max_elements = max_elements_; max_elements_ = max_elements; @@ -688,26 +679,19 @@ namespace hnswlib { input.seekg(pos,input.beg); - data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_); if (data_level0_memory_ == nullptr) throw std::runtime_error("Not enough memory: loadIndex failed to allocate level0"); input.read(data_level0_memory_, cur_element_count * size_data_per_element_); - - - size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); - size_links_level0_ = maxM0_ * sizeof(tableint) + sizeof(linklistsizeint); std::vector(max_elements).swap(link_list_locks_); std::vector(max_update_element_locks).swap(link_list_update_locks_); - visited_list_pool_ = new VisitedListPool(1, max_elements); - linkLists_ = (char **) malloc(sizeof(void *) * max_elements); if (linkLists_ == nullptr) throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklists"); @@ -731,11 +715,9 @@ namespace hnswlib { } } - has_deletions_=false; - for (size_t i = 0; i < cur_element_count; i++) { 
if(isMarkedDeleted(i)) - has_deletions_=true; + num_deleted_ += 1; } input.close(); @@ -765,19 +747,19 @@ namespace hnswlib { } static const unsigned char DELETE_MARK = 0x01; -// static const unsigned char REUSE_MARK = 0x10; + // static const unsigned char REUSE_MARK = 0x10; /** * Marks an element with the given label deleted, does NOT really change the current graph. * @param label */ void markDelete(labeltype label) { - has_deletions_=true; auto search = label_lookup_.find(label); if (search == label_lookup_.end()) { throw std::runtime_error("Label not found"); } - markDeletedInternal(search->second); + tableint internalId = search->second; + markDeletedInternal(internalId); } /** @@ -786,8 +768,31 @@ namespace hnswlib { * @param internalId */ void markDeletedInternal(tableint internalId) { - unsigned char *ll_cur = ((unsigned char *)get_linklist0(internalId))+2; - *ll_cur |= DELETE_MARK; + assert(internalId < cur_element_count); + if (!isMarkedDeleted(internalId)) + { + unsigned char *ll_cur = ((unsigned char *)get_linklist0(internalId))+2; + *ll_cur |= DELETE_MARK; + num_deleted_ += 1; + } + else + { + throw std::runtime_error("The requested to delete element is already deleted"); + } + } + + /** + * Remove the deleted mark of the node, does NOT really change the current graph. 
+ * @param label + */ + void unmarkDelete(labeltype label) + { + auto search = label_lookup_.find(label); + if (search == label_lookup_.end()) { + throw std::runtime_error("Label not found"); + } + tableint internalId = search->second; + unmarkDeletedInternal(internalId); } /** @@ -795,8 +800,17 @@ namespace hnswlib { * @param internalId */ void unmarkDeletedInternal(tableint internalId) { - unsigned char *ll_cur = ((unsigned char *)get_linklist0(internalId))+2; - *ll_cur &= ~DELETE_MARK; + assert(internalId < cur_element_count); + if (isMarkedDeleted(internalId)) + { + unsigned char *ll_cur = ((unsigned char *)get_linklist0(internalId))+2; + *ll_cur &= ~DELETE_MARK; + num_deleted_ -= 1; + } + else + { + throw std::runtime_error("The requested to undelete element is not deleted"); + } } /** @@ -857,8 +871,8 @@ namespace hnswlib { } for (auto&& neigh : sNeigh) { -// if (neigh == internalId) -// continue; + // if (neigh == internalId) + // continue; std::priority_queue, std::vector>, CompareByFirst> candidates; size_t size = sCand.find(neigh) == sCand.end() ? 
sCand.size() : sCand.size() - 1; // sCand guaranteed to have size >= 1 @@ -1133,7 +1147,7 @@ namespace hnswlib { } std::priority_queue, std::vector>, CompareByFirst> top_candidates; - if (has_deletions_) { + if (num_deleted_) { top_candidates=searchBaseLayerST( currObj, query_data, std::max(ef_, k)); } diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 285b5185..48fdf475 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -70,16 +70,14 @@ inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn std::rethrow_exception(lastException); } } - - } - inline void assert_true(bool expr, const std::string & msg) { - if (expr == false) - throw std::runtime_error("Unpickle Error: "+msg); - return; - } +inline void assert_true(bool expr, const std::string & msg) { + if (expr == false) + throw std::runtime_error("Unpickle Error: "+msg); + return; +} template @@ -141,14 +139,12 @@ class Index { seed=random_seed; } - void set_ef(size_t ef) { default_ef=ef; if (appr_alg) appr_alg->ef_ = ef; } - void set_num_threads(int num_threads) { this->num_threads_default = num_threads; } @@ -207,14 +203,14 @@ class Index { if (!ids_.is_none()) { py::array_t < size_t, py::array::c_style | py::array::forcecast > items(ids_); auto ids_numpy = items.request(); - if(ids_numpy.ndim==1 && ids_numpy.shape[0]==rows) { + if(ids_numpy.ndim == 1 && ids_numpy.shape[0] == rows) { std::vector ids1(ids_numpy.shape[0]); for (size_t i = 0; i < ids1.size(); i++) { ids1[i] = items.data()[i]; } ids.swap(ids1); } - else if(ids_numpy.ndim==0 && rows==1) { + else if(ids_numpy.ndim == 0 && rows == 1) { ids.push_back(*items.data()); } else @@ -227,7 +223,7 @@ class Index { int start = 0; if (!ep_added) { size_t id = ids.size() ? 
ids.at(0) : (cur_l); - float *vector_data=(float *) items.data(0); + float *vector_data = (float *) items.data(0); std::vector norm_array(dim); if(normalize){ normalize_vector(vector_data, norm_array.data()); @@ -279,7 +275,6 @@ class Index { } std::vector getIdsList() { - std::vector ids; for(auto kv : appr_alg->label_lookup_) { @@ -290,9 +285,6 @@ class Index { py::dict getAnnData() const { /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ - - - std::unique_lock templock(appr_alg->global); unsigned int level0_npy_size = appr_alg->cur_element_count * appr_alg->size_data_per_element_; @@ -369,7 +361,7 @@ class Index { "mult"_a=appr_alg->mult_, "ef_construction"_a=appr_alg->ef_construction_, "ef"_a=appr_alg->ef_, - "has_deletions"_a=appr_alg->has_deletions_, + "has_deletions"_a=(bool)appr_alg->num_deleted_, "size_links_per_element"_a=appr_alg->size_links_per_element_, "label_lookup_external"_a=py::array_t( @@ -402,10 +394,7 @@ class Index { {sizeof(char)}, // C-style contiguous strides for double link_list_npy, // the data pointer free_when_done_ll) - ); - - } @@ -431,7 +420,6 @@ class Index { static Index * createFromParams(const py::dict d) { - // check serialization version assert_true(((int)py::int_(Index::ser_version)) >= d["ser_version"].cast(), "Invalid serialization version!"); @@ -466,8 +454,6 @@ class Index { } void setAnnData(const py::dict d) { /* WARNING: Index::setAnnData is not thread-safe with Index::addItems */ - - std::unique_lock templock(appr_alg->global); assert_true(appr_alg->offsetLevel0_ == d["offset_level0"].cast(), "Invalid value of offsetLevel0_ "); @@ -489,7 +475,6 @@ class Index { assert_true(appr_alg->ef_construction_ == d["ef_construction"].cast(), "Invalid value of ef_construction_ "); appr_alg->ef_ = d["ef"].cast(); - appr_alg->has_deletions_=d["has_deletions"].cast(); assert_true(appr_alg->size_links_per_element_ == d["size_links_per_element"].cast(), "Invalid value of size_links_per_element_ "); @@ -535,10 
+520,20 @@ class Index { } } + + // set num_deleted + appr_alg->num_deleted_ = 0; + bool has_deletions = d["has_deletions"].cast(); + if (has_deletions) + { + for (size_t i = 0; i < appr_alg->cur_element_count; i++) { + if(appr_alg->isMarkedDeleted(i)) + appr_alg->num_deleted_ += 1; + } + } } py::object knnQuery_return_numpy(py::object input, size_t k = 1, int num_threads = -1) { - py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); auto buffer = items.request(); hnswlib::labeltype *data_numpy_l; @@ -561,7 +556,6 @@ class Index { features = buffer.shape[0]; } - // avoid using threads when the number of searches is small: if(rows<=num_threads*4){ @@ -609,7 +603,6 @@ class Index { } ); } - } py::capsule free_when_done_l(data_numpy_l, [](void *f) { delete[] f; @@ -618,7 +611,6 @@ class Index { delete[] f; }); - return py::make_tuple( py::array_t( {rows, k}, // shape @@ -638,6 +630,10 @@ class Index { appr_alg->markDelete(label); } + void unmarkDeleted(size_t label) { + appr_alg->unmarkDelete(label); + } + void resizeIndex(size_t new_size) { appr_alg->resizeIndex(new_size); } @@ -649,11 +645,9 @@ class Index { size_t getCurrentCount() const { return appr_alg->cur_element_count; } - }; - PYBIND11_PLUGIN(hnswlib) { py::module m("hnswlib"); @@ -672,6 +666,7 @@ PYBIND11_PLUGIN(hnswlib) { .def("save_index", &Index::saveIndex, py::arg("path_to_index")) .def("load_index", &Index::loadIndex, py::arg("path_to_index"), py::arg("max_elements")=0) .def("mark_deleted", &Index::markDeleted, py::arg("label")) + .def("unmark_deleted", &Index::unmarkDeleted, py::arg("label")) .def("resize_index", &Index::resizeIndex, py::arg("new_size")) .def("get_max_elements", &Index::getMaxElements) .def("get_current_count", &Index::getCurrentCount) diff --git a/python_bindings/tests/bindings_test_labels.py b/python_bindings/tests/bindings_test_labels.py index 668d7694..87259f1f 100644 --- a/python_bindings/tests/bindings_test_labels.py +++ 
b/python_bindings/tests/bindings_test_labels.py @@ -94,23 +94,23 @@ def testRandomSelf(self): self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0) # Delete data1 - labels1, _ = p.knn_query(data1, k=1) + labels1_deleted, _ = p.knn_query(data1, k=1) - for l in labels1: + for l in labels1_deleted: p.mark_deleted(l[0]) labels2, _ = p.knn_query(data2, k=1) - items=p.get_items(labels2) + items = p.get_items(labels2) diff_with_gt_labels = np.mean(np.abs(data2-items)) self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-3) # console labels1_after, _ = p.knn_query(data1, k=1) for la in labels1_after: - for lb in labels1: + for lb in labels1_deleted: if la[0] == lb[0]: self.assertTrue(False) print("All the data in data1 are removed") - # checking saving/loading index with elements marked as deleted + # Checking saving/loading index with elements marked as deleted del_index_path = "with_deleted.bin" p.save_index(del_index_path) p = hnswlib.Index(space='l2', dim=dim) @@ -119,9 +119,16 @@ def testRandomSelf(self): labels1_after, _ = p.knn_query(data1, k=1) for la in labels1_after: - for lb in labels1: + for lb in labels1_deleted: if la[0] == lb[0]: self.assertTrue(False) + # Unmark deleted data + for l in labels1_deleted: + p.unmark_deleted(l[0]) + labels_restored, _ = p.knn_query(data1, k=1) + self.assertAlmostEqual(np.mean(labels_restored.reshape(-1) == np.arange(len(data1))), 1.0, 3) + print("All the data in data1 are restored") + os.remove(index_path) os.remove(del_index_path) From cca297ae59d8c3ca73fdb15fd90ccfa86d095701 Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Wed, 24 Nov 2021 23:01:33 -0800 Subject: [PATCH 24/25] update documents for the release --- ALGO_PARAMS.md | 2 ++ README.md | 34 ++++++++++++++++++++++------------ setup.py | 2 +- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/ALGO_PARAMS.md b/ALGO_PARAMS.md index 2b058563..b0a6b7ad 100644 --- a/ALGO_PARAMS.md +++ b/ALGO_PARAMS.md @@ -9,6 +9,8 @@ The 
```knn_query``` function returns two numpy arrays, containing labels and dis elements for the queries. Note that in case the algorithm is not be able to find ```k``` neighbors to all of the queries, (this can be due to problems with graph or ```k```>size of the dataset) an exception is thrown. +An example of tuning the parameters can be found in [TESTING_RECALL.md](TESTING_RECALL.md) + ## Construction parameters: * ```M``` - the number of bi-directional links created for every new element during construction. Reasonable range for ```M``` is 2-100. Higher ```M``` work better on datasets with high intrinsic dimensionality and/or high recall, while low ```M``` work diff --git a/README.md b/README.md index 4ca5584d..a23a2e57 100644 --- a/README.md +++ b/README.md @@ -3,21 +3,20 @@ Header-only C++ HNSW implementation with python bindings. **NEWS:** -* **Hnswlib is now 0.5.2**. Bugfixes - thanks [@marekhanus](https://github.com/marekhanus) for fixing the missing arguments, adding support for python 3.8, 3.9 in Travis, improving python wrapper and fixing typos/code style; [@apoorv-sharma](https://github.com/apoorv-sharma) for fixing the bug int the insertion/deletion logic; [@shengjun1985](https://github.com/shengjun1985) for simplifying the memory reallocation logic; [@TakaakiFuruse](https://github.com/TakaakiFuruse) for improved description of `add_items`; [@psobot ](https://github.com/psobot) for improving error handling; [@ShuAiii](https://github.com/ShuAiii) for reporting the bug in the python interface +**version 0.6** +* Thanks to ([@dyashuni](https://github.com/dyashuni)) hnswlib now uses github actions for CI, there is a search speedup in some scenarios with deletions. `unmark_deleted(label)` is now also a part of the python interface (note now it throws an exception for double deletions). +* Thanks to ([@slice4e](https://github.com/slice4e)) we now support AVX512; thanks to ([@LTLA](https://github.com/LTLA)) the cmake interface for the lib is now updated. 
+* Thanks to ([@alonre24](https://github.com/alonre24)) we now have python bindings for brute-force search (and examples for recall tuning: [TESTING_RECALL.md](TESTING_RECALL.md)). +* Thanks to ([@dorosy-yeong](https://github.com/dorosy-yeong)) a bug is fixed in the handling of large quantities of deleted elements and large K. -* **Hnswlib is now 0.5.0**. Added support for pickling indices, support for PEP-517 and PEP-518 building, small speedups, bug and documentation fixes. Many thanks to [@dbespalov](https://github.com/dbespalov), [@dyashuni](https://github.com/dyashuni), [@groodt](https://github.com/groodt),[@uestc-lfs](https://github.com/uestc-lfs), [@vinnitu](https://github.com/vinnitu), [@fabiencastan](https://github.com/fabiencastan), [@JinHai-CN](https://github.com/JinHai-CN), [@js1010](https://github.com/js1010)! - -* **Thanks to Apoorv Sharma [@apoorv-sharma](https://github.com/apoorv-sharma), hnswlib now supports true element updates (the interface remained the same, but when you the performance/memory should not degrade as you update the element embeddings).** - -* **Thanks to Dmitry [@2ooom](https://github.com/2ooom), hnswlib got a boost in performance for vector dimensions that are not multiple of 4** + -* **Thanks to Louis Abraham ([@louisabraham](https://github.com/louisabraham)) hnswlib can now be installed via pip!** -Highlights: -1) Lightweight, header-only, no dependencies other than C++ 11. -2) Interfaces for C++, python and R (https://github.com/jlmelville/rcpphnsw). +### Highlights: +1) Lightweight, header-only, no dependencies other than C++ 11. +2) Interfaces for C++, Java, Python and R (https://github.com/jlmelville/rcpphnsw). 3) Has full support for incremental index construction. Has support for element deletions -(currently, without actual freeing of the memory). +(by marking them in the index). Index is picklable. 4) Can work with custom user defined distances (C++).
5) Significantly less memory footprint and faster build time compared to current nmslib's implementation. @@ -53,7 +52,9 @@ For other spaces use the nmslib library https://github.com/nmslib/nmslib. - If index already has the elements with the same labels, their features will be updated. Note that update procedure is slower than insertion of a new element, but more memory- and query-efficient. * Thread-safe with other `add_items` calls, but not with `knn_query`. -* `mark_deleted(label)` - marks the element as deleted, so it will be omitted from search results. +* `mark_deleted(label)` - marks the element as deleted, so it will be omitted from search results. Throws an exception if it is already deleted. +* +* `unmark_deleted(label)` - unmarks the element as deleted, so it will no longer be omitted from search results. * `resize_index(new_size)` - changes the maximum capacity of the index. Not thread safe with `add_items` and `knn_query`. @@ -225,6 +226,15 @@ pip install . or you can install via pip: `pip install hnswlib` + +### For developers + +When making changes please run tests (and please add a test to `python_bindings/tests` in case there is new functionality): +```bash +python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py" +``` + + ### Other implementations * Non-metric space library (nmslib) - main library(python, C++), supports exotic distances: https://github.com/nmslib/nmslib * Faiss library by facebook, uses own HNSW implementation for coarse quantization (python, C++): diff --git a/setup.py b/setup.py index 92a8ee61..90826dea 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ from setuptools import Extension, setup from setuptools.command.build_ext import build_ext -__version__ = '0.5.2' +__version__ = '0.6.0' include_dirs = [ From bcbcb5d5080b5cec6e247650c2f5162aecccc3d9 Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Thu, 9 Dec 2021 13:33:46 -0800 Subject: [PATCH 25/25] add performance test for commits ---
examples/git_tester.py | 30 ++++++++++++++++++++
examples/speedtest.py  | 62 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 92 insertions(+)
create mode 100644 examples/git_tester.py
create mode 100644 examples/speedtest.py

diff --git a/examples/git_tester.py b/examples/git_tester.py
new file mode 100644
index 00000000..7891ef20
--- /dev/null
+++ b/examples/git_tester.py
@@ -0,0 +1,30 @@
+from pydriller import Repository
+import os
+import datetime
+import shutil
+import subprocess
+import sys
+
+# (vector dimension, query threads) pairs benchmarked for every commit.
+CONFIGS = [(4, 1), (64, 1), (128, 1), (4, 24), (128, 24)]
+
+# Work on a copy of the benchmark script: checking out other commits rewrites
+# the working tree, and the untracked copy is left alone by `git checkout`.
+shutil.copy("examples/speedtest.py", "examples/speedtest2.py")
+for commit in Repository('.', from_tag="v0.5.2").traverse_commits():
+    print(commit.hash)
+    print(commit.msg)
+
+    # Commands are passed as argument lists (no shell), so quotes, backticks
+    # or "$(...)" inside a commit message cannot break or inject commands.
+    checkout = subprocess.run(["git", "checkout", commit.hash])
+    shutil.rmtree("build", ignore_errors=True)
+    install = subprocess.run([sys.executable, "-m", "pip", "install", "."])
+    if checkout.returncode != 0 or install.returncode != 0:
+        # Benchmarking now would time a stale build under this commit's name.
+        print(f"skipping {commit.hash}: checkout or install failed")
+        continue
+
+    for dim, threads in CONFIGS:
+        subprocess.run([sys.executable, "examples/speedtest2.py",
+                        "-n", commit.msg, "-d", str(dim), "-t", str(threads)])
diff --git a/examples/speedtest.py b/examples/speedtest.py
new file mode 100644
index 00000000..cf8e6085
--- /dev/null
+++ b/examples/speedtest.py
@@ -0,0 +1,62 @@
+import hnswlib
+import numpy as np
+import os.path
+import time
+import argparse
+
+# Command-line options. All three are required: the data size, the log label
+# and the log file name are derived from them.
+ap = argparse.ArgumentParser()
+ap.add_argument('-d', type=int, required=True, help='vector dimension')
+ap.add_argument('-n', required=True, help='label appended to the log line')
+ap.add_argument('-t', type=int, required=True, help='number of query threads')
+args = ap.parse_args()
+dim = args.d
+name = args.n
+threads = args.t
+# Keeps the total number of generated floats at about 4M for every dimension.
+num_elements = 1000000 * 4 // dim
+
+# Generating sample data (fixed seed, so every run gets the same vectors)
+np.random.seed(1)
+data = np.float32(np.random.random((num_elements, dim)))
+
+index_path = f'speed_index{dim}.bin'
+# Declaring index
+p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
+
+# Build and save the index only if no saved file exists for this dimension.
+if not os.path.isfile(index_path):
+    p.init_index(max_elements=num_elements, ef_construction=100, M=16)
+
+    # Controlling the recall by setting ef:
+    # higher ef leads to better accuracy, but slower search
+    p.set_ef(10)
+
+    # Threads used while building the index (independent of -t)
+    p.set_num_threads(12)
+
+    p.add_items(data)
+
+    print("Saving index to '%s'" % index_path)
+    p.save_index(index_path)
+
+p.set_num_threads(threads)
+times = []
+# NOTE(review): presumably a pause to let the machine settle before timing.
+time.sleep(10)
+p.set_ef(100)
+for _ in range(3):
+    p.load_index(index_path)
+    for _ in range(10):
+        # perf_counter() is monotonic and high-resolution, unlike time.time()
+        t0 = time.perf_counter()
+        labels, distances = p.knn_query(data, k=1)
+        tt = time.perf_counter() - t0
+        times.append(tt)
+        print(f"{tt} seconds")
+str_out = f"mean time:{np.mean(times)}, median time:{np.median(times)}, std time {np.std(times)} {name}"
+print(str_out)
+# Appending: repeated runs with the same -d/-t add lines to the same log file.
+with open(f"log_{dim}_t{threads}.txt", "a") as f:
+    f.write(str_out + "\n")