From fa3233876eaabdc6d6dfd9ab1c1a901c3956ba65 Mon Sep 17 00:00:00 2001 From: uestc-lfs Date: Sat, 27 Jun 2020 23:26:11 +0800 Subject: [PATCH 01/58] currObj should be updated as the closest from all candidates. --- hnswlib/hnswalg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 97bdcd18..9aecbe1b 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -406,7 +406,7 @@ namespace hnswlib { top_candidates.pop(); } - tableint next_closest_entry_point = selectedNeighbors[0]; + tableint next_closest_entry_point = selectedNeighbors.back(); { linklistsizeint *ll_cur; From b6b338e661c245dc155b15b87b429c8b5c85f8cc Mon Sep 17 00:00:00 2001 From: uestc-lfs Date: Sun, 28 Jun 2020 00:05:24 +0800 Subject: [PATCH 02/58] 1. Replace the template interface searchKnn with virtual interface 2. add assert.h, or it will not compile --- hnswlib/hnswalg.h | 28 ++++++++++++---------------- hnswlib/hnswlib.h | 5 ++--- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 97bdcd18..2db735d9 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -8,6 +8,7 @@ #include #include +#include namespace hnswlib { typedef unsigned int tableint; @@ -406,7 +407,7 @@ namespace hnswlib { top_candidates.pop(); } - tableint next_closest_entry_point = selectedNeighbors[0]; + tableint next_closest_entry_point = selectedNeighbors.back(); { linklistsizeint *ll_cur; @@ -1156,24 +1157,19 @@ namespace hnswlib { return result; }; - template - std::vector> - searchKnn(const void* query_data, size_t k, Comp comp) { - std::vector> result; - if (cur_element_count == 0) return result; - - auto ret = searchKnn(query_data, k); - - while (!ret.empty()) { - result.push_back(ret.top()); - ret.pop(); + int searchKnn(const void* x, + int k, labeltype* labels, dist_t* dists = nullptr) const override { + if (labels == nullptr) return -1; + auto ret = searchKnn(x, k); + for (int i = k - 1; i >= 0; --i) { + if
(dists) + dists[i] = ret.top().first; + labels[i] = ret.top().second; } - - std::sort(result.begin(), result.end(), comp); - - return result; + return 0; } + void checkIntegrity(){ int connections_checked=0; std::vector inbound_connections_num(cur_element_count,0); diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index c26f80b5..6ef54495 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -71,9 +71,8 @@ namespace hnswlib { public: virtual void addPoint(const void *datapoint, labeltype label)=0; virtual std::priority_queue> searchKnn(const void *, size_t) const = 0; - template - std::vector> searchKnn(const void*, size_t, Comp) { - } + virtual int searchKnn(const void* x, + int k, labeltype* labels, dist_t* dists) const = 0; virtual void saveIndex(const std::string &location)=0; virtual ~AlgorithmInterface(){ } From 898718801e91c8f04ef734a6dbecec0909fed3b2 Mon Sep 17 00:00:00 2001 From: uestc-lfs Date: Sun, 28 Jun 2020 00:08:09 +0800 Subject: [PATCH 03/58] minor fix --- hnswlib/hnswalg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 2db735d9..342e4ad5 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -407,7 +407,7 @@ namespace hnswlib { top_candidates.pop(); } - tableint next_closest_entry_point = selectedNeighbors.back(); + tableint next_closest_entry_point = selectedNeighbors[0]; { linklistsizeint *ll_cur; From a35fcb507613cfd930aa572483069d00d5c01d4f Mon Sep 17 00:00:00 2001 From: Jack Wimberley Date: Thu, 9 Jul 2020 09:28:24 -0400 Subject: [PATCH 04/58] Adding cassert include in header to fix compilation error on Ubuntu 18.04 with g++ 7.3.0 --- hnswlib/hnswalg.h | 1 + 1 file changed, 1 insertion(+) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 97bdcd18..3c8f6cdb 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include From 4a4689c82446fa651ac4ad02c3053a13f3d729c2 Mon Sep 17 00:00:00 2001 From: Jack 
Wimberley Date: Thu, 9 Jul 2020 16:43:59 -0400 Subject: [PATCH 05/58] Small patch to enable compilation with sign_compare and reorder warning flags enabled --- hnswlib/hnswalg.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 97bdcd18..9f9faa72 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -26,7 +26,7 @@ namespace hnswlib { } HierarchicalNSW(SpaceInterface *s, size_t max_elements, size_t M = 16, size_t ef_construction = 200, size_t random_seed = 100) : - link_list_locks_(max_elements), element_levels_(max_elements), link_list_update_locks_(max_update_element_locks) { + link_list_locks_(max_elements), link_list_update_locks_(max_update_element_locks), element_levels_(max_elements) { max_elements_ = max_elements; has_deletions_=false; @@ -868,12 +868,12 @@ namespace hnswlib { // continue; std::priority_queue, std::vector>, CompareByFirst> candidates; - int size = sCand.find(neigh) == sCand.end() ? sCand.size() : sCand.size() - 1; - int elementsToKeep = std::min(int(ef_construction_), size); + size_t size = sCand.find(neigh) == sCand.end() ? 
sCand.size() : sCand.size() - 1; // sCand guaranteed to have size >= 1 + size_t elementsToKeep = std::min(ef_construction_, size); for (auto&& cand : sCand) { if (cand == neigh) continue; - + dist_t distance = fstdistfunc_(getDataByInternalId(neigh), getDataByInternalId(cand), dist_func_param_); if (candidates.size() < elementsToKeep) { candidates.emplace(distance, cand); @@ -892,7 +892,7 @@ namespace hnswlib { std::unique_lock lock(link_list_locks_[neigh]); linklistsizeint *ll_cur; ll_cur = get_linklist_at_level(neigh, layer); - int candSize = candidates.size(); + size_t candSize = candidates.size(); setListCount(ll_cur, candSize); tableint *data = (tableint *) (ll_cur + 1); for (size_t idx = 0; idx < candSize; idx++) { From 6f2c3fbbd73e89f478abbab6214ab3f8378cb847 Mon Sep 17 00:00:00 2001 From: Fabien Castan Date: Wed, 19 Aug 2020 18:19:09 +0200 Subject: [PATCH 06/58] L2SqrI: add fallback if the dimension is not a multiple of 4 --- hnswlib/space_l2.h | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h index bc00af72..e86e13b0 100644 --- a/hnswlib/space_l2.h +++ b/hnswlib/space_l2.h @@ -204,7 +204,7 @@ namespace hnswlib { }; static int - L2SqrI(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) { + L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) { size_t qty = *((size_t *) qty_ptr); int res = 0; @@ -226,12 +226,23 @@ namespace hnswlib { res += ((*a) - (*b)) * ((*a) - (*b)); a++; b++; + } + return (res); + } + static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) { + size_t qty = *((size_t*)qty_ptr); + int res = 0; + unsigned char* a = (unsigned char*)pVect1; + unsigned char* b = (unsigned char*)pVect2; + for(size_t i = 0; i < qty; i++) + { + res += ((*a) - (*b)) * ((*a) - (*b)); + a++; + b++; } - return (res); - } class L2SpaceI : 
public SpaceInterface { @@ -241,7 +252,12 @@ namespace hnswlib { size_t dim_; public: L2SpaceI(size_t dim) { - fstdistfunc_ = L2SqrI; + if(dim % 4 == 0) { + fstdistfunc_ = L2SqrI4x; + } + else { + fstdistfunc_ = L2SqrI; + } dim_ = dim; data_size_ = dim * sizeof(unsigned char); } From cb7b398b04828273cc6a3da88e00fe8ef389da92 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 12 Oct 2020 01:27:30 +0000 Subject: [PATCH 07/58] New methods loadIndexFromStream and saveIndexToStream expose de-/serialization logic of HierarchicalNSW class via std::i/ostream. --- hnswlib/hnswalg.h | 63 +++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 7d0eb443..d2c36f0c 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -26,6 +26,10 @@ namespace hnswlib { loadIndex(location, s, max_elements); } + HierarchicalNSW(SpaceInterface *s, std::istream & input, bool nmslib = false, size_t max_elements=0) { + loadIndexFromStream(input, s, max_elements); + } + HierarchicalNSW(SpaceInterface *s, size_t max_elements, size_t M = 16, size_t ef_construction = 200, size_t random_seed = 100) : link_list_locks_(max_elements), link_list_update_locks_(max_update_element_locks), element_levels_(max_elements) { max_elements_ = max_elements; @@ -57,8 +61,6 @@ namespace hnswlib { visited_list_pool_ = new VisitedListPool(1, max_elements); - - //initializations for special treatment of the first node enterpoint_node_ = -1; maxlevel_ = -1; @@ -102,6 +104,8 @@ namespace hnswlib { double mult_, revSize_; int maxlevel_; + std::mutex global; + size_t ef_; VisitedListPool *visited_list_pool_; std::mutex cur_element_count_guard_; @@ -511,8 +515,6 @@ namespace hnswlib { return next_closest_entry_point; } - std::mutex global; - size_t ef_; void setEf(size_t ef) { ef_ = ef; @@ -598,10 +600,7 @@ namespace hnswlib { max_elements_=new_max_elements; } - - void saveIndex(const std::string &location) { - std::ofstream 
output(location, std::ios::binary); - std::streampos position; + void saveIndexToStream(std::ostream &output) { writeBinaryPOD(output, offsetLevel0_); writeBinaryPOD(output, max_elements_); @@ -626,17 +625,17 @@ namespace hnswlib { if (linkListSize) output.write(linkLists_[i], linkListSize); } - output.close(); - } - - void loadIndex(const std::string &location, SpaceInterface *s, size_t max_elements_i=0) { + } - std::ifstream input(location, std::ios::binary); - - if (!input.is_open()) - throw std::runtime_error("Cannot open file"); + void saveIndex(const std::string &location) { + std::ofstream output(location, std::ios::binary); + std::streampos position; + saveIndexToStream(output); + output.close(); + } + void loadIndexFromStream(std::istream & input, SpaceInterface *s, size_t max_elements_i=0) { // get file size: input.seekg(0,input.end); @@ -663,14 +662,12 @@ namespace hnswlib { readBinaryPOD(input, mult_); readBinaryPOD(input, ef_construction_); - data_size_ = s->get_data_size(); fstdistfunc_ = s->get_dist_func(); dist_func_param_ = s->get_dist_func_param(); auto pos=input.tellg(); - /// Optional - check if index is ok: input.seekg(cur_element_count * size_data_per_element_,input.cur); @@ -696,15 +693,11 @@ namespace hnswlib { input.seekg(pos,input.beg); - data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_); if (data_level0_memory_ == nullptr) throw std::runtime_error("Not enough memory: loadIndex failed to allocate level0"); input.read(data_level0_memory_, cur_element_count * size_data_per_element_); - - - size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); @@ -715,7 +708,6 @@ namespace hnswlib { visited_list_pool_ = new VisitedListPool(1, max_elements); - linkLists_ = (char **) malloc(sizeof(void *) * max_elements); if (linkLists_ == nullptr) throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklists"); @@ -746,11 +738,22 @@ namespace hnswlib { has_deletions_=true; } - 
input.close(); return; } + + + void loadIndex(const std::string &location, SpaceInterface *s, size_t max_elements_i=0) { + std::ifstream input(location, std::ios::binary); + if (!input.is_open()) + throw std::runtime_error("Cannot open file"); + + loadIndexFromStream(input, s, max_elements_i); + input.close(); + return; + } + template std::vector getDataByLabel(labeltype label) { @@ -874,7 +877,7 @@ namespace hnswlib { for (auto&& cand : sCand) { if (cand == neigh) continue; - + dist_t distance = fstdistfunc_(getDataByInternalId(neigh), getDataByInternalId(cand), dist_func_param_); if (candidates.size() < elementsToKeep) { candidates.emplace(distance, cand); @@ -1137,7 +1140,7 @@ namespace hnswlib { } std::priority_queue, std::vector>, CompareByFirst> top_candidates; - if (has_deletions_) { + if (has_deletions_) { top_candidates=searchBaseLayerST( currObj, query_data, std::max(ef_, k)); } @@ -1186,19 +1189,19 @@ namespace hnswlib { std::unordered_set s; for (int j=0; j 0); - assert(data[j] < cur_element_count); + assert(data[j] < cur_element_count); assert (data[j] != i); inbound_connections_num[data[j]]++; s.insert(data[j]); connections_checked++; - + } assert(s.size() == size); } } if(cur_element_count > 1){ int min1=inbound_connections_num[0], max1=inbound_connections_num[0]; - for(int i=0; i < cur_element_count; i++){ + for(int i=0; i < cur_element_count; i++){ assert(inbound_connections_num[i] > 0); min1=std::min(inbound_connections_num[i],min1); max1=std::max(inbound_connections_num[i],max1); @@ -1206,7 +1209,7 @@ namespace hnswlib { std::cout << "Min inbound: " << min1 << ", Max inbound:" << max1 << "\n"; } std::cout << "integrity ok, checked " << connections_checked << " connections\n"; - + } }; From e161db863efba6e39c4f2cd5013a6de9f439fb12 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 12 Oct 2020 01:28:27 +0000 Subject: [PATCH 08/58] Implement __getstate__ and __setstate__ to allow pickling of hnswlib.Index objects; add new properties to Index class: 
space_name, dim, max_elements, element_count, ef_construction, M, num_threads, ef. Properties num_threads and ef are read-write-able, other parameters are read-only. --- python_bindings/bindings.cpp | 176 +++++++++++++++++++++++------------ 1 file changed, 119 insertions(+), 57 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 1b88ca23..84839cd6 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -13,7 +13,7 @@ namespace py = pybind11; * only handles a subset of functionality (no reductions etc) * Process ids from start (inclusive) to end (EXCLUSIVE) * - * The method is borrowed from nmslib + * The method is borrowed from nmslib */ template inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn) { @@ -74,24 +74,24 @@ inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn template class Index { public: - Index(const std::string &space_name, const int dim) : - space_name(space_name), dim(dim) { - normalize=false; - if(space_name=="l2") { - l2space = new hnswlib::L2Space(dim); - } - else if(space_name=="ip") { - l2space = new hnswlib::InnerProductSpace(dim); - } - else if(space_name=="cosine") { - l2space = new hnswlib::InnerProductSpace(dim); - normalize=true; - } - appr_alg = NULL; - ep_added = true; - index_inited = false; - num_threads_default = std::thread::hardware_concurrency(); + Index(const std::string &space_name, const int dim) : + space_name(space_name), dim(dim) { + normalize=false; + if(space_name=="l2") { + l2space = new hnswlib::L2Space(dim); } + else if(space_name=="ip") { + l2space = new hnswlib::InnerProductSpace(dim); + } + else if(space_name=="cosine") { + l2space = new hnswlib::InnerProductSpace(dim); + normalize=true; + } + appr_alg = NULL; + ep_added = true; + index_inited = false; + num_threads_default = std::thread::hardware_concurrency(); + } void init_new_index(const size_t maxElements, const size_t M, const size_t 
efConstruction, const size_t random_seed) { if (appr_alg) { @@ -103,17 +103,12 @@ class Index { ep_added = false; } + + void set_ef(size_t ef) { appr_alg->ef_ = ef; } - size_t get_ef_construction() { - return appr_alg->ef_construction_; - } - - size_t get_M() { - return appr_alg->M_; - } void set_num_threads(int num_threads) { this->num_threads_default = num_threads; @@ -122,15 +117,28 @@ class Index { void saveIndex(const std::string &path_to_index) { appr_alg->saveIndex(path_to_index); } + void saveIndexToStream(std::ostream & output) const { + appr_alg->saveIndexToStream(output); + } + + void loadIndexFromStream(std::istream & input, size_t max_elements) { + if (appr_alg) { + std::cerr<<"Warning: Calling load_index from istream for an already inited index. Old index is being deallocated." << std::endl; + delete appr_alg; + } + appr_alg = new hnswlib::HierarchicalNSW(l2space, input, false, max_elements); + cur_l = appr_alg->cur_element_count; + } void loadIndex(const std::string &path_to_index, size_t max_elements) { - if (appr_alg) { - std::cerr<<"Warning: Calling load_index for an already inited index. Old index is being deallocated."; - delete appr_alg; - } - appr_alg = new hnswlib::HierarchicalNSW(l2space, path_to_index, false, max_elements); - cur_l = appr_alg->cur_element_count; + if (appr_alg) { + std::cerr<<"Warning: Calling load_index for an already inited index. Old index is being deallocated."; + delete appr_alg; + } + appr_alg = new hnswlib::HierarchicalNSW(l2space, path_to_index, false, max_elements); + cur_l = appr_alg->cur_element_count; } + void normalize_vector(float *data, float *norm_array){ float norm=0.0f; for(int i=0;i norm_array(dim); - if(normalize){ - normalize_vector(vector_data, norm_array.data()); - vector_data = norm_array.data(); - - } - appr_alg->addPoint((void *) vector_data, (size_t) id); - start = 1; - ep_added = true; + int start = 0; + if (!ep_added) { + size_t id = ids.size() ? 
ids.at(0) : (cur_l); + float *vector_data=(float *) items.data(0); + std::vector norm_array(dim); + if(normalize){ + normalize_vector(vector_data, norm_array.data()); + vector_data = norm_array.data(); } + appr_alg->addPoint((void *) vector_data, (size_t) id); + start = 1; + ep_added = true; + } py::gil_scoped_release l; if(normalize==false) { @@ -214,7 +221,7 @@ class Index { std::vector norm_array(num_threads * dim); ParallelFor(start, rows, num_threads, [&](size_t row, size_t threadId) { // normalize vector: - size_t start_idx = threadId * dim; + size_t start_idx = threadId * dim; normalize_vector((float *) items.data(row), (norm_array.data()+start_idx)); size_t id = ids.size() ? ids.at(row) : (cur_l+row); @@ -370,7 +377,6 @@ class Index { std::string space_name; int dim; - bool index_inited; bool ep_added; bool normalize; @@ -386,31 +392,87 @@ class Index { } }; + + PYBIND11_PLUGIN(hnswlib) { py::module m("hnswlib"); py::class_>(m, "Index") .def(py::init(), py::arg("space"), py::arg("dim")) - .def("init_index", &Index::init_new_index, py::arg("max_elements"), py::arg("M")=16, - py::arg("ef_construction")=200, py::arg("random_seed")=100) + .def("init_index", &Index::init_new_index, py::arg("max_elements"), py::arg("M")=16, py::arg("ef_construction")=200, py::arg("random_seed")=100) .def("knn_query", &Index::knnQuery_return_numpy, py::arg("data"), py::arg("k")=1, py::arg("num_threads")=-1) .def("add_items", &Index::addItems, py::arg("data"), py::arg("ids") = py::none(), py::arg("num_threads")=-1) .def("get_items", &Index::getDataReturnList, py::arg("ids") = py::none()) .def("get_ids_list", &Index::getIdsList) .def("set_ef", &Index::set_ef, py::arg("ef")) - .def("get_ef_construction", &Index::get_ef_construction) - .def("get_M", &Index::get_M) .def("set_num_threads", &Index::set_num_threads, py::arg("num_threads")) .def("save_index", &Index::saveIndex, py::arg("path_to_index")) .def("load_index", &Index::loadIndex, py::arg("path_to_index"), 
py::arg("max_elements")=0) .def("mark_deleted", &Index::markDeleted, py::arg("label")) .def("resize_index", &Index::resizeIndex, py::arg("new_size")) - .def("get_max_elements", &Index::getMaxElements) - .def("get_current_count", &Index::getCurrentCount) - .def("__repr__", - [](const Index &a) { - return ""; - } - ); + .def_readonly("space_name", &Index::space_name) + .def_readonly("dim", &Index::dim) + .def_readwrite("num_threads", &Index::num_threads_default) + .def_property("ef", + [](const Index & index) { + return index.index_inited ? index.appr_alg->ef_ : 10; + }, + [](Index & index, const size_t ef_) { + if (index.index_inited) + index.appr_alg->ef_ = ef_; + else + throw std::runtime_error("must call init_index prior to setting ef parameter"); + }) + .def_property_readonly("max_elements", [](const Index & index) { + return index.index_inited ? index.appr_alg->max_elements_ : 0; + }) + .def_property_readonly("element_count", [](const Index & index) { + return index.index_inited ? index.appr_alg->cur_element_count : 0; + }) + .def_property_readonly("ef_construction", [](const Index & index) { + return index.index_inited ? index.appr_alg->ef_construction_ : 0; + }) + .def_property_readonly("M", [](const Index & index) { + return index.index_inited ? index.appr_alg->M_ : 0; + }) + .def("__getstate__", [](const Index & index) { // __getstate__ + std::stringstream output(std::stringstream::out|std::stringstream::binary); + + + if (index.index_inited) + index.saveIndexToStream(output); + + /* Return a tuple that fully encodes the state of the object */ + return py::make_tuple(index.space_name, index.dim, + index.index_inited, index.ep_added, + index.normalize, index.num_threads_default, + py::bytes(output.str()), + index.index_inited == false ? 10 : index.appr_alg->ef_, + index.index_inited == false ? 0 : index.appr_alg->max_elements_, + index.index_inited == false ? 
0 : index.appr_alg->cur_element_count + ); + }) + .def("__setstate__", [](Index & index, py::tuple t) { // __setstate__ + if (t.size() != 10) + throw std::runtime_error("Invalid state!"); + + /* Invoke Index constructor (need to use in-place version) */ + new (&index) Index(t[0].cast(), t[1].cast()); + index.index_inited=t[2].cast(); + index.ep_added=t[3].cast(); + index.normalize=t[4].cast(); + index.num_threads_default=t[5].cast(); + + if (index.index_inited){ + std::stringstream input(t[6].cast(), std::stringstream::in|std::stringstream::binary); + index.loadIndexFromStream(input, t[8].cast()); // use max_elements from state + index.appr_alg->ef_=(t[7].cast()); + } + + }) + .def("__repr__", [](const Index &a) { + return ""; + }); + return m.ptr(); } From e0eacad7d008a12c5e16d14a1a320b99352fca08 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 12 Oct 2020 01:30:34 +0000 Subject: [PATCH 09/58] Verify knn_query results match before/after pickling hnswlib.Index objects; use brute-force knn search to verify knn_query gives recall of (almost) 100% --- python_bindings/tests/bindings_test_pickle.py | 144 ++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 python_bindings/tests/bindings_test_pickle.py diff --git a/python_bindings/tests/bindings_test_pickle.py b/python_bindings/tests/bindings_test_pickle.py new file mode 100644 index 00000000..0d57d946 --- /dev/null +++ b/python_bindings/tests/bindings_test_pickle.py @@ -0,0 +1,144 @@ +import unittest + +import numpy as np + + +def get_dist(metric, pt1, pt2): + if metric == 'l2': + return np.sum((pt1-pt2)**2) + elif metric == 'ip': + return 1. - np.sum(np.multiply(pt1,pt2)) + elif metric == 'cosine': + return 1. 
- np.sum(np.multiply(pt1,pt2)) / (np.sum(pt1**2) * np.sum(pt2**2))**.5 + +def brute_force_distances(metric, items, query_items, k): + dists=np.zeros((query_items.shape[0], items.shape[0])) + for ii in range(items.shape[0]): + for jj in range(query_items.shape[0]): + dists[jj,ii]=get_dist(metric, items[ii, :], query_items[jj, :]) + + labels = np.argsort(dists, axis=1) + dists = np.sort(dists, axis=1) + + + return labels[:,:k], dists[:,:k] + + +class PickleSelfTestCase(unittest.TestCase): + + def check_ann_results(self, metric, items, query_items, k, ann_l, ann_d, err_thresh=0, total_thresh=0, dists_thresh=0): + brute_l, brute_d = brute_force_distances(metric, items, query_items, k) + err_total = 0 + for jj in range(query_items.shape[0]): + err = np.sum(np.isin(brute_l[jj, :], ann_l[jj, :], invert=True)) + if err > 0: + print(f"Warning: {err} labels are missing from ann results (k={k}, err_thresh={err_thresh})") + + if err > err_thresh: + err_total += 1 + + self.assertLessEqual( err_total, total_thresh, f"Error: knn_query returned incorrect labels for {err_total} items (k={k})") + + wrong_dists=np.sum(((brute_d- ann_d)**2.)>1e-3) + if wrong_dists > 0: + dists_count=brute_d.shape[0]*brute_d.shape[1] + print(f"Warning: {wrong_dists} ann distance values are different from brute-force values (total # of values={dists_count}, dists_thresh={dists_thresh})") + + self.assertLessEqual( wrong_dists, dists_thresh, msg=f"Error: {wrong_dists} ann distance values are different from brute-force values") + + def testPickle(self): + import hnswlib + import pickle + + ef_construction = 725 + M = 64 + ef = 725 + + num_elements = 5000 + num_test_elements = 100 + + num_threads = 4 + k = 15 + + label_err_thresh=5 ### max number of missing labels allowed per test item + item_err_thresh=5 ### max number of items allowed with incorrect labels + + dists_err_thresh=50 ### for two matrices, d1 and d2, dists_err_thresh controls max + ### number of value pairs that are allowed to be different in 
d1 and d2 + ### i.e., number of values that are (d1-d2)**2>1e-3 + + for space,dim in [('ip', 48), ('l2', 152), ('cosine', 512)]: + + # Generating sample data + data = np.float32(np.random.random((num_elements, dim))) + test_data = np.float32(np.random.random((num_test_elements, dim))) + + # Declaring index + p = hnswlib.Index(space=space, dim=dim) # possible options are l2, cosine or ip + print(f"Running pickle tests for {p}") + + p.num_threads=num_threads # by default using all available cores + + p0=pickle.loads(pickle.dumps(p)) ### pickle un-initialized Index + p.init_index(max_elements = num_elements, ef_construction = ef_construction, M = M) + p0.init_index(max_elements = num_elements, ef_construction = ef_construction, M = M) + + p.ef=ef ### Note: ef parameter can be set only after calling p.init_index, + p0.ef=ef ### so we have to set p0.ef + + p1=pickle.loads(pickle.dumps(p)) ### pickle Index before adding items + + ### add items to ann index p,p0,p1 + p.add_items(data) + p1.add_items(data) + p0.add_items(data) + + p2=pickle.loads(pickle.dumps(p)) ### pickle Index after adding items + + self.assertTrue(np.allclose(p.get_items(), p0.get_items()), "items for p and p0 must be same") + self.assertTrue(np.allclose(p0.get_items(), p1.get_items()), "items for p0 and p1 must be same") + self.assertTrue(np.allclose(p1.get_items(), p2.get_items()), "items for p1 and p2 must be same") + + ### Test if returned distances are same + l, d = p.knn_query(test_data, k=k) + l0, d0 = p0.knn_query(test_data, k=k) + l1, d1 = p1.knn_query(test_data, k=k) + l2, d2 = p2.knn_query(test_data, k=k) + + self.assertLessEqual(np.sum(((d-d0)**2.)>1e-3), dists_err_thresh, msg=f"knn distances returned by p and p0 must match") + self.assertLessEqual(np.sum(((d0-d1)**2.)>1e-3), dists_err_thresh, msg=f"knn distances returned by p0 and p1 must match") + self.assertLessEqual(np.sum(((d1-d2)**2.)>1e-3), dists_err_thresh, msg=f"knn distances returned by p1 and p2 must match") + + ### check if ann
results match brute-force search + ### allow up to label_err_thresh labels to be missing from ann results + self.check_ann_results(space, data, test_data, k, l, d, + err_thresh = label_err_thresh, + total_thresh = item_err_thresh, + dists_thresh = dists_err_thresh) + + self.check_ann_results(space, data, test_data, k, l2, d2, + err_thresh=label_err_thresh, + total_thresh=item_err_thresh, + dists_thresh=dists_err_thresh) + + ### Check ef parameter value + self.assertEqual(p.ef, ef, "incorrect value of p.ef") + self.assertEqual(p0.ef, ef, "incorrect value of p0.ef") + self.assertEqual(p2.ef, ef, "incorrect value of p2.ef") + self.assertEqual(p1.ef, ef, "incorrect value of p1.ef") + + ### Check M parameter value + self.assertEqual(p.M, M, "incorrect value of p.M") + self.assertEqual(p0.M, M, "incorrect value of p0.M") + self.assertEqual(p1.M, M, "incorrect value of p1.M") + self.assertEqual(p2.M, M, "incorrect value of p2.M") + + ### Check ef_construction parameter value + self.assertEqual(p.ef_construction, ef_construction, "incorrect value of p.ef_construction") + self.assertEqual(p0.ef_construction, ef_construction, "incorrect value of p0.ef_construction") + self.assertEqual(p1.ef_construction, ef_construction, "incorrect value of p1.ef_construction") + self.assertEqual(p2.ef_construction, ef_construction, "incorrect value of p2.ef_construction") + + +if __name__ == "__main__": + unittest.main() From ec4f4b1a89ca9043e7e90de598589fd97aff9be9 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 12 Oct 2020 03:04:59 +0000 Subject: [PATCH 10/58] add documentation --- README.md | 19 ++++++++++++++++++- python_bindings/bindings.cpp | 2 +- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 559c5dfd..b2950508 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,17 @@ Index methods: * `get_current_count()` - returns the current number of element stored in the index - +Index properties: + +* `space` - name of the space (can be one of 'l2', 'ip',
'cosine'). This property is read-only. +* `dim` - dimensionality of the space. This property is read-only. +* `M` - parameter that defines the maximum number of outgoing connections in the graph. This property is read-only. +* `ef_construction` - parameter that controls speed/accuracy trade-off during the index construction. This property is read-only. +* `ef` - parameter controlling query time/accuracy trade-off. This property supports read and write operations. Note: setting property `p.ef` prior to index initialization with `p.init_index(...)` will raise an error. +* `num_threads` - number of threads used in `add_items` or `knn_query` by default. This property supports read and write operations. Calling `p.set_num_threads(3)` is equivalent to `p.num_threads=3`. +* `max_elements` - current capacity of the index (equivalent to `p.get_max_elements()`). This property is read-only. +* `element_count` - number of items in the index (equivalent to `p.get_current_count()`). This property is read-only. 
+ @@ -84,6 +94,7 @@ Index methods: ```python import hnswlib import numpy as np +import pickle dim = 128 num_elements = 10000 @@ -106,6 +117,12 @@ p.set_ef(50) # ef should always be > k # Query dataset, k - number of closest elements (returns 2 numpy arrays) labels, distances = p.knn_query(data, k = 1) + +# Index objects support pickling: +p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p + +print(f"Index parameters: space={p_copy.space}, dim={p_copy.dim}, M={p_copy.M}, ef_construction={p_copy.ef_construction} ") +print(f" ef={p_copy.ef}, element_count={p_copy.element_count}, max_elements={p_copy.max_elements}") ``` An example with updates after serialization/deserialization: diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 84839cd6..589f4024 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -410,7 +410,7 @@ PYBIND11_PLUGIN(hnswlib) { .def("load_index", &Index::loadIndex, py::arg("path_to_index"), py::arg("max_elements")=0) .def("mark_deleted", &Index::markDeleted, py::arg("label")) .def("resize_index", &Index::resizeIndex, py::arg("new_size")) - .def_readonly("space_name", &Index::space_name) + .def_readonly("space", &Index::space_name) .def_readonly("dim", &Index::dim) .def_readwrite("num_threads", &Index::num_threads_default) .def_property("ef", From a3646cc6e50dca51ffd5e85aafb4e776ee3185e1 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 12 Oct 2020 15:10:52 +0000 Subject: [PATCH 11/58] clean-up readme --- README.md | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index b2950508..1c4b8a3a 100644 --- a/README.md +++ b/README.md @@ -76,18 +76,27 @@ Index methods: * `get_current_count()` - returns the current number of element stored in the index -Index properties: +Read-only properties of Index class: -* `space` - name of the space (can be one of 'l2', 'ip', 'cosine'). This property is read-only. 
-* `dim` - dimensionality of the space. This property is read-only. -* `M` - parameter that defines the maximum number of outgoing connections in the graph. This property is read-only. -* `ef_construction` - parameter that controls speed/accuracy trade-off during the index construction. This property is read-only. -* `ef` - parameter controlling query time/accuracy trade-off. This property supports read and write operations. Note: setting property `p.ef` prior to index initialization with `p.init_index(...)` will raise an error. -* `num_threads` - number of threads used in `add_items` or `knn_query` by default. This property supports read and write operations. Calling `p.set_num_threads(3)` is equivalent to `p.num_threads=3`. -* `max_elements` - current capacity of the index (equivalent to `p.get_max_elements()`). This property is read-only. -* `element_count` - number of items in the index (equivalent to `p.get_current_count()`). This property is read-only. +* `space` - name of the space (can be one of "l2", "ip", or "cosine"). - +* `dim` - dimensionality of the space. + +* `M` - parameter that defines the maximum number of outgoing connections in the graph. + +* `ef_construction` - parameter that controls speed/accuracy trade-off during the index construction. + +* `max_elements` - current capacity of the index (equivalent to `p.get_max_elements()`). + +* `element_count` - number of items in the index (equivalent to `p.get_current_count()`). + +Properties of Index class that support reading and writing: + +* `ef` - parameter controlling query time/accuracy trade-off. Note that setting property `p.ef` prior to index initialization with `p.init_index(...)` will raise an error. + +* `num_threads` - default number of threads to use in `add_items` or `knn_query`. Note that calling `p.set_num_threads(3)` is equivalent to `p.num_threads=3`. 
+ + #### Python bindings examples @@ -121,8 +130,12 @@ labels, distances = p.knn_query(data, k = 1) # Index objects support pickling: p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p -print(f"Index parameters: space={p_copy.space}, dim={p_copy.dim}, M={p_copy.M}, ef_construction={p_copy.ef_construction} ") -print(f" ef={p_copy.ef}, element_count={p_copy.element_count}, max_elements={p_copy.max_elements}") +### Index parameters are exposed as class properties: +print(f"Parameters passed to constructor: space={p_copy.space}, dim={p_copy.dim}") +print(f"Index construction: M={p_copy.M}, ef_construction={p_copy.ef_construction}") +print(f"Index size and capacity: element_count={p_copy.element_count}, max_elements={p_copy.max_elements}") +print(f"Search parameter: ef={p_copy.ef}") + ``` An example with updates after serialization/deserialization: From a1ba4e50c818bd3b0b8d63a0f3fe82eb08e6e281 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 12 Oct 2020 15:15:15 +0000 Subject: [PATCH 12/58] clean-up readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1c4b8a3a..aef43e09 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,7 @@ p.set_ef(50) # ef should always be > k labels, distances = p.knn_query(data, k = 1) # Index objects support pickling: -p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p +p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p using pickle round-trip ### Index parameters are exposed as class properties: print(f"Parameters passed to constructor: space={p_copy.space}, dim={p_copy.dim}") From cf3846c150435c1d0d28c8a12ba165fd53c40030 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 12 Oct 2020 15:22:58 +0000 Subject: [PATCH 13/58] clean-up readme --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index aef43e09..8722222d 100644 --- a/README.md +++ b/README.md @@ -86,15 +86,15 @@ 
Read-only properties of Index class: * `ef_construction` - parameter that controls speed/accuracy trade-off during the index construction. -* `max_elements` - current capacity of the index (equivalent to `p.get_max_elements()`). +* `max_elements` - current capacity of the index. Equivalent to `p.get_max_elements()`. -* `element_count` - number of items in the index (equivalent to `p.get_current_count()`). +* `element_count` - number of items in the index. Equivalent to `p.get_current_count()`. Properties of Index class that support reading and writing: * `ef` - parameter controlling query time/accuracy trade-off. Note that setting property `p.ef` prior to index initialization with `p.init_index(...)` will raise an error. -* `num_threads` - default number of threads to use in `add_items` or `knn_query`. Note that calling `p.set_num_threads(3)` is equivalent to `p.num_threads=3`. +* `num_threads` - default number of threads to use in `add_items` or `knn_query`. Note that calling `p.set_num_threads(3)` is equivalent to setting `p.num_threads=3`. 
@@ -133,8 +133,8 @@ p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p using pickle ### Index parameters are exposed as class properties: print(f"Parameters passed to constructor: space={p_copy.space}, dim={p_copy.dim}") print(f"Index construction: M={p_copy.M}, ef_construction={p_copy.ef_construction}") -print(f"Index size and capacity: element_count={p_copy.element_count}, max_elements={p_copy.max_elements}") -print(f"Search parameter: ef={p_copy.ef}") +print(f"Index size is {p_copy.element_count} and index capacity is {p_copy.max_elements}") +print(f"Search speed/quality trade-off parameter: ef={p_copy.ef}") ``` From 27471cd617cb4bebb4098dcd43890ef65419adb4 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 12 Oct 2020 16:03:05 +0000 Subject: [PATCH 14/58] clean-up readme --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8722222d..28accb84 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ For other spaces use the nmslib library https://github.com/nmslib/nmslib. #### Short API description * `hnswlib.Index(space, dim)` creates a non-initialized index an HNSW in space `space` with integer dimension `dim`. -Index methods: +`hnswlib.Index` methods: * `init_index(max_elements, ef_construction = 200, M = 16, random_seed = 100)` initializes the index from with no elements. * `max_elements` defines the maximum number of elements that can be stored in the structure(can be increased/shrunk). * `ef_construction` defines a construction time/accuracy trade-off (see [ALGO_PARAMS.md](ALGO_PARAMS.md)). @@ -76,7 +76,7 @@ Index methods: * `get_current_count()` - returns the current number of element stored in the index -Read-only properties of Index class: +Read-only properties of `hnswlib.Index` class: * `space` - name of the space (can be one of "l2", "ip", or "cosine"). @@ -90,7 +90,7 @@ Read-only properties of Index class: * `element_count` - number of items in the index. 
Equivalent to `p.get_current_count()`. -Properties of Index class that support reading and writing: +Properties of `hnswlib.Index` that support reading and writing: * `ef` - parameter controlling query time/accuracy trade-off. Note that setting property `p.ef` prior to index initialization with `p.init_index(...)` will raise an error. From 4220956d9935847b4488673db5d204a5cf03386a Mon Sep 17 00:00:00 2001 From: Dmitriy Bespalov Date: Mon, 12 Oct 2020 13:00:35 -0400 Subject: [PATCH 15/58] Update bindings_test_pickle.py use 200 test items --- python_bindings/tests/bindings_test_pickle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_bindings/tests/bindings_test_pickle.py b/python_bindings/tests/bindings_test_pickle.py index 0d57d946..3d011726 100644 --- a/python_bindings/tests/bindings_test_pickle.py +++ b/python_bindings/tests/bindings_test_pickle.py @@ -55,7 +55,7 @@ def testPickle(self): ef = 725 num_elements = 5000 - num_test_elements = 100 + num_test_elements = 200 num_threads = 4 k = 15 From 72b650190e6c9faf536ca202e168eade4958c7c8 Mon Sep 17 00:00:00 2001 From: "Bespalov, Dmitriy (CORP)" Date: Fri, 23 Oct 2020 02:31:26 -0400 Subject: [PATCH 16/58] Revert "New methods loadIndexFromStream and saveIndexToStream expose de-/serialization logic of HierarchicalNSW class via std::i/ostream." This reverts commit cb7b398b04828273cc6a3da88e00fe8ef389da92. 
--- hnswlib/hnswalg.h | 63 ++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index d2c36f0c..7d0eb443 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -26,10 +26,6 @@ namespace hnswlib { loadIndex(location, s, max_elements); } - HierarchicalNSW(SpaceInterface *s, std::istream & input, bool nmslib = false, size_t max_elements=0) { - loadIndexFromStream(input, s, max_elements); - } - HierarchicalNSW(SpaceInterface *s, size_t max_elements, size_t M = 16, size_t ef_construction = 200, size_t random_seed = 100) : link_list_locks_(max_elements), link_list_update_locks_(max_update_element_locks), element_levels_(max_elements) { max_elements_ = max_elements; @@ -61,6 +57,8 @@ namespace hnswlib { visited_list_pool_ = new VisitedListPool(1, max_elements); + + //initializations for special treatment of the first node enterpoint_node_ = -1; maxlevel_ = -1; @@ -104,8 +102,6 @@ namespace hnswlib { double mult_, revSize_; int maxlevel_; - std::mutex global; - size_t ef_; VisitedListPool *visited_list_pool_; std::mutex cur_element_count_guard_; @@ -515,6 +511,8 @@ namespace hnswlib { return next_closest_entry_point; } + std::mutex global; + size_t ef_; void setEf(size_t ef) { ef_ = ef; @@ -600,7 +598,10 @@ namespace hnswlib { max_elements_=new_max_elements; } - void saveIndexToStream(std::ostream &output) { + + void saveIndex(const std::string &location) { + std::ofstream output(location, std::ios::binary); + std::streampos position; writeBinaryPOD(output, offsetLevel0_); writeBinaryPOD(output, max_elements_); @@ -625,17 +626,17 @@ namespace hnswlib { if (linkListSize) output.write(linkLists_[i], linkListSize); } - - } - - void saveIndex(const std::string &location) { - std::ofstream output(location, std::ios::binary); - std::streampos position; - saveIndexToStream(output); output.close(); } - void loadIndexFromStream(std::istream & input, SpaceInterface *s, size_t 
max_elements_i=0) { + void loadIndex(const std::string &location, SpaceInterface *s, size_t max_elements_i=0) { + + + std::ifstream input(location, std::ios::binary); + + if (!input.is_open()) + throw std::runtime_error("Cannot open file"); + // get file size: input.seekg(0,input.end); @@ -662,12 +663,14 @@ namespace hnswlib { readBinaryPOD(input, mult_); readBinaryPOD(input, ef_construction_); + data_size_ = s->get_data_size(); fstdistfunc_ = s->get_dist_func(); dist_func_param_ = s->get_dist_func_param(); auto pos=input.tellg(); + /// Optional - check if index is ok: input.seekg(cur_element_count * size_data_per_element_,input.cur); @@ -693,11 +696,15 @@ namespace hnswlib { input.seekg(pos,input.beg); + data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_); if (data_level0_memory_ == nullptr) throw std::runtime_error("Not enough memory: loadIndex failed to allocate level0"); input.read(data_level0_memory_, cur_element_count * size_data_per_element_); + + + size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); @@ -708,6 +715,7 @@ namespace hnswlib { visited_list_pool_ = new VisitedListPool(1, max_elements); + linkLists_ = (char **) malloc(sizeof(void *) * max_elements); if (linkLists_ == nullptr) throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklists"); @@ -738,19 +746,8 @@ namespace hnswlib { has_deletions_=true; } - - return; - } - - - - void loadIndex(const std::string &location, SpaceInterface *s, size_t max_elements_i=0) { - std::ifstream input(location, std::ios::binary); - if (!input.is_open()) - throw std::runtime_error("Cannot open file"); - - loadIndexFromStream(input, s, max_elements_i); input.close(); + return; } @@ -877,7 +874,7 @@ namespace hnswlib { for (auto&& cand : sCand) { if (cand == neigh) continue; - + dist_t distance = fstdistfunc_(getDataByInternalId(neigh), getDataByInternalId(cand), dist_func_param_); if (candidates.size() < elementsToKeep) { 
candidates.emplace(distance, cand); @@ -1140,7 +1137,7 @@ namespace hnswlib { } std::priority_queue, std::vector>, CompareByFirst> top_candidates; - if (has_deletions_) { + if (has_deletions_) { top_candidates=searchBaseLayerST( currObj, query_data, std::max(ef_, k)); } @@ -1189,19 +1186,19 @@ namespace hnswlib { std::unordered_set s; for (int j=0; j 0); - assert(data[j] < cur_element_count); + assert(data[j] < cur_element_count); assert (data[j] != i); inbound_connections_num[data[j]]++; s.insert(data[j]); connections_checked++; - + } assert(s.size() == size); } } if(cur_element_count > 1){ int min1=inbound_connections_num[0], max1=inbound_connections_num[0]; - for(int i=0; i < cur_element_count; i++){ + for(int i=0; i < cur_element_count; i++){ assert(inbound_connections_num[i] > 0); min1=std::min(inbound_connections_num[i],min1); max1=std::max(inbound_connections_num[i],max1); @@ -1209,7 +1206,7 @@ namespace hnswlib { std::cout << "Min inbound: " << min1 << ", Max inbound:" << max1 << "\n"; } std::cout << "integrity ok, checked " << connections_checked << " connections\n"; - + } }; From 3a62b41e11dade2eb846f72a8b0b80ae9217db15 Mon Sep 17 00:00:00 2001 From: "Bespalov, Dmitriy (CORP)" Date: Fri, 23 Oct 2020 13:27:32 -0400 Subject: [PATCH 17/58] use python's buffer protocol to avoid making copies of ann data (state of Index object) when calling Python <-> C++ --- python_bindings/bindings.cpp | 441 +++++++++++++++++++++++++++++------ 1 file changed, 373 insertions(+), 68 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 589f4024..33d71879 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -5,6 +5,8 @@ #include "hnswlib/hnswlib.h" #include #include +#include +#include namespace py = pybind11; @@ -71,6 +73,44 @@ inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn } +// +// std::priority_queue> +// searchKnn(const void *query_data, size_t k) const { +// 
std::priority_queue> result; +// if (cur_element_count == 0) return result; +// +// tableint currObj = enterpoint_node_; +// dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); +// +// for (int level = maxlevel_; level > 0; level--) { +// bool changed = true; +// while (changed) { +// changed = false; +// unsigned int *data; +// +// data = (unsigned int *) get_linklist(currObj, level); +// int size = getListCount(data); +// metric_hops++; +// metric_distance_computations+=size; +// +// tableint *datal = (tableint *) (data + 1); +// for (int i = 0; i < size; i++) { +// tableint cand = datal[i]; +// if (cand < 0 || cand > max_elements_) +// throw std::runtime_error("cand error"); +// dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); +// +// if (d < curdist) { +// curdist = d; +// currObj = cand; +// changed = true; +// } +// } +// } +// } +// + + template class Index { public: @@ -91,6 +131,27 @@ class Index { ep_added = true; index_inited = false; num_threads_default = std::thread::hardware_concurrency(); + + default_ef=10; + + } + std::string space_name; + int dim; + size_t seed; + size_t default_ef; + + bool index_inited; + bool ep_added; + bool normalize; + int num_threads_default; + hnswlib::labeltype cur_l; + hnswlib::HierarchicalNSW *appr_alg; + hnswlib::SpaceInterface *l2space; + + ~Index() { + delete l2space; + if (appr_alg) + delete appr_alg; } void init_new_index(const size_t maxElements, const size_t M, const size_t efConstruction, const size_t random_seed) { @@ -101,11 +162,14 @@ class Index { appr_alg = new hnswlib::HierarchicalNSW(l2space, maxElements, M, efConstruction, random_seed); index_inited = true; ep_added = false; + appr_alg->ef_ = default_ef; + seed=random_seed; } - void set_ef(size_t ef) { + default_ef=ef; + if (appr_alg) appr_alg->ef_ = ef; } @@ -117,18 +181,6 @@ class Index { void saveIndex(const std::string &path_to_index) { appr_alg->saveIndex(path_to_index); 
} - void saveIndexToStream(std::ostream & output) const { - appr_alg->saveIndexToStream(output); - } - - void loadIndexFromStream(std::istream & input, size_t max_elements) { - if (appr_alg) { - std::cerr<<"Warning: Calling load_index from istream for an already inited index. Old index is being deallocated." << std::endl; - delete appr_alg; - } - appr_alg = new hnswlib::HierarchicalNSW(l2space, input, false, max_elements); - cur_l = appr_alg->cur_element_count; - } void loadIndex(const std::string &path_to_index, size_t max_elements) { if (appr_alg) { @@ -261,6 +313,183 @@ class Index { return ids; } + inline void assert_true(bool expr, const std::string & msg) { + if (expr == false) + throw std::runtime_error("assert failed: "+msg); + return; + } + + + py::tuple getAnnData() const { + + unsigned int level0_npy_size = appr_alg->cur_element_count * appr_alg->size_data_per_element_; + unsigned int link_npy_size = appr_alg->cur_element_count * appr_alg->maxlevel_ * appr_alg->size_links_per_element_; + unsigned int link_npy_stride = appr_alg->maxlevel_ * appr_alg->size_links_per_element_; + + char* data_level0_npy = (char *) malloc(level0_npy_size); + char* link_list_npy = (char *) malloc(link_npy_size); + + memset(data_level0_npy, 0, level0_npy_size); + memset(link_list_npy, 0, link_npy_size); + + memcpy(data_level0_npy, appr_alg->data_level0_memory_, level0_npy_size); + + + for (size_t i = 0; i < appr_alg->cur_element_count; i++){ + unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? 
appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; + if (linkListSize){ + memcpy(link_list_npy+(link_npy_stride * i), appr_alg->linkLists_[i], linkListSize); + // std::cout << linkListSize << " " << appr_alg->maxlevel_ << " " << appr_alg->element_levels_[i] << " generator: " << appr_alg->level_generator_ << std::endl; + } + } + + py::capsule free_when_done_l0(data_level0_npy, [](void *f) { + delete[] f; + }); + py::capsule free_when_done_ll(link_list_npy, [](void *f) { + delete[] f; + }); + + return py::make_tuple(appr_alg->offsetLevel0_, + appr_alg->max_elements_, + appr_alg->cur_element_count, + appr_alg->size_data_per_element_, + appr_alg->label_offset_, + appr_alg->offsetData_, + appr_alg->maxlevel_, + appr_alg->enterpoint_node_, + appr_alg->maxM_, + appr_alg->maxM0_, + appr_alg->M_, + appr_alg->mult_, + appr_alg->ef_construction_, + appr_alg->ef_, + appr_alg->has_deletions_, + appr_alg->size_links_per_element_, + appr_alg->label_lookup_, + appr_alg->element_levels_, + py::array_t( + {level0_npy_size}, // shape + {sizeof(char)}, // C-style contiguous strides for double + data_level0_npy, // the data pointer + free_when_done_l0), + py::array_t( + {link_npy_size}, // shape + {sizeof(char)}, // C-style contiguous strides for double + link_list_npy, // the data pointer + free_when_done_ll) + ); + + } + + + py::tuple getIndexParams() const { + return py::make_tuple(py::make_tuple(space_name, dim, index_inited, ep_added, normalize, num_threads_default, seed, default_ef), + index_inited == true ? 
getAnnData() : py::make_tuple()); + + } + + + static Index * createFromParams(const py::tuple t) { + py::tuple index_params=t[0].cast(); + py::tuple ann_params=t[1].cast(); + + auto space_name_=index_params[0].cast(); + auto dim_=index_params[1].cast(); + auto index_inited_=index_params[2].cast(); + + Index *new_index = new Index(index_params[0].cast(), index_params[1].cast()); + + new_index->seed = index_params[6].cast(); + + + if (index_inited_){ + //// hnswlib::HierarchicalNSW(l2space, maxElements, M, efConstruction, random_seed); + new_index->appr_alg = new hnswlib::HierarchicalNSW(new_index->l2space, ann_params[1].cast(), ann_params[10].cast(), ann_params[12].cast(), new_index->seed); + new_index->cur_l = ann_params[2].cast(); + } + + new_index->index_inited = index_inited_; + new_index->ep_added=index_params[3].cast(); + new_index->num_threads_default=index_params[5].cast(); + new_index->default_ef=index_params[7].cast(); + + if (index_inited_) + new_index->setAnnData(ann_params); + + + return new_index; + } + + void setAnnData(const py::tuple t) { + assert_true(appr_alg->offsetLevel0_ == t[0].cast(), "Invalid value of offsetLevel0_ "); + assert_true(appr_alg->max_elements_ == t[1].cast(), "Invalid value of max_elements_ "); + + appr_alg->cur_element_count = t[2].cast(); + + assert_true(appr_alg->size_data_per_element_ == t[3].cast(), "Invalid value of size_data_per_element_ "); + assert_true(appr_alg->label_offset_ == t[4].cast(), "Invalid value of label_offset_ "); + assert_true(appr_alg->offsetData_ == t[5].cast(), "Invalid value of offsetData_ "); + + appr_alg->maxlevel_ = t[6].cast(); + appr_alg->enterpoint_node_ = t[7].cast(); + + assert_true(appr_alg->maxM_ == t[8].cast(), "Invalid value of maxM_ "); + assert_true(appr_alg->maxM0_ == t[9].cast(), "Invalid value of maxM0_ "); + assert_true(appr_alg->M_ == t[10].cast(), "Invalid value of M_ "); + assert_true(appr_alg->mult_ == t[11].cast(), "Invalid value of mult_ "); + 
assert_true(appr_alg->ef_construction_ == t[12].cast(), "Invalid value of ef_construction_ "); + + appr_alg->ef_ = t[13].cast(); + appr_alg->has_deletions_=t[14].cast(); + assert_true(appr_alg->size_links_per_element_ == t[15].cast(), "Invalid value of size_links_per_element_ "); + + auto label_lookup_dict = t[16].cast(); + auto element_levels_list = t[17].cast(); + auto data_level0_npy = t[18].cast>(); + auto link_list_npy = t[19].cast>(); + + for (auto el: label_lookup_dict){ + appr_alg->label_lookup_.insert( + std::make_pair( + el.first.cast(), + el.second.cast())); + } + + + int idx = 0; + for (auto el : element_levels_list){ + appr_alg->element_levels_[idx]=el.cast(); + idx++; + } + + + memcpy(appr_alg->data_level0_memory_, data_level0_npy.data(), data_level0_npy.nbytes()); + + for (size_t i = 0; i < appr_alg->max_elements_; i++) { + unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; + if (linkListSize == 0) { + appr_alg->linkLists_[i] = nullptr; + } else { + appr_alg->linkLists_[i] = (char *) malloc(linkListSize); + if (appr_alg->linkLists_[i] == nullptr) + throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklist"); + + memcpy(appr_alg->linkLists_[i], (link_list_npy.data()+(appr_alg->maxlevel_ * appr_alg->size_links_per_element_ * i)), linkListSize); + + } + } + + + // TODO: use global lock for de-/serialization + // std::unique_lock templock(global); + // int maxlevelcopy = maxlevel_; + // if (curlevel <= maxlevelcopy) + // templock.unlock(); + + } + + py::object knnQuery_return_numpy(py::object input, size_t k = 1, int num_threads = -1) { py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); @@ -317,7 +546,7 @@ class Index { float *data= (float *) items.data(row); size_t start_idx = threadId * dim; - normalize_vector((float *) items.data(row), (norm_array.data()+start_idx)); + normalize_vector((float *) items.data(row), 
(norm_array.data()+start_idx)); std::priority_queue> result = appr_alg->searchKnn( (void *) (norm_array.data()+start_idx), k); @@ -374,22 +603,6 @@ class Index { return appr_alg->cur_element_count; } - std::string space_name; - int dim; - - bool index_inited; - bool ep_added; - bool normalize; - int num_threads_default; - hnswlib::labeltype cur_l; - hnswlib::HierarchicalNSW *appr_alg; - hnswlib::SpaceInterface *l2space; - - ~Index() { - delete l2space; - if (appr_alg) - delete appr_alg; - } }; @@ -397,7 +610,9 @@ class Index { PYBIND11_PLUGIN(hnswlib) { py::module m("hnswlib"); + // py::class_, std::shared_ptr >>(m, "Index") py::class_>(m, "Index") + .def(py::init(&Index::createFromParams), py::arg("params")) //createFromParams(const py::tuple t) .def(py::init(), py::arg("space"), py::arg("dim")) .def("init_index", &Index::init_new_index, py::arg("max_elements"), py::arg("M")=16, py::arg("ef_construction")=200, py::arg("random_seed")=100) .def("knn_query", &Index::knnQuery_return_numpy, py::arg("data"), py::arg("k")=1, py::arg("num_threads")=-1) @@ -410,18 +625,23 @@ PYBIND11_PLUGIN(hnswlib) { .def("load_index", &Index::loadIndex, py::arg("path_to_index"), py::arg("max_elements")=0) .def("mark_deleted", &Index::markDeleted, py::arg("label")) .def("resize_index", &Index::resizeIndex, py::arg("new_size")) - .def_readonly("space", &Index::space_name) + .def_readonly("space_name", &Index::space_name) .def_readonly("dim", &Index::dim) .def_readwrite("num_threads", &Index::num_threads_default) .def_property("ef", [](const Index & index) { - return index.index_inited ? index.appr_alg->ef_ : 10; + return index.index_inited ? 
index.appr_alg->ef_ : index.default_ef; }, [](Index & index, const size_t ef_) { - if (index.index_inited) + // index.set_ef(ef_); + index.default_ef=ef_; + if (index.appr_alg) index.appr_alg->ef_ = ef_; - else - throw std::runtime_error("must call init_index prior to setting ef parameter"); + + // if (index.index_inited) + // index.appr_alg->ef_ = ef_; + // else + // throw std::runtime_error("must call init_index prior to setting ef parameter"); }) .def_property_readonly("max_elements", [](const Index & index) { return index.index_inited ? index.appr_alg->max_elements_ : 0; @@ -435,41 +655,126 @@ PYBIND11_PLUGIN(hnswlib) { .def_property_readonly("M", [](const Index & index) { return index.index_inited ? index.appr_alg->M_ : 0; }) - .def("__getstate__", [](const Index & index) { // __getstate__ - std::stringstream output(std::stringstream::out|std::stringstream::binary); - - - if (index.index_inited) - index.saveIndexToStream(output); - - /* Return a tuple that fully encodes the state of the object */ - return py::make_tuple(index.space_name, index.dim, - index.index_inited, index.ep_added, - index.normalize, index.num_threads_default, - py::bytes(output.str()), - index.index_inited == false ? 10 : index.appr_alg->ef_, - index.index_inited == false ? 0 : index.appr_alg->max_elements_, - index.index_inited == false ? 
0 : index.appr_alg->cur_element_count - ); - }) - .def("__setstate__", [](Index & index, py::tuple t) { // __setstate__ - if (t.size() != 10) - throw std::runtime_error("Invalid state!"); - - /* Invoke Index constructor (need to use in-place version) */ - new (&index) Index(t[0].cast(), t[1].cast()); - index.index_inited=t[2].cast(); - index.ep_added=t[3].cast(); - index.normalize=t[4].cast(); - index.num_threads_default=t[5].cast(); - - if (index.index_inited){ - std::stringstream input(t[6].cast(), std::stringstream::in|std::stringstream::binary); - index.loadIndexFromStream(input, t[8].cast()); // use max_elements from state - index.appr_alg->ef_=(t[7].cast()); - } + .def(py::pickle( + [](const Index &ind) { // __getstate__ + /* Return a tuple that fully encodes the state of the object */ + return ind.getIndexParams(); + }, + [](py::tuple t) { // __setstate__ + if (t.size() != 2) + throw std::runtime_error("Invalid state!"); + return Index::createFromParams(t); + } + )) + + .def("check_integrity", [](const Index & index) { + index.appr_alg->checkIntegrity(); + std::cout<< index.default_ef << " " << index.appr_alg->ef_ << std::endl; + return index.appr_alg->ef_; + // return index.getIndexParams(); + // return index.appr_alg->element_levels_; + + // std::stringstream output(std::stringstream::out|std::stringstream::binary); + // + // .def("get_params", &Index::getIndexParams) + // .def("set_params", &Index::setIndexParams, py::arg("t"))// [](Index & index, py::tuple t) { + // + // if (index.index_inited) + // index.saveIndexToStream(output); + // + // /* Return a tuple that fully encodes the state of the object */ + // return py::make_tuple(index.space_name, index.dim, + // index.index_inited, index.ep_added, + // index.normalize, index.num_threads_default, + // py::bytes(output.str()), + // index.index_inited == false ? 10 : index.appr_alg->ef_, + // index.index_inited == false ? 0 : index.appr_alg->max_elements_, + // index.index_inited == false ? 
0 : index.appr_alg->cur_element_count + // ); }) + + + // .def(py::pickle( + // [](const Index & index) { // __getstate__ + // /* Return a tuple that fully encodes the state of the object */ + // return index.getIndexParams(); + // }, + // [](Index & index, py::tuple t) { // __setstate__ + // if (t.size() != 2) + // throw std::runtime_error("Invalid state!"); + // + // /* Invoke Index constructor (need to use in-place version) */ + // // py::tuple index_params = t[0].cast(); + // // Index new_index(index_params[0].cast(), index_params[1].cast()); + // index.setIndexParams(t); + // return index; + // + // /* Create a new C++ instance */ + // // Pickleable p(t[0].cast()); + // + // /* Assign any additional state */ + // // p.setExtra(t[1].cast()); + // + // // return p; + // } + // )) + + // .def("__getstate__", &Index::getIndexParams) // __getstate__ + // .def("__setstate__", &Index::setIndexParams) // __setstate__ + // .def("__setstate__", [](Index & index, py::tuple t) { // __setstate__ + // py::tuple index_params = t[0].cast(); + // new (&index) Index(index_params[0].cast(), index_params[1].cast()); + // index.setIndexParams(t); + // return index; + // }) + // .def("__getstate__", [](const Index & index) { // __getstate__ + // return index.getIndexParams(); + // + // // std::stringstream output(std::stringstream::out|std::stringstream::binary); + // // + // // .def("get_params", &Index::getIndexParams) + // // .def("set_params", &Index::setIndexParams, py::arg("t"))// [](Index & index, py::tuple t) { + // // + // // if (index.index_inited) + // // index.saveIndexToStream(output); + // // + // // /* Return a tuple that fully encodes the state of the object */ + // // return py::make_tuple(index.space_name, index.dim, + // // index.index_inited, index.ep_added, + // // index.normalize, index.num_threads_default, + // // py::bytes(output.str()), + // // index.index_inited == false ? 10 : index.appr_alg->ef_, + // // index.index_inited == false ? 
0 : index.appr_alg->max_elements_, + // // index.index_inited == false ? 0 : index.appr_alg->cur_element_count + // // ); + // }) + // .def("set_state", [](Index & index, py::tuple t) { // __setstate__ + // index.setIndexParams(t); + // }) + // + // .def("__setstate__", [](Index & index, py::tuple t) { // __setstate__ + // // delete &index; + // /* Invoke Index constructor (need to use in-place version) */ + // // py::tuple index_params = t[0].cast(); + // // new (&index) Index(index_params[0].cast(), index_params[1].cast()); + // index.setIndexParams(t); + // // if (t.size() != 10) + // // throw std::runtime_error("Invalid state!"); + // // + // + // // index.index_inited=t[2].cast(); + // // index.ep_added=t[3].cast(); + // // index.normalize=t[4].cast(); + // // index.num_threads_default=t[5].cast(); + // // + // // if (index.index_inited){ + // // std::stringstream input(t[6].cast(), std::stringstream::in|std::stringstream::binary); + // // index.loadIndexFromStream(input, t[8].cast()); // use max_elements from state + // // index.appr_alg->ef_=(t[7].cast()); + // // } + // + // }) .def("__repr__", [](const Index &a) { return ""; }); From fe6d2faaa734b57344240f607ef3a3c78e7731b7 Mon Sep 17 00:00:00 2001 From: "Bespalov, Dmitriy (CORP)" Date: Fri, 23 Oct 2020 13:29:40 -0400 Subject: [PATCH 18/58] replace tab characters with spaces --- hnswlib/hnswalg.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 7d0eb443..3e74d856 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -874,7 +874,7 @@ namespace hnswlib { for (auto&& cand : sCand) { if (cand == neigh) continue; - + dist_t distance = fstdistfunc_(getDataByInternalId(neigh), getDataByInternalId(cand), dist_func_param_); if (candidates.size() < elementsToKeep) { candidates.emplace(distance, cand); @@ -1137,7 +1137,7 @@ namespace hnswlib { } std::priority_queue, std::vector>, CompareByFirst> top_candidates; - if (has_deletions_) { + if 
(has_deletions_) { top_candidates=searchBaseLayerST( currObj, query_data, std::max(ef_, k)); } @@ -1186,19 +1186,19 @@ namespace hnswlib { std::unordered_set s; for (int j=0; j 0); - assert(data[j] < cur_element_count); + assert(data[j] < cur_element_count); assert (data[j] != i); inbound_connections_num[data[j]]++; s.insert(data[j]); connections_checked++; - + } assert(s.size() == size); } } if(cur_element_count > 1){ int min1=inbound_connections_num[0], max1=inbound_connections_num[0]; - for(int i=0; i < cur_element_count; i++){ + for(int i=0; i < cur_element_count; i++){ assert(inbound_connections_num[i] > 0); min1=std::min(inbound_connections_num[i],min1); max1=std::max(inbound_connections_num[i],max1); @@ -1206,7 +1206,7 @@ namespace hnswlib { std::cout << "Min inbound: " << min1 << ", Max inbound:" << max1 << "\n"; } std::cout << "integrity ok, checked " << connections_checked << " connections\n"; - + } }; From c9fb60dde52649cfd143c8c5a72e6a09ddb32625 Mon Sep 17 00:00:00 2001 From: "Bespalov, Dmitriy (CORP)" Date: Fri, 23 Oct 2020 13:30:51 -0400 Subject: [PATCH 19/58] test each space (ip/cosine/l2) as a separate unittest --- python_bindings/tests/bindings_test_pickle.py | 241 ++++++++++-------- 1 file changed, 128 insertions(+), 113 deletions(-) diff --git a/python_bindings/tests/bindings_test_pickle.py b/python_bindings/tests/bindings_test_pickle.py index 3d011726..a6b74a9d 100644 --- a/python_bindings/tests/bindings_test_pickle.py +++ b/python_bindings/tests/bindings_test_pickle.py @@ -1,6 +1,8 @@ import unittest import numpy as np +import hnswlib +import pickle def get_dist(metric, pt1, pt2): @@ -24,120 +26,133 @@ def brute_force_distances(metric, items, query_items, k): return labels[:,:k], dists[:,:k] -class PickleSelfTestCase(unittest.TestCase): +def check_ann_results(self, metric, items, query_items, k, ann_l, ann_d, err_thresh=0, total_thresh=0, dists_thresh=0): + brute_l, brute_d = brute_force_distances(metric, items, query_items, k) + err_total = 0 
+ for jj in range(query_items.shape[0]): + err = np.sum(np.isin(brute_l[jj, :], ann_l[jj, :], invert=True)) + if err > 0: + print(f"Warning: {err} labels are missing from ann results (k={k}, err_thresh={err_thresh})") - def check_ann_results(self, metric, items, query_items, k, ann_l, ann_d, err_thresh=0, total_thresh=0, dists_thresh=0): - brute_l, brute_d = brute_force_distances(metric, items, query_items, k) - err_total = 0 - for jj in range(query_items.shape[0]): - err = np.sum(np.isin(brute_l[jj, :], ann_l[jj, :], invert=True)) - if err > 0: - print(f"Warning: {err} labels are missing from ann results (k={k}, err_thresh={err_thresh})") - - if err > err_thresh: - err_total += 1 - - self.assertLessEqual( err_total, total_thresh, f"Error: knn_query returned incorrect labels for {err_total} items (k={k})") - - wrong_dists=np.sum(((brute_d- ann_d)**2.)>1e-3) - if wrong_dists > 0: - dists_count=brute_d.shape[0]*brute_d.shape[1] - print(f"Warning: {wrong_dists} ann distance values are different from brute-force values (total # of values={dists_count}, dists_thresh={dists_thresh})") - - self.assertLessEqual( wrong_dists, dists_thresh, msg=f"Error: {wrong_dists} ann distance values are different from brute-force values") - - def testPickle(self): - import hnswlib - import pickle - - ef_construction = 725 - M = 64 - ef = 725 - - num_elements = 5000 - num_test_elements = 200 - - num_threads = 4 - k = 15 - - label_err_thresh=5 ### max number of missing labels allowed per test item - item_err_thresh=5 ### max number of items allowed with incorrect labels - - dists_err_thresh=50 ### for two matrices, d1 and d2, dists_err_thresh controls max - ### number of value pairs that are allowed to be different in d1 and d2 - ### i.e., number of values that are (d1-d2)**2>1e-3 - - for space,dim in [('ip', 48), ('l2', 152), ('cosine', 512)]: - - # Generating sample data - data = np.float32(np.random.random((num_elements, dim))) - test_data = 
np.float32(np.random.random((num_test_elements, dim))) - - # Declaring index - p = hnswlib.Index(space=space, dim=dim) # possible options are l2, cosine or ip - print(f"Running pickle tests for {p}") - - p.num_threads=num_threads # by default using all available cores - - p0=pickle.loads(pickle.dumps(p)) ### pickle un-initialized Index - p.init_index(max_elements = num_elements, ef_construction = ef_construction, M = M) - p0.init_index(max_elements = num_elements, ef_construction = ef_construction, M = M) - - p.ef=ef ### Note: ef parameter can be set only after calling p.init_index, - p0.ef=ef ### so we have to set p0.ef - - p1=pickle.loads(pickle.dumps(p)) ### pickle Index before adding items - - ### add items to ann index p,p0,p1 - p.add_items(data) - p1.add_items(data) - p0.add_items(data) - - p2=pickle.loads(pickle.dumps(p)) ### pickle Index before adding items - - self.assertTrue(np.allclose(p.get_items(), p0.get_items()), "items for p and p0 must be same") - self.assertTrue(np.allclose(p0.get_items(), p1.get_items()), "items for p0 and p1 must be same") - self.assertTrue(np.allclose(p1.get_items(), p2.get_items()), "items for p1 and p2 must be same") - - ### Test if returned distances are same - l, d = p.knn_query(test_data, k=k) - l0, d0 = p0.knn_query(test_data, k=k) - l1, d1 = p1.knn_query(test_data, k=k) - l2, d2 = p2.knn_query(test_data, k=k) - - self.assertLessEqual(np.sum(((d-d0)**2.)>1e-3), dists_err_thresh, msg=f"knn distances returned by p and p0 must match") - self.assertLessEqual(np.sum(((d0-d1)**2.)>1e-3), dists_err_thresh, msg=f"knn distances returned by p0 and p1 must match") - self.assertLessEqual(np.sum(((d1-d2)**2.)>1e-3), dists_err_thresh, msg=f"knn distances returned by p1 and p2 must match") - - ### check if ann results match brute-force search - ### allow for 2 labels to be missing from ann results - self.check_ann_results(space, data, test_data, k, l, d, - err_thresh = label_err_thresh, - total_thresh = item_err_thresh, - dists_thresh = 
dists_err_thresh) - - self.check_ann_results(space, data, test_data, k, l2, d2, - err_thresh=label_err_thresh, - total_thresh=item_err_thresh, - dists_thresh=dists_err_thresh) - - ### Check ef parameter value - self.assertEqual(p.ef, ef, "incorrect value of p.ef") - self.assertEqual(p0.ef, ef, "incorrect value of p0.ef") - self.assertEqual(p2.ef, ef, "incorrect value of p2.ef") - self.assertEqual(p1.ef, ef, "incorrect value of p1.ef") - - ### Check M parameter value - self.assertEqual(p.M, M, "incorrect value of p.M") - self.assertEqual(p0.M, M, "incorrect value of p0.M") - self.assertEqual(p1.M, M, "incorrect value of p1.M") - self.assertEqual(p2.M, M, "incorrect value of p2.M") - - ### Check ef_construction parameter value - self.assertEqual(p.ef_construction, ef_construction, "incorrect value of p.ef_construction") - self.assertEqual(p0.ef_construction, ef_construction, "incorrect value of p0.ef_construction") - self.assertEqual(p1.ef_construction, ef_construction, "incorrect value of p1.ef_construction") - self.assertEqual(p2.ef_construction, ef_construction, "incorrect value of p2.ef_construction") + if err > err_thresh: + err_total += 1 + + self.assertLessEqual( err_total, total_thresh, f"Error: knn_query returned incorrect labels for {err_total} items (k={k})") + + wrong_dists=np.sum(((brute_d- ann_d)**2.)>1e-3) + if wrong_dists > 0: + dists_count=brute_d.shape[0]*brute_d.shape[1] + print(f"Warning: {wrong_dists} ann distance values are different from brute-force values (total # of values={dists_count}, dists_thresh={dists_thresh})") + + self.assertLessEqual( wrong_dists, dists_thresh, msg=f"Error: {wrong_dists} ann distance values are different from brute-force values") + +def test_space_main(self, space, dim): + + # Generating sample data + data = np.float32(np.random.random((self.num_elements, dim))) + test_data = np.float32(np.random.random((self.num_test_elements, dim))) + + # Declaring index + p = hnswlib.Index(space=space, dim=dim) # possible options 
are l2, cosine or ip + print(f"Running pickle tests for {p}") + + p.num_threads=self.num_threads # by default using all available cores + + p0=pickle.loads(pickle.dumps(p)) ### pickle un-initialized Index + p.init_index(max_elements = self.num_elements, ef_construction = self.ef_construction, M = self.M) + p0.init_index(max_elements = self.num_elements, ef_construction = self.ef_construction, M = self.M) + + p.ef=self.ef + p0.ef=self.ef + + p1=pickle.loads(pickle.dumps(p)) ### pickle Index before adding items + + ### add items to ann index p,p0,p1 + p.add_items(data) + p1.add_items(data) + p0.add_items(data) + + p2=pickle.loads(pickle.dumps(p)) ### pickle Index before adding items + + self.assertTrue(np.allclose(p.get_items(), p0.get_items()), "items for p and p0 must be same") + self.assertTrue(np.allclose(p0.get_items(), p1.get_items()), "items for p0 and p1 must be same") + self.assertTrue(np.allclose(p1.get_items(), p2.get_items()), "items for p1 and p2 must be same") + + ### Test if returned distances are same + l, d = p.knn_query(test_data, k=self.k) + l0, d0 = p0.knn_query(test_data, k=self.k) + l1, d1 = p1.knn_query(test_data, k=self.k) + l2, d2 = p2.knn_query(test_data, k=self.k) + + self.assertLessEqual(np.sum(((d-d0)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p and p0 must match") + self.assertLessEqual(np.sum(((d0-d1)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p0 and p1 must match") + self.assertLessEqual(np.sum(((d1-d2)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p1 and p2 must match") + + ### check if ann results match brute-force search + ### allow for 2 labels to be missing from ann results + check_ann_results(self, space, data, test_data, self.k, l, d, + err_thresh = self.label_err_thresh, + total_thresh = self.item_err_thresh, + dists_thresh = self.dists_err_thresh) + + check_ann_results(self, space, data, test_data, self.k, l2, d2, + err_thresh=self.label_err_thresh, + 
total_thresh=self.item_err_thresh, + dists_thresh=self.dists_err_thresh) + + ### Check ef parameter value + self.assertEqual(p.ef, self.ef, "incorrect value of p.ef") + self.assertEqual(p0.ef, self.ef, "incorrect value of p0.ef") + self.assertEqual(p2.ef, self.ef, "incorrect value of p2.ef") + self.assertEqual(p1.ef, self.ef, "incorrect value of p1.ef") + + ### Check M parameter value + self.assertEqual(p.M, self.M, "incorrect value of p.M") + self.assertEqual(p0.M, self.M, "incorrect value of p0.M") + self.assertEqual(p1.M, self.M, "incorrect value of p1.M") + self.assertEqual(p2.M, self.M, "incorrect value of p2.M") + + ### Check ef_construction parameter value + self.assertEqual(p.ef_construction, self.ef_construction, "incorrect value of p.ef_construction") + self.assertEqual(p0.ef_construction, self.ef_construction, "incorrect value of p0.ef_construction") + self.assertEqual(p1.ef_construction, self.ef_construction, "incorrect value of p1.ef_construction") + self.assertEqual(p2.ef_construction, self.ef_construction, "incorrect value of p2.ef_construction") + + + +class PickleUnitTests(unittest.TestCase): + + def setUp(self): + + self.ef_construction = 725 + self.M = 64 + self.ef = 725 + + self.num_elements = 5000 + self.num_test_elements = 200 + + self.num_threads = 4 + self.k = 25 + + self.label_err_thresh=5 ### max number of missing labels allowed per test item + self.item_err_thresh=5 ### max number of items allowed with incorrect labels + + self.dists_err_thresh=50 ### for two matrices, d1 and d2, dists_err_thresh controls max + ### number of value pairs that are allowed to be different in d1 and d2 + ### i.e., number of values that are (d1-d2)**2>1e-3 + + def testInnerProductSpace(self): + test_space_main(self, 'ip', 48) + + def testL2Space(self): + test_space_main(self, 'l2', 153) + + def testCosineSpace(self): + test_space_main(self, 'cosine', 512) + + # + # for space,dim in [('ip', 48), ('l2', 152), ('cosine', 512)]: + # test_space_main if __name__ == 
"__main__": From 3c4510db09677d97afd7b5b00deffc42496e21c4 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Sun, 25 Oct 2020 04:33:08 +0000 Subject: [PATCH 20/58] return array_t pointers --- hnswlib/hnswalg.h | 1 - python_bindings/bindings.cpp | 116 ++--------------------------------- 2 files changed, 4 insertions(+), 113 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 3e74d856..7c2c01c3 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -637,7 +637,6 @@ namespace hnswlib { if (!input.is_open()) throw std::runtime_error("Cannot open file"); - // get file size: input.seekg(0,input.end); std::streampos total_filesize=input.tellg(); diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 33d71879..f066368c 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -273,7 +273,7 @@ class Index { std::vector norm_array(num_threads * dim); ParallelFor(start, rows, num_threads, [&](size_t row, size_t threadId) { // normalize vector: - size_t start_idx = threadId * dim; + size_t start_idx = threadId * dim; normalize_vector((float *) items.data(row), (norm_array.data()+start_idx)); size_t id = ids.size() ? ids.at(row) : (cur_l+row); @@ -339,7 +339,6 @@ class Index { unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? 
appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; if (linkListSize){ memcpy(link_list_npy+(link_npy_stride * i), appr_alg->linkLists_[i], linkListSize); - // std::cout << linkListSize << " " << appr_alg->maxlevel_ << " " << appr_alg->element_levels_[i] << " generator: " << appr_alg->level_generator_ << std::endl; } } @@ -368,12 +367,12 @@ class Index { appr_alg->size_links_per_element_, appr_alg->label_lookup_, appr_alg->element_levels_, - py::array_t( + new py::array_t( {level0_npy_size}, // shape {sizeof(char)}, // C-style contiguous strides for double data_level0_npy, // the data pointer free_when_done_l0), - py::array_t( + new py::array_t( {link_npy_size}, // shape {sizeof(char)}, // C-style contiguous strides for double link_list_npy, // the data pointer @@ -546,7 +545,7 @@ class Index { float *data= (float *) items.data(row); size_t start_idx = threadId * dim; - normalize_vector((float *) items.data(row), (norm_array.data()+start_idx)); + normalize_vector((float *) items.data(row), (norm_array.data()+start_idx)); std::priority_queue> result = appr_alg->searchKnn( (void *) (norm_array.data()+start_idx), k); @@ -633,15 +632,9 @@ PYBIND11_PLUGIN(hnswlib) { return index.index_inited ? index.appr_alg->ef_ : index.default_ef; }, [](Index & index, const size_t ef_) { - // index.set_ef(ef_); index.default_ef=ef_; if (index.appr_alg) index.appr_alg->ef_ = ef_; - - // if (index.index_inited) - // index.appr_alg->ef_ = ef_; - // else - // throw std::runtime_error("must call init_index prior to setting ef parameter"); }) .def_property_readonly("max_elements", [](const Index & index) { return index.index_inited ? 
index.appr_alg->max_elements_ : 0; @@ -672,109 +665,8 @@ PYBIND11_PLUGIN(hnswlib) { index.appr_alg->checkIntegrity(); std::cout<< index.default_ef << " " << index.appr_alg->ef_ << std::endl; return index.appr_alg->ef_; - // return index.getIndexParams(); - // return index.appr_alg->element_levels_; - - // std::stringstream output(std::stringstream::out|std::stringstream::binary); - // - // .def("get_params", &Index::getIndexParams) - // .def("set_params", &Index::setIndexParams, py::arg("t"))// [](Index & index, py::tuple t) { - // - // if (index.index_inited) - // index.saveIndexToStream(output); - // - // /* Return a tuple that fully encodes the state of the object */ - // return py::make_tuple(index.space_name, index.dim, - // index.index_inited, index.ep_added, - // index.normalize, index.num_threads_default, - // py::bytes(output.str()), - // index.index_inited == false ? 10 : index.appr_alg->ef_, - // index.index_inited == false ? 0 : index.appr_alg->max_elements_, - // index.index_inited == false ? 
0 : index.appr_alg->cur_element_count - // ); }) - - // .def(py::pickle( - // [](const Index & index) { // __getstate__ - // /* Return a tuple that fully encodes the state of the object */ - // return index.getIndexParams(); - // }, - // [](Index & index, py::tuple t) { // __setstate__ - // if (t.size() != 2) - // throw std::runtime_error("Invalid state!"); - // - // /* Invoke Index constructor (need to use in-place version) */ - // // py::tuple index_params = t[0].cast(); - // // Index new_index(index_params[0].cast(), index_params[1].cast()); - // index.setIndexParams(t); - // return index; - // - // /* Create a new C++ instance */ - // // Pickleable p(t[0].cast()); - // - // /* Assign any additional state */ - // // p.setExtra(t[1].cast()); - // - // // return p; - // } - // )) - - // .def("__getstate__", &Index::getIndexParams) // __getstate__ - // .def("__setstate__", &Index::setIndexParams) // __setstate__ - // .def("__setstate__", [](Index & index, py::tuple t) { // __setstate__ - // py::tuple index_params = t[0].cast(); - // new (&index) Index(index_params[0].cast(), index_params[1].cast()); - // index.setIndexParams(t); - // return index; - // }) - // .def("__getstate__", [](const Index & index) { // __getstate__ - // return index.getIndexParams(); - // - // // std::stringstream output(std::stringstream::out|std::stringstream::binary); - // // - // // .def("get_params", &Index::getIndexParams) - // // .def("set_params", &Index::setIndexParams, py::arg("t"))// [](Index & index, py::tuple t) { - // // - // // if (index.index_inited) - // // index.saveIndexToStream(output); - // // - // // /* Return a tuple that fully encodes the state of the object */ - // // return py::make_tuple(index.space_name, index.dim, - // // index.index_inited, index.ep_added, - // // index.normalize, index.num_threads_default, - // // py::bytes(output.str()), - // // index.index_inited == false ? 10 : index.appr_alg->ef_, - // // index.index_inited == false ? 
0 : index.appr_alg->max_elements_, - // // index.index_inited == false ? 0 : index.appr_alg->cur_element_count - // // ); - // }) - // .def("set_state", [](Index & index, py::tuple t) { // __setstate__ - // index.setIndexParams(t); - // }) - // - // .def("__setstate__", [](Index & index, py::tuple t) { // __setstate__ - // // delete &index; - // /* Invoke Index constructor (need to use in-place version) */ - // // py::tuple index_params = t[0].cast(); - // // new (&index) Index(index_params[0].cast(), index_params[1].cast()); - // index.setIndexParams(t); - // // if (t.size() != 10) - // // throw std::runtime_error("Invalid state!"); - // // - // - // // index.index_inited=t[2].cast(); - // // index.ep_added=t[3].cast(); - // // index.normalize=t[4].cast(); - // // index.num_threads_default=t[5].cast(); - // // - // // if (index.index_inited){ - // // std::stringstream input(t[6].cast(), std::stringstream::in|std::stringstream::binary); - // // index.loadIndexFromStream(input, t[8].cast()); // use max_elements from state - // // index.appr_alg->ef_=(t[7].cast()); - // // } - // - // }) .def("__repr__", [](const Index &a) { return ""; }); From 64c51548373011a1f48eadb6d6b28f93edc8cfd1 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Sun, 25 Oct 2020 16:10:17 +0000 Subject: [PATCH 21/58] expose static method of Index class as copy constructor in python --- python_bindings/bindings.cpp | 90 +++++++++++++----------------------- 1 file changed, 33 insertions(+), 57 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index f066368c..cd1d41f7 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -73,43 +73,6 @@ inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn } -// -// std::priority_queue> -// searchKnn(const void *query_data, size_t k) const { -// std::priority_queue> result; -// if (cur_element_count == 0) return result; -// -// tableint currObj = enterpoint_node_; -// dist_t 
curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); -// -// for (int level = maxlevel_; level > 0; level--) { -// bool changed = true; -// while (changed) { -// changed = false; -// unsigned int *data; -// -// data = (unsigned int *) get_linklist(currObj, level); -// int size = getListCount(data); -// metric_hops++; -// metric_distance_computations+=size; -// -// tableint *datal = (tableint *) (data + 1); -// for (int i = 0; i < size; i++) { -// tableint cand = datal[i]; -// if (cand < 0 || cand > max_elements_) -// throw std::runtime_error("cand error"); -// dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); -// -// if (d < curdist) { -// curdist = d; -// currObj = cand; -// changed = true; -// } -// } -// } -// } -// - template class Index { @@ -321,10 +284,18 @@ class Index { py::tuple getAnnData() const { + std::unique_lock templock(appr_alg->global); unsigned int level0_npy_size = appr_alg->cur_element_count * appr_alg->size_data_per_element_; - unsigned int link_npy_size = appr_alg->cur_element_count * appr_alg->maxlevel_ * appr_alg->size_links_per_element_; - unsigned int link_npy_stride = appr_alg->maxlevel_ * appr_alg->size_links_per_element_; + unsigned int link_npy_size = 0; + std::vector link_npy_offsets(appr_alg->cur_element_count); + + for (size_t i = 0; i < appr_alg->cur_element_count; i++){ + unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; + link_npy_offsets[i]=link_npy_size; + if (linkListSize) + link_npy_size += linkListSize; + } char* data_level0_npy = (char *) malloc(level0_npy_size); char* link_list_npy = (char *) malloc(link_npy_size); @@ -338,7 +309,7 @@ class Index { for (size_t i = 0; i < appr_alg->cur_element_count; i++){ unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? 
appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; if (linkListSize){ - memcpy(link_list_npy+(link_npy_stride * i), appr_alg->linkLists_[i], linkListSize); + memcpy(link_list_npy+link_npy_offsets[i], appr_alg->linkLists_[i], linkListSize); } } @@ -367,12 +338,12 @@ class Index { appr_alg->size_links_per_element_, appr_alg->label_lookup_, appr_alg->element_levels_, - new py::array_t( + py::array_t( {level0_npy_size}, // shape {sizeof(char)}, // C-style contiguous strides for double data_level0_npy, // the data pointer free_when_done_l0), - new py::array_t( + py::array_t( {link_npy_size}, // shape {sizeof(char)}, // C-style contiguous strides for double link_list_npy, // the data pointer @@ -401,7 +372,6 @@ class Index { new_index->seed = index_params[6].cast(); - if (index_inited_){ //// hnswlib::HierarchicalNSW(l2space, maxElements, M, efConstruction, random_seed); new_index->appr_alg = new hnswlib::HierarchicalNSW(new_index->l2space, ann_params[1].cast(), ann_params[10].cast(), ann_params[12].cast(), new_index->seed); @@ -420,7 +390,14 @@ class Index { return new_index; } + static Index * createFromIndex(const Index & index) { + return createFromParams(index.getIndexParams()); + } + void setAnnData(const py::tuple t) { + + std::unique_lock templock(appr_alg->global); + assert_true(appr_alg->offsetLevel0_ == t[0].cast(), "Invalid value of offsetLevel0_ "); assert_true(appr_alg->max_elements_ == t[1].cast(), "Invalid value of max_elements_ "); @@ -448,6 +425,8 @@ class Index { auto data_level0_npy = t[18].cast>(); auto link_list_npy = t[19].cast>(); + + for (auto el: label_lookup_dict){ appr_alg->label_lookup_.insert( std::make_pair( @@ -462,6 +441,15 @@ class Index { idx++; } + unsigned int link_npy_size = 0; + std::vector link_npy_offsets(appr_alg->cur_element_count); + + for (size_t i = 0; i < appr_alg->cur_element_count; i++){ + unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? 
appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; + link_npy_offsets[i]=link_npy_size; + if (linkListSize) + link_npy_size += linkListSize; + } memcpy(appr_alg->data_level0_memory_, data_level0_npy.data(), data_level0_npy.nbytes()); @@ -474,18 +462,12 @@ class Index { if (appr_alg->linkLists_[i] == nullptr) throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklist"); - memcpy(appr_alg->linkLists_[i], (link_list_npy.data()+(appr_alg->maxlevel_ * appr_alg->size_links_per_element_ * i)), linkListSize); + memcpy(appr_alg->linkLists_[i], link_list_npy.data()+link_npy_offsets[i], linkListSize); } } - // TODO: use global lock for de-/serialization - // std::unique_lock templock(global); - // int maxlevelcopy = maxlevel_; - // if (curlevel <= maxlevelcopy) - // templock.unlock(); - } @@ -609,9 +591,9 @@ class Index { PYBIND11_PLUGIN(hnswlib) { py::module m("hnswlib"); - // py::class_, std::shared_ptr >>(m, "Index") py::class_>(m, "Index") .def(py::init(&Index::createFromParams), py::arg("params")) //createFromParams(const py::tuple t) + .def(py::init(&Index::createFromIndex), py::arg("index")) .def(py::init(), py::arg("space"), py::arg("dim")) .def("init_index", &Index::init_new_index, py::arg("max_elements"), py::arg("M")=16, py::arg("ef_construction")=200, py::arg("random_seed")=100) .def("knn_query", &Index::knnQuery_return_numpy, py::arg("data"), py::arg("k")=1, py::arg("num_threads")=-1) @@ -661,12 +643,6 @@ PYBIND11_PLUGIN(hnswlib) { } )) - .def("check_integrity", [](const Index & index) { - index.appr_alg->checkIntegrity(); - std::cout<< index.default_ef << " " << index.appr_alg->ef_ << std::endl; - return index.appr_alg->ef_; - }) - .def("__repr__", [](const Index &a) { return ""; }); From 7b445c8aee93b13166ecc6e6a6aa694ee65bdc6c Mon Sep 17 00:00:00 2001 From: dbespalov Date: Sun, 25 Oct 2020 23:41:37 +0000 Subject: [PATCH 22/58] do not waste space when returning serialized appr_alg->linkLists_ --- 
python_bindings/bindings.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index cd1d41f7..35783868 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -360,7 +360,7 @@ class Index { } - static Index * createFromParams(const py::tuple t) { + static Index * createFromParams(const py::tuple & t) { py::tuple index_params=t[0].cast(); py::tuple ann_params=t[1].cast(); @@ -394,7 +394,7 @@ class Index { return createFromParams(index.getIndexParams()); } - void setAnnData(const py::tuple t) { + void setAnnData(const py::tuple & t) { std::unique_lock templock(appr_alg->global); @@ -471,7 +471,7 @@ class Index { } - py::object knnQuery_return_numpy(py::object input, size_t k = 1, int num_threads = -1) { + py::object knnQuery_return_numpy(py::object & input, size_t k = 1, int num_threads = -1) { py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); auto buffer = items.request(); @@ -636,7 +636,7 @@ PYBIND11_PLUGIN(hnswlib) { /* Return a tuple that fully encodes the state of the object */ return ind.getIndexParams(); }, - [](py::tuple t) { // __setstate__ + [](py::tuple & t) { // __setstate__ if (t.size() != 2) throw std::runtime_error("Invalid state!"); return Index::createFromParams(t); From c02f1dccbf121cf58cfade0477602a5120014071 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 26 Oct 2020 22:58:10 +0000 Subject: [PATCH 23/58] serialize element_lookup_ and element_level_ as array_t arrays; pass python types by value everywhere --- python_bindings/bindings.cpp | 88 ++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 29 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 35783868..25ac4bd1 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -299,12 +299,27 @@ class Index { char* data_level0_npy = (char *) malloc(level0_npy_size); char* 
link_list_npy = (char *) malloc(link_npy_size); + int* element_levels_npy = (int *) malloc(appr_alg->element_levels_.size()*sizeof(int)); + + // std::unordered_map label_lookup_; + hnswlib::labeltype* label_lookup_key_npy = (hnswlib::labeltype *) malloc(appr_alg->label_lookup_.size()*sizeof(hnswlib::labeltype)); + hnswlib::tableint* label_lookup_val_npy = (hnswlib::tableint *) malloc(appr_alg->label_lookup_.size()*sizeof(hnswlib::tableint)); + + memset(label_lookup_key_npy, -1, appr_alg->label_lookup_.size()*sizeof(hnswlib::labeltype)); + memset(label_lookup_val_npy, -1, appr_alg->label_lookup_.size()*sizeof(hnswlib::tableint)); + + size_t idx=0; + for ( auto it = appr_alg->label_lookup_.begin(); it != appr_alg->label_lookup_.end(); ++it ){ + label_lookup_key_npy[idx]= it->first; + label_lookup_val_npy[idx]= it->second; + idx++; + } memset(data_level0_npy, 0, level0_npy_size); memset(link_list_npy, 0, link_npy_size); memcpy(data_level0_npy, appr_alg->data_level0_memory_, level0_npy_size); - + memcpy(element_levels_npy, appr_alg->element_levels_.data(), appr_alg->element_levels_.size() * sizeof(int)); for (size_t i = 0; i < appr_alg->cur_element_count; i++){ unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? 
appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; @@ -316,6 +331,15 @@ class Index { py::capsule free_when_done_l0(data_level0_npy, [](void *f) { delete[] f; }); + py::capsule free_when_done_lvl(element_levels_npy, [](void *f) { + delete[] f; + }); + py::capsule free_when_done_lb(label_lookup_key_npy, [](void *f) { + delete[] f; + }); + py::capsule free_when_done_id(label_lookup_val_npy, [](void *f) { + delete[] f; + }); py::capsule free_when_done_ll(link_list_npy, [](void *f) { delete[] f; }); @@ -336,8 +360,21 @@ class Index { appr_alg->ef_, appr_alg->has_deletions_, appr_alg->size_links_per_element_, - appr_alg->label_lookup_, - appr_alg->element_levels_, + py::array_t( + {appr_alg->label_lookup_.size()}, // shape + {sizeof(hnswlib::labeltype)}, // C-style contiguous strides for double + label_lookup_key_npy, // the data pointer + free_when_done_lb), + py::array_t( + {appr_alg->label_lookup_.size()}, // shape + {sizeof(hnswlib::tableint)}, // C-style contiguous strides for double + label_lookup_val_npy, // the data pointer + free_when_done_id), + py::array_t( + {appr_alg->element_levels_.size()}, // shape + {sizeof(int)}, // C-style contiguous strides for double + element_levels_npy, // the data pointer + free_when_done_lvl), py::array_t( {level0_npy_size}, // shape {sizeof(char)}, // C-style contiguous strides for double @@ -360,7 +397,7 @@ class Index { } - static Index * createFromParams(const py::tuple & t) { + static Index * createFromParams(const py::tuple t) { py::tuple index_params=t[0].cast(); py::tuple ann_params=t[1].cast(); @@ -394,7 +431,7 @@ class Index { return createFromParams(index.getIndexParams()); } - void setAnnData(const py::tuple & t) { + void setAnnData(const py::tuple t) { std::unique_lock templock(appr_alg->global); @@ -420,26 +457,23 @@ class Index { appr_alg->has_deletions_=t[14].cast(); assert_true(appr_alg->size_links_per_element_ == t[15].cast(), "Invalid value of size_links_per_element_ "); - auto 
label_lookup_dict = t[16].cast(); - auto element_levels_list = t[17].cast(); - auto data_level0_npy = t[18].cast>(); - auto link_list_npy = t[19].cast>(); - - + // std::unordered_map label_lookup_; + auto label_lookup_key_npy = t[16].cast >(); + auto label_lookup_val_npy = t[17].cast >(); + auto element_levels_npy = t[18].cast >(); + auto data_level0_npy = t[19].cast >(); + auto link_list_npy = t[20].cast >(); - for (auto el: label_lookup_dict){ - appr_alg->label_lookup_.insert( - std::make_pair( - el.first.cast(), - el.second.cast())); + for (size_t i = 0; i < appr_alg->cur_element_count; i++){ + if (label_lookup_val_npy.data()[i] < 0){ + throw std::runtime_error("internal id cannot be negative!"); + } + else{ + appr_alg->label_lookup_.insert(std::make_pair(label_lookup_key_npy.data()[i], label_lookup_val_npy.data()[i])); + } } - - int idx = 0; - for (auto el : element_levels_list){ - appr_alg->element_levels_[idx]=el.cast(); - idx++; - } + memcpy(appr_alg->element_levels_.data(), element_levels_npy.data(), element_levels_npy.nbytes()); unsigned int link_npy_size = 0; std::vector link_npy_offsets(appr_alg->cur_element_count); @@ -467,11 +501,9 @@ class Index { } } +} - } - - - py::object knnQuery_return_numpy(py::object & input, size_t k = 1, int num_threads = -1) { + py::object knnQuery_return_numpy(py::object input, size_t k = 1, int num_threads = -1) { py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); auto buffer = items.request(); @@ -586,8 +618,6 @@ class Index { }; - - PYBIND11_PLUGIN(hnswlib) { py::module m("hnswlib"); @@ -636,7 +666,7 @@ PYBIND11_PLUGIN(hnswlib) { /* Return a tuple that fully encodes the state of the object */ return ind.getIndexParams(); }, - [](py::tuple & t) { // __setstate__ + [](py::tuple t) { // __setstate__ if (t.size() != 2) throw std::runtime_error("Invalid state!"); return Index::createFromParams(t); From 1f251021d6bb581ce1d7e9054de2bd54eadf37df Mon Sep 17 00:00:00 2001 From: dbespalov Date: Tue, 3 
Nov 2020 05:47:46 +0000 Subject: [PATCH 24/58] warn that serialization is not thread safe with add_items --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 28accb84..4d74b003 100644 --- a/README.md +++ b/README.md @@ -92,9 +92,9 @@ Read-only properties of `hnswlib.Index` class: Properties of `hnswlib.Index` that support reading and writing: -* `ef` - parameter controlling query time/accuracy trade-off. Note that setting property `p.ef` prior to index initialization with `p.init_index(...)` will raise an error. +* `ef` - parameter controlling query time/accuracy trade-off. -* `num_threads` - default number of threads to use in `add_items` or `knn_query`. Note that calling `p.set_num_threads(3)` is equivalent to setting `p.num_threads=3`. +* `num_threads` - default number of threads to use in `add_items` or `knn_query`. Note that calling `p.set_num_threads(3)` is equivalent to `p.num_threads=3`. @@ -127,7 +127,9 @@ p.set_ef(50) # ef should always be > k # Query dataset, k - number of closest elements (returns 2 numpy arrays) labels, distances = p.knn_query(data, k = 1) -# Index objects support pickling: +# Index objects support pickling +# WARNING: serialization via pickle.dumps(p) or p.__getstate__() is NOT thread-safe with p.add_items method! 
+# Note: ef parameter is included in serialization; random number generator is initialized with random_seeed on Index load p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p using pickle round-trip ### Index parameters are exposed as class properties: From 1165370b76c4ddf2f67e3b0913f079386a34fc40 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Tue, 3 Nov 2020 05:48:44 +0000 Subject: [PATCH 25/58] warn that serialization is not thread safe with add_items; add todo block for serialization of random generators --- python_bindings/bindings.cpp | 41 +++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 25ac4bd1..38dcb18e 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -154,14 +154,14 @@ class Index { cur_l = appr_alg->cur_element_count; } - void normalize_vector(float *data, float *norm_array){ - float norm=0.0f; - for(int i=0;i items(input); @@ -185,7 +185,6 @@ class Index { throw std::runtime_error("wrong dimensionality of the vectors"); // avoid using threads when the number of searches is small: - if(rows<=num_threads*4){ num_threads=1; } @@ -284,6 +283,7 @@ class Index { py::tuple getAnnData() const { + /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ std::unique_lock templock(appr_alg->global); unsigned int level0_npy_size = appr_alg->cur_element_count * appr_alg->size_data_per_element_; @@ -301,7 +301,6 @@ class Index { char* link_list_npy = (char *) malloc(link_npy_size); int* element_levels_npy = (int *) malloc(appr_alg->element_levels_.size()*sizeof(int)); - // std::unordered_map label_lookup_; hnswlib::labeltype* label_lookup_key_npy = (hnswlib::labeltype *) malloc(appr_alg->label_lookup_.size()*sizeof(hnswlib::labeltype)); hnswlib::tableint* label_lookup_val_npy = (hnswlib::tableint *) malloc(appr_alg->label_lookup_.size()*sizeof(hnswlib::tableint)); @@ -315,7 +314,6 @@ class 
Index { idx++; } - memset(data_level0_npy, 0, level0_npy_size); memset(link_list_npy, 0, link_npy_size); memcpy(data_level0_npy, appr_alg->data_level0_memory_, level0_npy_size); @@ -391,8 +389,12 @@ class Index { py::tuple getIndexParams() const { + /* TODO: serialize state of random generators appr_alg->level_generator_ and appr_alg->update_probability_generator_ */ + /* for full reproducibility / to avoid re-initializing generators inside Index::createFromParams */ return py::make_tuple(py::make_tuple(space_name, dim, index_inited, ep_added, normalize, num_threads_default, seed, default_ef), index_inited == true ? getAnnData() : py::make_tuple()); + /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + } @@ -407,10 +409,11 @@ class Index { Index *new_index = new Index(index_params[0].cast(), index_params[1].cast()); + /* TODO: deserialize state of random generators into new_index->level_generator_ and new_index->update_probability_generator_ */ + /* for full reproducibility / state of generators is serialized inside Index::getIndexParams */ new_index->seed = index_params[6].cast(); if (index_inited_){ - //// hnswlib::HierarchicalNSW(l2space, maxElements, M, efConstruction, random_seed); new_index->appr_alg = new hnswlib::HierarchicalNSW(new_index->l2space, ann_params[1].cast(), ann_params[10].cast(), ann_params[12].cast(), new_index->seed); new_index->cur_l = ann_params[2].cast(); } @@ -428,11 +431,14 @@ class Index { } static Index * createFromIndex(const Index & index) { - return createFromParams(index.getIndexParams()); + /* WARNING: Index::getIndexParams is not thread-safe with Index::addItems */ + return createFromParams(index.getIndexParams()); } + void setAnnData(const py::tuple t) { - + /* WARNING: Index::setAnnData is not thread-safe with Index::addItems */ + std::unique_lock templock(appr_alg->global); assert_true(appr_alg->offsetLevel0_ == t[0].cast(), "Invalid value of offsetLevel0_ "); @@ -457,7 +463,6 @@ class Index { 
appr_alg->has_deletions_=t[14].cast(); assert_true(appr_alg->size_links_per_element_ == t[15].cast(), "Invalid value of size_links_per_element_ "); - // std::unordered_map label_lookup_; auto label_lookup_key_npy = t[16].cast >(); auto label_lookup_val_npy = t[17].cast >(); auto element_levels_npy = t[18].cast >(); @@ -622,8 +627,9 @@ PYBIND11_PLUGIN(hnswlib) { py::module m("hnswlib"); py::class_>(m, "Index") - .def(py::init(&Index::createFromParams), py::arg("params")) //createFromParams(const py::tuple t) - .def(py::init(&Index::createFromIndex), py::arg("index")) + .def(py::init(&Index::createFromParams), py::arg("params")) + /* WARNING: Index::createFromIndex is not thread-safe with Index::addItems */ + .def(py::init(&Index::createFromIndex), py::arg("index")) .def(py::init(), py::arg("space"), py::arg("dim")) .def("init_index", &Index::init_new_index, py::arg("max_elements"), py::arg("M")=16, py::arg("ef_construction")=200, py::arg("random_seed")=100) .def("knn_query", &Index::knnQuery_return_numpy, py::arg("data"), py::arg("k")=1, py::arg("num_threads")=-1) @@ -664,6 +670,7 @@ PYBIND11_PLUGIN(hnswlib) { .def(py::pickle( [](const Index &ind) { // __getstate__ /* Return a tuple that fully encodes the state of the object */ + /* WARNING: Index::getIndexParams is not thread-safe with Index::addItems */ return ind.getIndexParams(); }, [](py::tuple t) { // __setstate__ From 2c040e67252ad771409eb2a03fa0dead4a0a1d56 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Tue, 3 Nov 2020 05:48:57 +0000 Subject: [PATCH 26/58] remove camel casing --- python_bindings/tests/bindings_test_pickle.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/python_bindings/tests/bindings_test_pickle.py b/python_bindings/tests/bindings_test_pickle.py index a6b74a9d..6c3a826a 100644 --- a/python_bindings/tests/bindings_test_pickle.py +++ b/python_bindings/tests/bindings_test_pickle.py @@ -19,9 +19,8 @@ def brute_force_distances(metric, items, query_items, k): 
for jj in range(query_items.shape[0]): dists[jj,ii]=get_dist(metric, items[ii, :], query_items[jj, :]) - labels = np.argsort(dists, axis=1) - dists = np.sort(dists, axis=1) - + labels = np.argsort(dists, axis=1) # equivalent, but faster: np.argpartition(dists, range(k), axis=1) + dists = np.sort(dists, axis=1) # equivalent, but faster: np.partition(dists, range(k), axis=1) return labels[:,:k], dists[:,:k] @@ -141,19 +140,14 @@ def setUp(self): ### number of value pairs that are allowed to be different in d1 and d2 ### i.e., number of values that are (d1-d2)**2>1e-3 - def testInnerProductSpace(self): + def test_inner_product_space(self): test_space_main(self, 'ip', 48) - def testL2Space(self): + def test_l2_space(self): test_space_main(self, 'l2', 153) - def testCosineSpace(self): + def test_cosine_space(self): test_space_main(self, 'cosine', 512) - # - # for space,dim in [('ip', 48), ('l2', 152), ('cosine', 512)]: - # test_space_main - - if __name__ == "__main__": unittest.main() From 6298996c76c87f98d94aa2361636a001c31a3cf6 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Fri, 6 Nov 2020 03:39:49 +0000 Subject: [PATCH 27/58] add static const int data member to class Index that stores serialization version; serialization version is returned as the first element of the parameter tuple; serialization version must match when instantiating Index object from parameter tuple --- python_bindings/bindings.cpp | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 38dcb18e..e8a16512 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -74,6 +74,7 @@ inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn } + template class Index { public: @@ -96,8 +97,10 @@ class Index { num_threads_default = std::thread::hardware_concurrency(); default_ef=10; - } + + static const int ser_version = 1; // serialization version + std::string 
space_name; int dim; size_t seed; @@ -282,8 +285,8 @@ class Index { } - py::tuple getAnnData() const { - /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + py::tuple getAnnData() const { /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + std::unique_lock templock(appr_alg->global); unsigned int level0_npy_size = appr_alg->cur_element_count * appr_alg->size_data_per_element_; @@ -391,7 +394,9 @@ class Index { py::tuple getIndexParams() const { /* TODO: serialize state of random generators appr_alg->level_generator_ and appr_alg->update_probability_generator_ */ /* for full reproducibility / to avoid re-initializing generators inside Index::createFromParams */ - return py::make_tuple(py::make_tuple(space_name, dim, index_inited, ep_added, normalize, num_threads_default, seed, default_ef), + + return py::make_tuple(py::int_(Index::ser_version), // serialization version + py::make_tuple(space_name, dim, index_inited, ep_added, normalize, num_threads_default, seed, default_ef), index_inited == true ? 
getAnnData() : py::make_tuple()); /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ @@ -400,8 +405,12 @@ class Index { static Index * createFromParams(const py::tuple t) { - py::tuple index_params=t[0].cast(); - py::tuple ann_params=t[1].cast(); + + if (py::int_(Index::ser_version) != t[0].cast()) // check serialization version + throw std::runtime_error("Serialization version mismatch!"); + + py::tuple index_params=t[1].cast(); + py::tuple ann_params=t[2].cast(); auto space_name_=index_params[0].cast(); auto dim_=index_params[1].cast(); @@ -623,6 +632,8 @@ class Index { }; + + PYBIND11_PLUGIN(hnswlib) { py::module m("hnswlib"); @@ -674,7 +685,7 @@ PYBIND11_PLUGIN(hnswlib) { return ind.getIndexParams(); }, [](py::tuple t) { // __setstate__ - if (t.size() != 2) + if (t.size() != 3) throw std::runtime_error("Invalid state!"); return Index::createFromParams(t); } From c8276d86982abd5983377f92ef3fab6b41e1bd6b Mon Sep 17 00:00:00 2001 From: dbespalov Date: Fri, 6 Nov 2020 04:32:13 +0000 Subject: [PATCH 28/58] add todo block to convert parameter tuple to dicts --- python_bindings/bindings.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index e8a16512..75a80381 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -393,12 +393,14 @@ class Index { py::tuple getIndexParams() const { /* TODO: serialize state of random generators appr_alg->level_generator_ and appr_alg->update_probability_generator_ */ - /* for full reproducibility / to avoid re-initializing generators inside Index::createFromParams */ + /* for full reproducibility / to avoid re-initializing generators inside Index::createFromParams */ return py::make_tuple(py::int_(Index::ser_version), // serialization version + + /* TODO: convert the following two py::tuple's to py::dict */ py::make_tuple(space_name, dim, index_inited, ep_added, normalize, num_threads_default, seed, 
default_ef), - index_inited == true ? getAnnData() : py::make_tuple()); - /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + index_inited == true ? getAnnData() : py::make_tuple()); /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + } @@ -409,8 +411,8 @@ class Index { if (py::int_(Index::ser_version) != t[0].cast()) // check serialization version throw std::runtime_error("Serialization version mismatch!"); - py::tuple index_params=t[1].cast(); - py::tuple ann_params=t[2].cast(); + py::tuple index_params=t[1].cast(); /* TODO: convert this py::tuple to py::dict */ + py::tuple ann_params=t[2].cast(); /* TODO: convert this py::tuple to py::dict */ auto space_name_=index_params[0].cast(); auto dim_=index_params[1].cast(); From 345f71da372a9b416bec0b7777d30aeaa5877e26 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Fri, 6 Nov 2020 04:41:49 +0000 Subject: [PATCH 29/58] add todo block to convert parameter tuple to dicts --- python_bindings/bindings.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 75a80381..d9396247 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -285,7 +285,7 @@ class Index { } - py::tuple getAnnData() const { /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + py::tuple getAnnData() const { /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ std::unique_lock templock(appr_alg->global); @@ -411,8 +411,8 @@ class Index { if (py::int_(Index::ser_version) != t[0].cast()) // check serialization version throw std::runtime_error("Serialization version mismatch!"); - py::tuple index_params=t[1].cast(); /* TODO: convert this py::tuple to py::dict */ - py::tuple ann_params=t[2].cast(); /* TODO: convert this py::tuple to py::dict */ + py::tuple index_params=t[1].cast(); /* TODO: convert index_params from py::tuple to py::dict */ + py::tuple 
ann_params=t[2].cast(); /* TODO: convert ann_params from py::tuple to py::dict */ auto space_name_=index_params[0].cast(); auto dim_=index_params[1].cast(); @@ -421,7 +421,7 @@ class Index { Index *new_index = new Index(index_params[0].cast(), index_params[1].cast()); /* TODO: deserialize state of random generators into new_index->level_generator_ and new_index->update_probability_generator_ */ - /* for full reproducibility / state of generators is serialized inside Index::getIndexParams */ + /* for full reproducibility / state of generators is serialized inside Index::getIndexParams */ new_index->seed = index_params[6].cast(); if (index_inited_){ @@ -442,13 +442,13 @@ class Index { } static Index * createFromIndex(const Index & index) { - /* WARNING: Index::getIndexParams is not thread-safe with Index::addItems */ + /* WARNING: Index::getIndexParams is not thread-safe with Index::addItems */ return createFromParams(index.getIndexParams()); } void setAnnData(const py::tuple t) { - /* WARNING: Index::setAnnData is not thread-safe with Index::addItems */ + /* WARNING: Index::setAnnData is not thread-safe with Index::addItems */ std::unique_lock templock(appr_alg->global); @@ -641,7 +641,7 @@ PYBIND11_PLUGIN(hnswlib) { py::class_>(m, "Index") .def(py::init(&Index::createFromParams), py::arg("params")) - /* WARNING: Index::createFromIndex is not thread-safe with Index::addItems */ + /* WARNING: Index::createFromIndex is not thread-safe with Index::addItems */ .def(py::init(&Index::createFromIndex), py::arg("index")) .def(py::init(), py::arg("space"), py::arg("dim")) .def("init_index", &Index::init_new_index, py::arg("max_elements"), py::arg("M")=16, py::arg("ef_construction")=200, py::arg("random_seed")=100) @@ -683,7 +683,7 @@ PYBIND11_PLUGIN(hnswlib) { .def(py::pickle( [](const Index &ind) { // __getstate__ /* Return a tuple that fully encodes the state of the object */ - /* WARNING: Index::getIndexParams is not thread-safe with Index::addItems */ + /* WARNING: 
Index::getIndexParams is not thread-safe with Index::addItems */ return ind.getIndexParams(); }, [](py::tuple t) { // __setstate__ From a64a00168ad74bc9c1098d3916ff868c78e6b345 Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Fri, 6 Nov 2020 17:41:56 +0300 Subject: [PATCH 30/58] Fixes of some typos in readme --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 559c5dfd..cee129d8 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,9 @@ Header-only C++ HNSW implementation with python bindings. Paper's code for the H **NEWS:** -* **Thanks to Apoorv Sharma [@apoorv-sharma](https://github.com/apoorv-sharma), hnswlib now supports true element updates (the interface remained the same, but when you the perfromance/memory should not degrade as you update the element embeddinds).** +* **Thanks to Apoorv Sharma [@apoorv-sharma](https://github.com/apoorv-sharma), hnswlib now supports true element updates (the interface remained the same, but when you the perfromance/memory should not degrade as you update the element embeddings).** -* **Thanks to Dmitry [@2ooom](https://github.com/2ooom), hnswlib got a boost in performance for vector dimensions that are not mutiple of 4** +* **Thanks to Dmitry [@2ooom](https://github.com/2ooom), hnswlib got a boost in performance for vector dimensions that are not multiple of 4** * **Thanks to Louis Abraham ([@louisabraham](https://github.com/louisabraham)) hnswlib can now be installed via pip!** @@ -49,14 +49,14 @@ Index methods: * `data_labels` specifies the labels for the data. If index already has the elements with the same labels, their features will be updated. Note that update procedure is slower than insertion of a new element, but more memory- and query-efficient. * Thread-safe with other `add_items` calls, but not with `knn_query`. -* `mark_deleted(data_label)` - marks the element as deleted, so it will be ommited from search results. 
+* `mark_deleted(data_label)` - marks the element as deleted, so it will be omitted from search results. * `resize_index(new_size)` - changes the maximum capacity of the index. Not thread safe with `add_items` and `knn_query`. * `set_ef(ef)` - sets the query time accuracy/speed trade-off, defined by the `ef` parameter ( [ALGO_PARAMS.md](ALGO_PARAMS.md)). Note that the parameter is currently not saved along with the index, so you need to set it manually after loading. -* `knn_query(data, k = 1, num_threads = -1)` make a batch query for `k` closests elements for each element of the +* `knn_query(data, k = 1, num_threads = -1)` make a batch query for `k` closest elements for each element of the * `data` (shape:`N*dim`). Returns a numpy array of (shape:`N*k`). * `num_threads` sets the number of cpu threads to use (-1 means use default). * Thread-safe with other `knn_query` calls, but not with `add_items`. @@ -191,7 +191,7 @@ or you can install via pip: ### Other implementations * Non-metric space library (nmslib) - main library(python, C++), supports exotic distances: https://github.com/nmslib/nmslib -* Faiss libary by facebook, uses own HNSW implementation for coarse quantization (python, C++): +* Faiss library by facebook, uses own HNSW implementation for coarse quantization (python, C++): https://github.com/facebookresearch/faiss * Code for the paper ["Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors"](https://arxiv.org/abs/1802.02422) From ec38db1b80ab5b2a03c9c4d3457e330c314903de Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Wed, 18 Nov 2020 14:29:00 +0300 Subject: [PATCH 31/58] Rename space_name to space on the python side --- python_bindings/bindings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index d9396247..bbfa9f8a 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -655,7 +655,7 @@ PYBIND11_PLUGIN(hnswlib) { 
.def("load_index", &Index::loadIndex, py::arg("path_to_index"), py::arg("max_elements")=0) .def("mark_deleted", &Index::markDeleted, py::arg("label")) .def("resize_index", &Index::resizeIndex, py::arg("new_size")) - .def_readonly("space_name", &Index::space_name) + .def_readonly("space", &Index::space_name) .def_readonly("dim", &Index::dim) .def_readwrite("num_threads", &Index::num_threads_default) .def_property("ef", From a0c207650a1d9fc8535f516213164d7ba450a678 Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Wed, 18 Nov 2020 14:42:01 +0300 Subject: [PATCH 32/58] Add gitignore file to ignore build folders --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..ff7acee9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +python_bindings/hnswlib.egg-info/ +python_bindings/build/ +python_bindings/dist/ +python_bindings/tmp/ From ded26fc70d14aaa2c55981850415b25007b6e2d4 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 23 Nov 2020 06:19:37 +0000 Subject: [PATCH 33/58] use dict for Index serialization --- python_bindings/bindings.cpp | 264 +++++++++++++++++++++-------------- 1 file changed, 161 insertions(+), 103 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index bbfa9f8a..5ae7fcd8 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -98,7 +98,7 @@ class Index { default_ef=10; } - + static const int ser_version = 1; // serialization version std::string space_name; @@ -285,8 +285,10 @@ class Index { } - py::tuple getAnnData() const { /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ - + py::dict getAnnData() const { /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + + + std::unique_lock templock(appr_alg->global); unsigned int level0_npy_size = appr_alg->cur_element_count * appr_alg->size_data_per_element_; @@ -345,140 +347,197 @@ class Index { 
delete[] f; }); - return py::make_tuple(appr_alg->offsetLevel0_, - appr_alg->max_elements_, - appr_alg->cur_element_count, - appr_alg->size_data_per_element_, - appr_alg->label_offset_, - appr_alg->offsetData_, - appr_alg->maxlevel_, - appr_alg->enterpoint_node_, - appr_alg->maxM_, - appr_alg->maxM0_, - appr_alg->M_, - appr_alg->mult_, - appr_alg->ef_construction_, - appr_alg->ef_, - appr_alg->has_deletions_, - appr_alg->size_links_per_element_, - py::array_t( - {appr_alg->label_lookup_.size()}, // shape - {sizeof(hnswlib::labeltype)}, // C-style contiguous strides for double - label_lookup_key_npy, // the data pointer - free_when_done_lb), - py::array_t( - {appr_alg->label_lookup_.size()}, // shape - {sizeof(hnswlib::tableint)}, // C-style contiguous strides for double - label_lookup_val_npy, // the data pointer - free_when_done_id), - py::array_t( - {appr_alg->element_levels_.size()}, // shape - {sizeof(int)}, // C-style contiguous strides for double - element_levels_npy, // the data pointer - free_when_done_lvl), - py::array_t( - {level0_npy_size}, // shape - {sizeof(char)}, // C-style contiguous strides for double - data_level0_npy, // the data pointer - free_when_done_l0), - py::array_t( - {link_npy_size}, // shape - {sizeof(char)}, // C-style contiguous strides for double - link_list_npy, // the data pointer - free_when_done_ll) - ); + /* TODO: serialize state of random generators appr_alg->level_generator_ and appr_alg->update_probability_generator_ */ + /* for full reproducibility / to avoid re-initializing generators inside Index::createFromParams */ + + return py::dict( + "offset_level0"_a=appr_alg->offsetLevel0_, + "max_elements"_a=appr_alg->max_elements_, + "cur_element_count"_a=appr_alg->cur_element_count, + "size_data_per_element"_a=appr_alg->size_data_per_element_, + "label_offset"_a=appr_alg->label_offset_, + "offset_data"_a=appr_alg->offsetData_, + "max_level"_a=appr_alg->maxlevel_, + "enterpoint_node"_a=appr_alg->enterpoint_node_, + 
"max_M"_a=appr_alg->maxM_, + "max_M0"_a=appr_alg->maxM0_, + "M"_a=appr_alg->M_, + "mult"_a=appr_alg->mult_, + "ef_construction"_a=appr_alg->ef_construction_, + "ef"_a=appr_alg->ef_, + "has_deletions"_a=appr_alg->has_deletions_, + "size_links_per_element"_a=appr_alg->size_links_per_element_, + + "label_lookup_external"_a=py::array_t( + {appr_alg->label_lookup_.size()}, // shape + {sizeof(hnswlib::labeltype)}, // C-style contiguous strides for double + label_lookup_key_npy, // the data pointer + free_when_done_lb), + + "label_lookup_internal"_a=py::array_t( + {appr_alg->label_lookup_.size()}, // shape + {sizeof(hnswlib::tableint)}, // C-style contiguous strides for double + label_lookup_val_npy, // the data pointer + free_when_done_id), + + "element_levels"_a=py::array_t( + {appr_alg->element_levels_.size()}, // shape + {sizeof(int)}, // C-style contiguous strides for double + element_levels_npy, // the data pointer + free_when_done_lvl), + + // linkLists_,element_levels_,data_level0_memory_ + "data_level0"_a=py::array_t( + {level0_npy_size}, // shape + {sizeof(char)}, // C-style contiguous strides for double + data_level0_npy, // the data pointer + free_when_done_l0), + + "link_lists"_a=py::array_t( + {link_npy_size}, // shape + {sizeof(char)}, // C-style contiguous strides for double + link_list_npy, // the data pointer + free_when_done_ll) + + ); + + // return py::make_tuple(appr_alg->offsetLevel0_, + // appr_alg->max_elements_, + // appr_alg->cur_element_count, + // appr_alg->size_data_per_element_, + // appr_alg->label_offset_, + // appr_alg->offsetData_, + // appr_alg->maxlevel_, + // appr_alg->enterpoint_node_, + // appr_alg->maxM_, + // appr_alg->maxM0_, + // appr_alg->M_, + // appr_alg->mult_, + // appr_alg->ef_construction_, + // appr_alg->ef_, + // appr_alg->has_deletions_, + // appr_alg->size_links_per_element_, + // py::array_t( + // {appr_alg->label_lookup_.size()}, // shape + // {sizeof(hnswlib::labeltype)}, // C-style contiguous strides for double + 
// label_lookup_key_npy, // the data pointer + // free_when_done_lb), + // py::array_t( + // {appr_alg->label_lookup_.size()}, // shape + // {sizeof(hnswlib::tableint)}, // C-style contiguous strides for double + // label_lookup_val_npy, // the data pointer + // free_when_done_id), + // py::array_t( + // {appr_alg->element_levels_.size()}, // shape + // {sizeof(int)}, // C-style contiguous strides for double + // element_levels_npy, // the data pointer + // free_when_done_lvl), + // py::array_t( + // {level0_npy_size}, // shape + // {sizeof(char)}, // C-style contiguous strides for double + // data_level0_npy, // the data pointer + // free_when_done_l0), + // py::array_t( + // {link_npy_size}, // shape + // {sizeof(char)}, // C-style contiguous strides for double + // link_list_npy, // the data pointer + // free_when_done_ll) + // ); } - py::tuple getIndexParams() const { - /* TODO: serialize state of random generators appr_alg->level_generator_ and appr_alg->update_probability_generator_ */ - /* for full reproducibility / to avoid re-initializing generators inside Index::createFromParams */ - - return py::make_tuple(py::int_(Index::ser_version), // serialization version - - /* TODO: convert the following two py::tuple's to py::dict */ - py::make_tuple(space_name, dim, index_inited, ep_added, normalize, num_threads_default, seed, default_ef), - index_inited == true ? 
getAnnData() : py::make_tuple()); /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ - - + py::tuple getIndexParams() const { /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + auto params = py::dict( + "ser_version"_a=py::int_(Index::ser_version), //serialization version + "space"_a=space_name, + "dim"_a=dim, + "index_inited"_a=index_inited, + "ep_added"_a=ep_added, + "normalize"_a=normalize, + "num_threads"_a=num_threads_default, + "seed"_a=seed, + "ef"_a=default_ef + ); + + if(index_inited == false) + return params; + + auto ann_params = getAnnData(); + + return py::dict(**params, **ann_params); } - static Index * createFromParams(const py::tuple t) { - - if (py::int_(Index::ser_version) != t[0].cast()) // check serialization version - throw std::runtime_error("Serialization version mismatch!"); + static Index * createFromParams(const py::dict d) { - py::tuple index_params=t[1].cast(); /* TODO: convert index_params from py::tuple to py::dict */ - py::tuple ann_params=t[2].cast(); /* TODO: convert ann_params from py::tuple to py::dict */ + // check serialization version + assert_true(py::int_(Index::ser_version) >= d["ser_version"].cast(), "Invalid serialization version!"); - auto space_name_=index_params[0].cast(); - auto dim_=index_params[1].cast(); - auto index_inited_=index_params[2].cast(); + auto space_name_=d["space"].cast(); + auto dim_=d["dim"].cast(); + auto index_inited_=d["index_inited"].cast(); - Index *new_index = new Index(index_params[0].cast(), index_params[1].cast()); + Index *new_index = new Index(space_name_, dim_); /* TODO: deserialize state of random generators into new_index->level_generator_ and new_index->update_probability_generator_ */ /* for full reproducibility / state of generators is serialized inside Index::getIndexParams */ - new_index->seed = index_params[6].cast(); + new_index->seed = d["seed"].cast(); if (index_inited_){ - new_index->appr_alg = new 
hnswlib::HierarchicalNSW(new_index->l2space, ann_params[1].cast(), ann_params[10].cast(), ann_params[12].cast(), new_index->seed); - new_index->cur_l = ann_params[2].cast(); + new_index->appr_alg = new hnswlib::HierarchicalNSW(new_index->l2space, d["max_elements"].cast(), d["M"].cast(), d["ef_construction"].cast(), new_index->seed); + new_index->cur_l = d["cur_element_count"].cast(); } new_index->index_inited = index_inited_; - new_index->ep_added=index_params[3].cast(); - new_index->num_threads_default=index_params[5].cast(); - new_index->default_ef=index_params[7].cast(); + new_index->ep_added=d["ep_added"].cast(); + new_index->num_threads_default=d["num_threads"].cast(); + new_index->default_ef=d["ef"].cast(); if (index_inited_) new_index->setAnnData(ann_params); - return new_index; } static Index * createFromIndex(const Index & index) { - /* WARNING: Index::getIndexParams is not thread-safe with Index::addItems */ - return createFromParams(index.getIndexParams()); + return createFromParams(index.getIndexParams()); } - - void setAnnData(const py::tuple t) { - /* WARNING: Index::setAnnData is not thread-safe with Index::addItems */ - + void setAnnData(const py::dict d) { /* WARNING: Index::setAnnData is not thread-safe with Index::addItems */ + + std::unique_lock templock(appr_alg->global); - assert_true(appr_alg->offsetLevel0_ == t[0].cast(), "Invalid value of offsetLevel0_ "); - assert_true(appr_alg->max_elements_ == t[1].cast(), "Invalid value of max_elements_ "); + assert_true(appr_alg->offsetLevel0_ == d["offset_level0"].cast(), "Invalid value of offsetLevel0_ "); + assert_true(appr_alg->max_elements_ == d["max_elements"].cast(), "Invalid value of max_elements_ "); + + appr_alg->cur_element_count = d["cur_element_count"].cast(); - appr_alg->cur_element_count = t[2].cast(); + assert_true(appr_alg->size_data_per_element_ == d["size_data_per_element"].cast(), "Invalid value of size_data_per_element_ "); + assert_true(appr_alg->label_offset_ == 
d["label_offset"].cast(), "Invalid value of label_offset_ "); + assert_true(appr_alg->offsetData_ == d["offset_data"].cast(), "Invalid value of offsetData_ "); - assert_true(appr_alg->size_data_per_element_ == t[3].cast(), "Invalid value of size_data_per_element_ "); - assert_true(appr_alg->label_offset_ == t[4].cast(), "Invalid value of label_offset_ "); - assert_true(appr_alg->offsetData_ == t[5].cast(), "Invalid value of offsetData_ "); + appr_alg->maxlevel_ = d["max_level"].cast(); + appr_alg->enterpoint_node_ = d["enterpoint_node"].cast(); - appr_alg->maxlevel_ = t[6].cast(); - appr_alg->enterpoint_node_ = t[7].cast(); + assert_true(appr_alg->maxM_ == d["max_M"].cast(), "Invalid value of maxM_ "); + assert_true(appr_alg->maxM0_ == d["max_M0"].cast(), "Invalid value of maxM0_ "); + assert_true(appr_alg->M_ == d["M"].cast(), "Invalid value of M_ "); + assert_true(appr_alg->mult_ == d["mult"].cast(), "Invalid value of mult_ "); + assert_true(appr_alg->ef_construction_ == d["ef_construction"].cast(), "Invalid value of ef_construction_ "); - assert_true(appr_alg->maxM_ == t[8].cast(), "Invalid value of maxM_ "); - assert_true(appr_alg->maxM0_ == t[9].cast(), "Invalid value of maxM0_ "); - assert_true(appr_alg->M_ == t[10].cast(), "Invalid value of M_ "); - assert_true(appr_alg->mult_ == t[11].cast(), "Invalid value of mult_ "); - assert_true(appr_alg->ef_construction_ == t[12].cast(), "Invalid value of ef_construction_ "); + appr_alg->ef_ = d["ef"].cast(); + appr_alg->has_deletions_=d["has_deletions"].cast(); - appr_alg->ef_ = t[13].cast(); - appr_alg->has_deletions_=t[14].cast(); - assert_true(appr_alg->size_links_per_element_ == t[15].cast(), "Invalid value of size_links_per_element_ "); + assert_true(appr_alg->size_links_per_element_ == d["size_links_per_element"].cast(), "Invalid value of size_links_per_element_ "); - auto label_lookup_key_npy = t[16].cast >(); - auto label_lookup_val_npy = t[17].cast >(); - auto element_levels_npy = t[18].cast >(); - auto 
data_level0_npy = t[19].cast >(); - auto link_list_npy = t[20].cast >(); + auto label_lookup_key_npy = d["label_lookup_external"].cast >(); + auto label_lookup_val_npy = d["label_lookup_internal"].cast >(); + auto element_levels_npy = d["element_levels"].cast >(); + auto data_level0_npy = d["data_level0"].cast >(); + auto link_list_npy = d["link_lists"].cast >(); for (size_t i = 0; i < appr_alg->cur_element_count; i++){ if (label_lookup_val_npy.data()[i] < 0){ @@ -516,7 +575,6 @@ class Index { } } - } py::object knnQuery_return_numpy(py::object input, size_t k = 1, int num_threads = -1) { @@ -640,9 +698,9 @@ PYBIND11_PLUGIN(hnswlib) { py::module m("hnswlib"); py::class_>(m, "Index") - .def(py::init(&Index::createFromParams), py::arg("params")) + .def(py::init(&Index::createFromParams), py::arg("params")) /* WARNING: Index::createFromIndex is not thread-safe with Index::addItems */ - .def(py::init(&Index::createFromIndex), py::arg("index")) + .def(py::init(&Index::createFromIndex), py::arg("index")) .def(py::init(), py::arg("space"), py::arg("dim")) .def("init_index", &Index::init_new_index, py::arg("max_elements"), py::arg("M")=16, py::arg("ef_construction")=200, py::arg("random_seed")=100) .def("knn_query", &Index::knnQuery_return_numpy, py::arg("data"), py::arg("k")=1, py::arg("num_threads")=-1) From e845d8afc4978168be255bb9320c0eb73ed4eed1 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Wed, 25 Nov 2020 08:04:52 +0000 Subject: [PATCH 34/58] debugging; have to wrap state dict into a tuple --- python_bindings/bindings.cpp | 84 +++++++++--------------------------- 1 file changed, 20 insertions(+), 64 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 5ae7fcd8..d81ac1f5 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -9,6 +9,7 @@ #include namespace py = pybind11; +using namespace pybind11::literals; // needed to bring in _a literal /* * replacement for the openmp '#pragma omp parallel for' 
directive @@ -73,6 +74,12 @@ inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn } + inline void assert_true(bool expr, const std::string & msg) { + if (expr == false) + throw std::runtime_error("assert failed: "+msg); + return; + } + template @@ -278,12 +285,6 @@ class Index { return ids; } - inline void assert_true(bool expr, const std::string & msg) { - if (expr == false) - throw std::runtime_error("assert failed: "+msg); - return; - } - py::dict getAnnData() const { /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ @@ -401,54 +402,12 @@ class Index { ); - // return py::make_tuple(appr_alg->offsetLevel0_, - // appr_alg->max_elements_, - // appr_alg->cur_element_count, - // appr_alg->size_data_per_element_, - // appr_alg->label_offset_, - // appr_alg->offsetData_, - // appr_alg->maxlevel_, - // appr_alg->enterpoint_node_, - // appr_alg->maxM_, - // appr_alg->maxM0_, - // appr_alg->M_, - // appr_alg->mult_, - // appr_alg->ef_construction_, - // appr_alg->ef_, - // appr_alg->has_deletions_, - // appr_alg->size_links_per_element_, - // py::array_t( - // {appr_alg->label_lookup_.size()}, // shape - // {sizeof(hnswlib::labeltype)}, // C-style contiguous strides for double - // label_lookup_key_npy, // the data pointer - // free_when_done_lb), - // py::array_t( - // {appr_alg->label_lookup_.size()}, // shape - // {sizeof(hnswlib::tableint)}, // C-style contiguous strides for double - // label_lookup_val_npy, // the data pointer - // free_when_done_id), - // py::array_t( - // {appr_alg->element_levels_.size()}, // shape - // {sizeof(int)}, // C-style contiguous strides for double - // element_levels_npy, // the data pointer - // free_when_done_lvl), - // py::array_t( - // {level0_npy_size}, // shape - // {sizeof(char)}, // C-style contiguous strides for double - // data_level0_npy, // the data pointer - // free_when_done_l0), - // py::array_t( - // {link_npy_size}, // shape - // {sizeof(char)}, // C-style contiguous 
strides for double - // link_list_npy, // the data pointer - // free_when_done_ll) - // ); } - py::tuple getIndexParams() const { /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ - auto params = py::dict( + py::dict getIndexParams() const { /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + auto params = py::dict( "ser_version"_a=py::int_(Index::ser_version), //serialization version "space"_a=space_name, "dim"_a=dim, @@ -456,17 +415,15 @@ class Index { "ep_added"_a=ep_added, "normalize"_a=normalize, "num_threads"_a=num_threads_default, - "seed"_a=seed, - "ef"_a=default_ef + "seed"_a=seed ); - if(index_inited == false) - return params; - - auto ann_params = getAnnData(); + if(index_inited == false) + return py::dict( **params, "ef"_a=default_ef); - return py::dict(**params, **ann_params); + auto ann_params = getAnnData(); + return py::dict(**params, **ann_params); } @@ -496,7 +453,7 @@ class Index { new_index->default_ef=d["ef"].cast(); if (index_inited_) - new_index->setAnnData(ann_params); + new_index->setAnnData(d); return new_index; } @@ -739,15 +696,14 @@ PYBIND11_PLUGIN(hnswlib) { }) .def(py::pickle( - [](const Index &ind) { // __getstate__ - /* Return a tuple that fully encodes the state of the object */ - /* WARNING: Index::getIndexParams is not thread-safe with Index::addItems */ - return ind.getIndexParams(); + [](const Index &ind) { // __getstate__ + return py::make_tuple(ind.getIndexParams()); /* Return dict (wrapped in a tuple) that fully encodes state of the Index object */ }, [](py::tuple t) { // __setstate__ - if (t.size() != 3) + if (t.size() != 1) throw std::runtime_error("Invalid state!"); - return Index::createFromParams(t); + + return Index::createFromParams(t[0].cast()); } )) From 6425debb9fded24d7771d67278693ce0b7457724 Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Sat, 28 Nov 2020 18:33:03 +0300 Subject: [PATCH 35/58] Move setup.py into root folder to fix bindings build when symlink 
doesn't work in windows, refactoring of bindings tests --- .gitignore | 9 +++++---- .travis.yml | 2 -- python_bindings/MANIFEST.in => MANIFEST.in | 0 python_bindings/Makefile => Makefile | 2 +- python_bindings/bindings.cpp | 2 +- python_bindings/hnswlib | 1 - python_bindings/tests/bindings_test.py | 11 +++++++---- python_bindings/tests/bindings_test_labels.py | 17 +++++++++++------ .../requirements.txt => requirements.txt | 0 python_bindings/setup.py => setup.py | 7 ++++--- 10 files changed, 29 insertions(+), 22 deletions(-) rename python_bindings/MANIFEST.in => MANIFEST.in (100%) rename python_bindings/Makefile => Makefile (56%) delete mode 120000 python_bindings/hnswlib rename python_bindings/requirements.txt => requirements.txt (100%) rename python_bindings/setup.py => setup.py (95%) diff --git a/.gitignore b/.gitignore index ff7acee9..ef2b9e50 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ -python_bindings/hnswlib.egg-info/ -python_bindings/build/ -python_bindings/dist/ -python_bindings/tmp/ +hnswlib.egg-info/ +build/ +dist/ +python_bindings/tests/__pycache__/ +*.pyd diff --git a/.travis.yml b/.travis.yml index 6b194926..587c4146 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,11 +6,9 @@ matrix: - python: 3.7 install: - | - cd python_bindings pip install -r requirements.txt python setup.py install script: - | - cd python_bindings python setup.py test diff --git a/python_bindings/MANIFEST.in b/MANIFEST.in similarity index 100% rename from python_bindings/MANIFEST.in rename to MANIFEST.in diff --git a/python_bindings/Makefile b/Makefile similarity index 56% rename from python_bindings/Makefile rename to Makefile index 02ec523b..1420b7c0 100644 --- a/python_bindings/Makefile +++ b/Makefile @@ -9,6 +9,6 @@ test: python3 setup.py test clean: - rm -rf *.egg-info build dist var first_half.bin tests/__pycache__ hnswlib.cpython-36m-darwin.so + rm -rf *.egg-info build dist var tests/__pycache__ hnswlib.cpython*.so .PHONY: dist \ No newline at end of file diff 
--git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index bbfa9f8a..7875ec6e 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -2,7 +2,7 @@ #include #include #include -#include "hnswlib/hnswlib.h" +#include "hnswlib.h" #include #include #include diff --git a/python_bindings/hnswlib b/python_bindings/hnswlib deleted file mode 120000 index 236d6575..00000000 --- a/python_bindings/hnswlib +++ /dev/null @@ -1 +0,0 @@ -../hnswlib \ No newline at end of file diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py index afc663af..009b2164 100644 --- a/python_bindings/tests/bindings_test.py +++ b/python_bindings/tests/bindings_test.py @@ -1,3 +1,4 @@ +import os import unittest @@ -43,16 +44,16 @@ def testRandomSelf(self): self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3) # Serializing and deleting the index: - index_path='first_half.bin' + index_path = 'first_half.bin' print("Saving index to '%s'" % index_path) - p.save_index("first_half.bin") + p.save_index(index_path) del p # Reiniting, loading the index p = hnswlib.Index(space='l2', dim=dim) # you can change the sa - print("\nLoading index from 'first_half.bin'\n") - p.load_index("first_half.bin") + print("\nLoading index from '%s'\n" % index_path) + p.load_index(index_path) print("Adding the second batch of %d elements" % (len(data2))) p.add_items(data2) @@ -61,6 +62,8 @@ def testRandomSelf(self): labels, distances = p.knn_query(data, k=1) self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3) + + os.remove(index_path) if __name__ == "__main__": diff --git a/python_bindings/tests/bindings_test_labels.py b/python_bindings/tests/bindings_test_labels.py index c1887bef..e44b0988 100644 --- a/python_bindings/tests/bindings_test_labels.py +++ b/python_bindings/tests/bindings_test_labels.py @@ -1,3 +1,4 @@ +import os import unittest @@ -56,9 +57,9 @@ def testRandomSelf(self): # 
Serializing and deleting the index. # We need the part to check that serialization is working properly. - index_path='first_half.bin' + index_path = 'first_half.bin' print("Saving index to '%s'" % index_path) - p.save_index("first_half.bin") + p.save_index(index_path) print("Saved. Deleting...") del p print("Deleted") @@ -68,8 +69,8 @@ def testRandomSelf(self): print("Reiniting") p = hnswlib.Index(space='l2', dim=dim) - print("\nLoading index from 'first_half.bin'\n") - p.load_index("first_half.bin") + print("\nLoading index from '%s'\n" % index_path) + p.load_index(index_path) p.set_ef(100) print("Adding the second batch of %d elements" % (len(data2))) @@ -109,9 +110,10 @@ def testRandomSelf(self): print("All the data in data1 are removed") # checking saving/loading index with elements marked as deleted - p.save_index("with_deleted.bin") + del_index_path = "with_deleted.bin" + p.save_index(del_index_path) p = hnswlib.Index(space='l2', dim=dim) - p.load_index("with_deleted.bin") + p.load_index(del_index_path) p.set_ef(100) labels1_after, _ = p.knn_query(data1, k=1) @@ -119,6 +121,9 @@ def testRandomSelf(self): for lb in labels1: if la[0] == lb[0]: self.assertTrue(False) + + os.remove(index_path) + os.remove(del_index_path) diff --git a/python_bindings/requirements.txt b/requirements.txt similarity index 100% rename from python_bindings/requirements.txt rename to requirements.txt diff --git a/python_bindings/setup.py b/setup.py similarity index 95% rename from python_bindings/setup.py rename to setup.py index a6dfb81b..28a644f6 100644 --- a/python_bindings/setup.py +++ b/setup.py @@ -7,7 +7,8 @@ __version__ = '0.4.0' -source_files = ['bindings.cpp'] +source_files = ['./python_bindings/bindings.cpp'] +include_dirs = ['./hnswlib/'] libraries = [] extra_objects = [] @@ -17,7 +18,7 @@ Extension( 'hnswlib', source_files, - # include_dirs=[os.path.join(libdir, "include")], + include_dirs=include_dirs, libraries=libraries, language='c++', extra_objects=extra_objects, @@ 
-112,6 +113,6 @@ def build_extensions(self): ext_modules=ext_modules, install_requires=['pybind11>=2.0', 'numpy'], cmdclass={'build_ext': BuildExt}, - test_suite="tests", + test_suite="python_bindings.tests", zip_safe=False, ) From 376c8cdccff182db17a9c8a5d8254af4a5525220 Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Sat, 28 Nov 2020 18:59:50 +0300 Subject: [PATCH 36/58] Update gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index ef2b9e50..c4045e98 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ hnswlib.egg-info/ build/ dist/ +tmp/ python_bindings/tests/__pycache__/ *.pyd +hnswlib.cpython*.so From 68a8a368e164fce34932dbdf9f5d192b66502e6b Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Sat, 28 Nov 2020 19:09:30 +0300 Subject: [PATCH 37/58] Update Makefile to clean tmp folder --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1420b7c0..792b246e 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,6 @@ test: python3 setup.py test clean: - rm -rf *.egg-info build dist var tests/__pycache__ hnswlib.cpython*.so + rm -rf *.egg-info build dist tmp var tests/__pycache__ hnswlib.cpython*.so .PHONY: dist \ No newline at end of file From 19abf9b0d3a207d5e698bb5f6ea5ee781fe16fc8 Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Sat, 28 Nov 2020 19:27:43 +0300 Subject: [PATCH 38/58] Update readme --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index d8d0eb76..aab5690d 100644 --- a/README.md +++ b/README.md @@ -214,7 +214,6 @@ You can install from sources: ```bash apt-get install -y python-setuptools python-pip pip3 install pybind11 numpy setuptools -cd python_bindings python3 setup.py install ``` From 2799aabe29200a64a4432e20ebe6e70fa3ac42b1 Mon Sep 17 00:00:00 2001 From: Dmitriy Bespalov Date: Mon, 30 Nov 2020 00:07:16 -0500 Subject: [PATCH 39/58] clean assert error message --- python_bindings/bindings.cpp | 6 +++--- 
1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index d81ac1f5..33102166 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -9,7 +9,7 @@ #include namespace py = pybind11; -using namespace pybind11::literals; // needed to bring in _a literal +using namespace pybind11::literals; // needed to bring in _a literal /* * replacement for the openmp '#pragma omp parallel for' directive @@ -76,7 +76,7 @@ inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn inline void assert_true(bool expr, const std::string & msg) { if (expr == false) - throw std::runtime_error("assert failed: "+msg); + throw std::runtime_error("Unpickle Error: "+msg); return; } @@ -696,7 +696,7 @@ PYBIND11_PLUGIN(hnswlib) { }) .def(py::pickle( - [](const Index &ind) { // __getstate__ + [](const Index &ind) { // __getstate__ return py::make_tuple(ind.getIndexParams()); /* Return dict (wrapped in a tuple) that fully encodes state of the Index object */ }, [](py::tuple t) { // __setstate__ From 4c002bceb8ee097b885865958b3c1ede00cd9d14 Mon Sep 17 00:00:00 2001 From: Dmitriy Bespalov Date: Mon, 30 Nov 2020 00:26:13 -0500 Subject: [PATCH 40/58] fix compilation error on osx --- python_bindings/bindings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 33102166..ac2669fd 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -430,7 +430,7 @@ class Index { static Index * createFromParams(const py::dict d) { // check serialization version - assert_true(py::int_(Index::ser_version) >= d["ser_version"].cast(), "Invalid serialization version!"); + assert_true(((int)py::int_(Index::ser_version)) >= d["ser_version"].cast(), "Invalid serialization version!"); auto space_name_=d["space"].cast(); auto dim_=d["dim"].cast(); From 5b2585d76221be0e41217698eaa9e3cc04a21d66 Mon Sep 17 00:00:00 
2001 From: Dmitry Yashunin Date: Tue, 1 Dec 2020 16:15:02 +0300 Subject: [PATCH 41/58] Revert symlink to hnswlib and add windows to build matrix --- .travis.yml | 12 ++++++++---- python_bindings/hnswlib | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) create mode 100644 python_bindings/hnswlib diff --git a/.travis.yml b/.travis.yml index 587c4146..01ee109e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,13 @@ language: python -matrix: - include: - - python: 3.6 - - python: 3.7 +os: + - linux + - windows + +python: + - '3.6' + - '3.7' + install: - | pip install -r requirements.txt diff --git a/python_bindings/hnswlib b/python_bindings/hnswlib new file mode 100644 index 00000000..236d6575 --- /dev/null +++ b/python_bindings/hnswlib @@ -0,0 +1 @@ +../hnswlib \ No newline at end of file From dda9b31230f759dd6a674f41e85a0e3186cbacee Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Tue, 1 Dec 2020 16:53:50 +0300 Subject: [PATCH 42/58] Fix symlink --- python_bindings/hnswlib | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 120000 python_bindings/hnswlib diff --git a/python_bindings/hnswlib b/python_bindings/hnswlib deleted file mode 100644 index 236d6575..00000000 --- a/python_bindings/hnswlib +++ /dev/null @@ -1 +0,0 @@ -../hnswlib \ No newline at end of file diff --git a/python_bindings/hnswlib b/python_bindings/hnswlib new file mode 120000 index 00000000..236d6575 --- /dev/null +++ b/python_bindings/hnswlib @@ -0,0 +1 @@ +../hnswlib \ No newline at end of file From b1994a5a14453c9d1fc4ee9b1a36429f28478581 Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Tue, 1 Dec 2020 17:10:19 +0300 Subject: [PATCH 43/58] Update travis --- .travis.yml | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 01ee109e..f484b517 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,12 +1,30 @@ language: python -os: - - linux - - windows - -python: - - '3.6' - - '3.7' +jobs: 
+ include: + - name: Linux Python 3.6 + os: linux + python: 3.6 + + - name: Linux Python 3.7 + os: linux + python: 3.7 + + - name: Windows Python 3.6 + os: windows + language: shell # 'language: python' is an error on Travis CI Windows + before_install: + - choco install python --version 3.6.0 + - python -m pip install --upgrade pip + env: PATH=/c/Python36:/c/Python36/Scripts:$PATH + + - name: Windows Python 3.7 + os: windows + language: shell # 'language: python' is an error on Travis CI Windows + before_install: + - choco install python --version 3.7.0 + - python -m pip install --upgrade pip + env: PATH=/c/Python37:/c/Python37/Scripts:$PATH install: - | From afd18d2edde0b3176d5eacc8875f52359be1eb12 Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Tue, 1 Dec 2020 17:34:13 +0300 Subject: [PATCH 44/58] Update travis --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index f484b517..893441e9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,6 +16,7 @@ jobs: before_install: - choco install python --version 3.6.0 - python -m pip install --upgrade pip + - python --version env: PATH=/c/Python36:/c/Python36/Scripts:$PATH - name: Windows Python 3.7 @@ -24,6 +25,7 @@ jobs: before_install: - choco install python --version 3.7.0 - python -m pip install --upgrade pip + - python --version env: PATH=/c/Python37:/c/Python37/Scripts:$PATH install: From 6efa48c0003bf797253eb798e53a01aeebf0854f Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Tue, 8 Dec 2020 21:39:27 +0300 Subject: [PATCH 45/58] Add symlink to setup.py instead of hnswlib --- python_bindings/hnswlib | 1 - python_bindings/setup.py | 1 + setup.py | 10 ++++++++-- 3 files changed, 9 insertions(+), 3 deletions(-) delete mode 120000 python_bindings/hnswlib create mode 120000 python_bindings/setup.py diff --git a/python_bindings/hnswlib b/python_bindings/hnswlib deleted file mode 120000 index 236d6575..00000000 --- a/python_bindings/hnswlib +++ /dev/null @@ -1 +0,0 @@ 
-../hnswlib \ No newline at end of file diff --git a/python_bindings/setup.py b/python_bindings/setup.py new file mode 120000 index 00000000..f8f80fc2 --- /dev/null +++ b/python_bindings/setup.py @@ -0,0 +1 @@ +../setup.py \ No newline at end of file diff --git a/setup.py b/setup.py index 28a644f6..002f3893 100644 --- a/setup.py +++ b/setup.py @@ -6,9 +6,15 @@ __version__ = '0.4.0' +# compatibility when run in python_bindings +bindings_dir = 'python_bindings' +if bindings_dir in os.path.basename(os.getcwd()): + source_files = ['./bindings.cpp'] + include_dirs = ['../hnswlib/'] +else: + source_files = ['./python_bindings/bindings.cpp'] + include_dirs = ['./hnswlib/'] -source_files = ['./python_bindings/bindings.cpp'] -include_dirs = ['./hnswlib/'] libraries = [] extra_objects = [] From 9fe639d71f3dc3dd793723395a7510258bf698bb Mon Sep 17 00:00:00 2001 From: uestc-lfs Date: Sun, 13 Dec 2020 00:22:59 +0800 Subject: [PATCH 46/58] fix interface --- examples/searchKnnCloserFirst_test.cpp | 84 ++++++++++++++++++++++++++ hnswlib/bruteforce.h | 18 ------ hnswlib/hnswalg.h | 13 ---- hnswlib/hnswlib.h | 25 +++++++- 4 files changed, 107 insertions(+), 33 deletions(-) create mode 100644 examples/searchKnnCloserFirst_test.cpp diff --git a/examples/searchKnnCloserFirst_test.cpp b/examples/searchKnnCloserFirst_test.cpp new file mode 100644 index 00000000..cc1392c8 --- /dev/null +++ b/examples/searchKnnCloserFirst_test.cpp @@ -0,0 +1,84 @@ +// This is a test file for testing the interface +// >>> virtual std::vector> +// >>> searchKnnCloserFirst(const void* query_data, size_t k) const; +// of class AlgorithmInterface + +#include "../hnswlib/hnswlib.h" + +#include + +#include +#include + +namespace +{ + +using idx_t = hnswlib::labeltype; + +void test() { + int d = 4; + idx_t n = 100; + idx_t nq = 10; + size_t k = 10; + + std::vector data(n * d); + std::vector query(nq * d); + + std::mt19937 rng; + rng.seed(47); + std::uniform_real_distribution<> distrib; + + for (idx_t i = 0; i < n * 
d; ++i) { + data[i] = distrib(rng); + } + for (idx_t i = 0; i < nq * d; ++i) { + query[i] = distrib(rng); + } + + + hnswlib::L2Space space(d); + hnswlib::AlgorithmInterface* alg_brute = new hnswlib::BruteforceSearch(&space, 2 * n); + hnswlib::AlgorithmInterface* alg_hnsw = new hnswlib::HierarchicalNSW(&space, 2 * n); + + for (size_t i = 0; i < n; ++i) { + alg_brute->addPoint(data.data() + d * i, i); + alg_hnsw->addPoint(data.data() + d * i, i); + } + + // test searchKnnCloserFirst of BruteforceSearch + for (size_t j = 0; j < nq; ++j) { + const void* p = query.data() + j * d; + auto gd = alg_brute->searchKnn(p, k); + auto res = alg_brute->searchKnnCloserFirst(p, k); + assert(gd.size() == res.size()); + size_t t = gd.size(); + while (!gd.empty()) { + assert(gd.top() == res[--t]); + gd.pop(); + } + } + for (size_t j = 0; j < nq; ++j) { + const void* p = query.data() + j * d; + auto gd = alg_hnsw->searchKnn(p, k); + auto res = alg_hnsw->searchKnnCloserFirst(p, k); + assert(gd.size() == res.size()); + size_t t = gd.size(); + while (!gd.empty()) { + assert(gd.top() == res[--t]); + gd.pop(); + } + } + + delete alg_brute; + delete alg_hnsw; +} + +} // namespace + +int main() { + std::cout << "Testing ..." 
<< std::endl; + test(); + std::cout << "Test ok" << std::endl; + + return 0; +} diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index 5b1bd655..24260400 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -111,24 +111,6 @@ namespace hnswlib { return topResults; }; - template - std::vector> - searchKnn(const void* query_data, size_t k, Comp comp) { - std::vector> result; - if (cur_element_count == 0) return result; - - auto ret = searchKnn(query_data, k); - - while (!ret.empty()) { - result.push_back(ret.top()); - ret.pop(); - } - - std::sort(result.begin(), result.end(), comp); - - return result; - } - void saveIndex(const std::string &location) { std::ofstream output(location, std::ios::binary); std::streampos position; diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 13df46cb..025b55c1 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -1157,19 +1157,6 @@ namespace hnswlib { return result; }; - int searchKnn(const void* x, - int k, labeltype* labels, dist_t* dists = nullptr) const override { - if (labels == nullptr) return -1; - auto ret = searchKnn(x, k); - for (int i = k - 1; i >= 0; --i) { - if (dists) - dists[i] = ret.top().first; - labels[i] = ret.top().second; - } - return 0; - } - - void checkIntegrity(){ int connections_checked=0; std::vector inbound_connections_num(cur_element_count,0); diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index 6ef54495..9409c388 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -71,13 +71,34 @@ namespace hnswlib { public: virtual void addPoint(const void *datapoint, labeltype label)=0; virtual std::priority_queue> searchKnn(const void *, size_t) const = 0; - virtual int searchKnn(const void* x, - int k, labeltype* labels, dist_t* dists) const = 0; + + // Return k nearest neighbor in the order of closer fist + virtual std::vector> + searchKnnCloserFirst(const void* query_data, size_t k) const; + virtual void saveIndex(const std::string &location)=0; virtual ~AlgorithmInterface(){ 
} }; + template + std::vector> + AlgorithmInterface::searchKnnCloserFirst(const void* query_data, size_t k) const { + std::vector> result; + + // here searchKnn returns the result in the order of further first + auto ret = searchKnn(query_data, k); + { + size_t sz = ret.size(); + result.resize(sz); + while (!ret.empty()) { + result[--sz] = ret.top(); + ret.pop(); + } + } + + return result; + } } From 21c1ad76640201a7bc1d2753cd562dd5979e86e8 Mon Sep 17 00:00:00 2001 From: uestc-lfs Date: Sun, 13 Dec 2020 14:07:10 +0800 Subject: [PATCH 47/58] minor fix --- CMakeLists.txt | 2 ++ hnswlib/hnswalg.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ebee6e6c..31935e0e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,4 +23,6 @@ endif() add_executable(test_updates examples/updates_test.cpp) +add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp) + target_link_libraries(main sift_test) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 025b55c1..a2f72dc7 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -9,8 +9,6 @@ #include #include -#include - namespace hnswlib { typedef unsigned int tableint; typedef unsigned int linklistsizeint; From 21b908fd74acc4223c8dce4fa4737441814c7eb9 Mon Sep 17 00:00:00 2001 From: Jisang Yoon Date: Mon, 4 Jan 2021 21:20:51 +0900 Subject: [PATCH 48/58] Update README.md - add another related project --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 559c5dfd..8a00115b 100644 --- a/README.md +++ b/README.md @@ -204,6 +204,7 @@ https://github.com/dbaranchuk/ivf-hnsw * Java implementation: https://github.com/jelmerk/hnswlib * Java bindings using Java Native Access: https://github.com/stepstone-tech/hnswlib-jna * .Net implementation: https://github.com/microsoft/HNSW.Net +* CUDA implementation: https://github.com/js1010/cuhnsw ### Contributing to the repository Contributions are highly welcome! 
From d2e5a186772fa3d0f73a8a1b57e389a43cd26edf Mon Sep 17 00:00:00 2001 From: Jisang Yoon Date: Mon, 4 Jan 2021 21:22:39 +0900 Subject: [PATCH 49/58] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8a00115b..52f7118b 100644 --- a/README.md +++ b/README.md @@ -203,8 +203,8 @@ https://github.com/dbaranchuk/ivf-hnsw * Python implementation (as a part of the clustering code by by Matteo Dell'Amico): https://github.com/matteodellamico/flexible-clustering * Java implementation: https://github.com/jelmerk/hnswlib * Java bindings using Java Native Access: https://github.com/stepstone-tech/hnswlib-jna -* .Net implementation: https://github.com/microsoft/HNSW.Net -* CUDA implementation: https://github.com/js1010/cuhnsw +* .Net implementation: https://github.com/microsoft/HNSW.Net +* CUDA implementation: https://github.com/js1010/cuhnsw ### Contributing to the repository Contributions are highly welcome! From 6ae02a525d477694c0bd03265ed5cabfa61ea690 Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Wed, 6 Jan 2021 19:09:57 +0300 Subject: [PATCH 50/58] Run sift test from separate directory --- README.md | 6 ++++-- sift_1b.cpp | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index aab5690d..8575a681 100644 --- a/README.md +++ b/README.md @@ -242,13 +242,15 @@ Contributions are highly welcome! Please make pull requests against the `develop` branch. ### 200M SIFT test reproduction -To download and extract the bigann dataset: +To download and extract the bigann dataset (from root directory): ```bash python3 download_bigann.py ``` To compile: ```bash -cmake . +mkdir build +cd build +cmake .. 
make all ``` diff --git a/sift_1b.cpp b/sift_1b.cpp index 273c9828..2739490c 100644 --- a/sift_1b.cpp +++ b/sift_1b.cpp @@ -242,11 +242,11 @@ void sift_test1B() { size_t vecdim = 128; char path_index[1024]; char path_gt[1024]; - char *path_q = "bigann/bigann_query.bvecs"; - char *path_data = "bigann/bigann_base.bvecs"; + char *path_q = "../bigann/bigann_query.bvecs"; + char *path_data = "../bigann/bigann_base.bvecs"; sprintf(path_index, "sift1b_%dm_ef_%d_M_%d.bin", subset_size_milllions, efConstruction, M); - sprintf(path_gt, "bigann/gnd/idx_%dM.ivecs", subset_size_milllions); + sprintf(path_gt, "../bigann/gnd/idx_%dM.ivecs", subset_size_milllions); unsigned char *massb = new unsigned char[vecdim]; From 68b6257a08919ee5defa7c635b61b41eaf9431f3 Mon Sep 17 00:00:00 2001 From: Greg Roodt Date: Sun, 10 Jan 2021 20:56:11 +1100 Subject: [PATCH 51/58] PEP-517 support --- .travis.yml | 3 +-- README.md | 3 +-- pyproject.toml | 9 +++++++++ requirements.txt | 2 -- 4 files changed, 11 insertions(+), 6 deletions(-) create mode 100644 pyproject.toml delete mode 100644 requirements.txt diff --git a/.travis.yml b/.travis.yml index 893441e9..9f9494fb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,8 +30,7 @@ jobs: install: - | - pip install -r requirements.txt - python setup.py install + python -m pip install . script: - | diff --git a/README.md b/README.md index 89cce5ce..68729083 100644 --- a/README.md +++ b/README.md @@ -213,8 +213,7 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat You can install from sources: ```bash apt-get install -y python-setuptools python-pip -pip3 install pybind11 numpy setuptools -python3 setup.py install +python3 -m pip install . 
``` or you can install via pip: diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..e00b3fb8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,9 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel", + "numpy>=1.10.0", + "pybind11>=2.0", +] + +build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 81fbf192..00000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -numpy>=1.10.0 -pybind11>=2.0 \ No newline at end of file From e94c5dc2088c5cc5ebd7a94a23a5a70cf38352a5 Mon Sep 17 00:00:00 2001 From: Greg Roodt Date: Sun, 10 Jan 2021 21:36:02 +1100 Subject: [PATCH 52/58] Simplify include_dirs --- .gitignore | 15 ++++++++------- setup.py | 24 ++++++++++-------------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index c4045e98..d2cde965 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ -hnswlib.egg-info/ -build/ -dist/ -tmp/ -python_bindings/tests/__pycache__/ -*.pyd -hnswlib.cpython*.so +hnswlib.egg-info/ +build/ +dist/ +tmp/ +python_bindings/tests/__pycache__/ +*.pyd +hnswlib.cpython*.so +var/ diff --git a/setup.py b/setup.py index 002f3893..2c91c5cf 100644 --- a/setup.py +++ b/setup.py @@ -3,17 +3,25 @@ from setuptools.command.build_ext import build_ext import sys import setuptools +import pybind11 +import numpy as np __version__ = '0.4.0' + +include_dirs = [ + pybind11.get_include(), + np.get_include(), +] + # compatibility when run in python_bindings bindings_dir = 'python_bindings' if bindings_dir in os.path.basename(os.getcwd()): source_files = ['./bindings.cpp'] - include_dirs = ['../hnswlib/'] + include_dirs.extend(['../hnswlib/']) else: source_files = ['./python_bindings/bindings.cpp'] - include_dirs = ['./hnswlib/'] + include_dirs.extend(['./hnswlib/']) libraries = [] @@ -90,21 +98,9 @@ def build_extensions(self): elif ct == 'msvc': opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) - # 
extend include dirs here (don't assume numpy/pybind11 are installed when first run, since - # pip could have installed them as part of executing this script - import pybind11 - import numpy as np for ext in self.extensions: ext.extra_compile_args.extend(opts) ext.extra_link_args.extend(self.link_opts.get(ct, [])) - ext.include_dirs.extend([ - # Path to pybind11 headers - pybind11.get_include(), - pybind11.get_include(True), - - # Path to numpy headers - np.get_include() - ]) build_ext.build_extensions(self) From 467c98f03d49f5995c96a2a513bb6cf88e719f07 Mon Sep 17 00:00:00 2001 From: Greg Roodt Date: Thu, 14 Jan 2021 07:49:34 +1100 Subject: [PATCH 53/58] Remove deprecated `setup.py test` --- .travis.yml | 2 +- Makefile | 4 +- python_bindings/tests/bindings_test.py | 14 +- .../tests/bindings_test_getdata.py | 9 +- python_bindings/tests/bindings_test_labels.py | 240 +++++++++--------- python_bindings/tests/bindings_test_pickle.py | 55 ++-- python_bindings/tests/bindings_test_resize.py | 31 +-- setup.py | 10 +- 8 files changed, 176 insertions(+), 189 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9f9494fb..2c3c9960 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,4 +34,4 @@ install: script: - | - python setup.py test + python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py" diff --git a/Makefile b/Makefile index 792b246e..3e62928f 100644 --- a/Makefile +++ b/Makefile @@ -6,9 +6,9 @@ dist: python3 setup.py sdist test: - python3 setup.py test + python3 -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py" clean: rm -rf *.egg-info build dist tmp var tests/__pycache__ hnswlib.cpython*.so -.PHONY: dist \ No newline at end of file +.PHONY: dist diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py index 009b2164..d718bc3b 100644 --- a/python_bindings/tests/bindings_test.py +++ b/python_bindings/tests/bindings_test.py @@ -1,11 +1,13 @@ import os import unittest 
+import numpy as np + +import hnswlib + class RandomSelfTestCase(unittest.TestCase): def testRandomSelf(self): - import hnswlib - import numpy as np dim = 16 num_elements = 10000 @@ -41,7 +43,7 @@ def testRandomSelf(self): # Query the elements for themselves and measure recall: labels, distances = p.knn_query(data1, k=1) - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3) + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3) # Serializing and deleting the index: index_path = 'first_half.bin' @@ -61,10 +63,6 @@ def testRandomSelf(self): # Query the elements for themselves and measure recall: labels, distances = p.knn_query(data, k=1) - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3) + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3) os.remove(index_path) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/python_bindings/tests/bindings_test_getdata.py b/python_bindings/tests/bindings_test_getdata.py index 3e234518..8655d7f8 100644 --- a/python_bindings/tests/bindings_test_getdata.py +++ b/python_bindings/tests/bindings_test_getdata.py @@ -1,11 +1,13 @@ import unittest +import numpy as np + +import hnswlib + class RandomSelfTestCase(unittest.TestCase): def testGettingItems(self): print("\n**** Getting the data by label test ****\n") - import hnswlib - import numpy as np dim = 16 num_elements = 10000 @@ -42,6 +44,3 @@ def testGettingItems(self): # After adding them, all labels should be retrievable returned_items = p.get_items(labels) self.assertSequenceEqual(data.tolist(), returned_items) - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/python_bindings/tests/bindings_test_labels.py b/python_bindings/tests/bindings_test_labels.py index e44b0988..5c13e198 100644 --- a/python_bindings/tests/bindings_test_labels.py +++ 
b/python_bindings/tests/bindings_test_labels.py @@ -1,131 +1,127 @@ import os import unittest +import numpy as np -class RandomSelfTestCase(unittest.TestCase): - def testRandomSelf(self): - for idx in range(16): - print("\n**** Index save-load test ****\n") - import hnswlib - import numpy as np - - np.random.seed(idx) - dim = 16 - num_elements = 10000 - - # Generating sample data - data = np.float32(np.random.random((num_elements, dim))) - - # Declaring index - p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip - - # Initing index - # max_elements - the maximum number of elements, should be known beforehand - # (probably will be made optional in the future) - # - # ef_construction - controls index search speed/build speed tradeoff - # M - is tightly connected with internal dimensionality of the data - # stronlgy affects the memory consumption - - p.init_index(max_elements = num_elements, ef_construction = 100, M = 16) - - # Controlling the recall by setting ef: - # higher ef leads to better accuracy, but slower search - p.set_ef(100) - - p.set_num_threads(4) # by default using all available cores - - # We split the data in two batches: - data1 = data[:num_elements // 2] - data2 = data[num_elements // 2:] - - print("Adding first batch of %d elements" % (len(data1))) - p.add_items(data1) - - # Query the elements for themselves and measure recall: - labels, distances = p.knn_query(data1, k=1) - - items=p.get_items(labels) - - # Check the recall: - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3) - - # Check that the returned element data is correct: - diff_with_gt_labels=np.mean(np.abs(data1-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) - - # Serializing and deleting the index. - # We need the part to check that serialization is working properly. - - index_path = 'first_half.bin' - print("Saving index to '%s'" % index_path) - p.save_index(index_path) - print("Saved. 
Deleting...") - del p - print("Deleted") - - print("\n**** Mark delete test ****\n") - # Reiniting, loading the index - print("Reiniting") - p = hnswlib.Index(space='l2', dim=dim) - - print("\nLoading index from '%s'\n" % index_path) - p.load_index(index_path) - p.set_ef(100) - - print("Adding the second batch of %d elements" % (len(data2))) - p.add_items(data2) - - # Query the elements for themselves and measure recall: - labels, distances = p.knn_query(data, k=1) - items=p.get_items(labels) - - # Check the recall: - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3) - - # Check that the returned element data is correct: - diff_with_gt_labels=np.mean(np.abs(data-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) # deleting index. - - # Checking that all labels are returned correctly: - sorted_labels=sorted(p.get_ids_list()) - self.assertEqual(np.sum(~np.asarray(sorted_labels)==np.asarray(range(num_elements))),0) - - # Delete data1 - labels1, _ = p.knn_query(data1, k=1) - - for l in labels1: - p.mark_deleted(l[0]) - labels2, _ = p.knn_query(data2, k=1) - items=p.get_items(labels2) - diff_with_gt_labels=np.mean(np.abs(data2-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-3) # console - - - labels1_after, _ = p.knn_query(data1, k=1) - for la in labels1_after: - for lb in labels1: - if la[0] == lb[0]: - self.assertTrue(False) - print("All the data in data1 are removed") +import hnswlib - # checking saving/loading index with elements marked as deleted - del_index_path = "with_deleted.bin" - p.save_index(del_index_path) - p = hnswlib.Index(space='l2', dim=dim) - p.load_index(del_index_path) - p.set_ef(100) - labels1_after, _ = p.knn_query(data1, k=1) - for la in labels1_after: - for lb in labels1: - if la[0] == lb[0]: - self.assertTrue(False) - - os.remove(index_path) - os.remove(del_index_path) +class RandomSelfTestCase(unittest.TestCase): + def testRandomSelf(self): + for idx in range(16): + 
print("\n**** Index save-load test ****\n") + np.random.seed(idx) + dim = 16 + num_elements = 10000 + # Generating sample data + data = np.float32(np.random.random((num_elements, dim))) -if __name__ == "__main__": - unittest.main() + # Declaring index + p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip + + # Initing index + # max_elements - the maximum number of elements, should be known beforehand + # (probably will be made optional in the future) + # + # ef_construction - controls index search speed/build speed tradeoff + # M - is tightly connected with internal dimensionality of the data + # stronlgy affects the memory consumption + + p.init_index(max_elements=num_elements, ef_construction=100, M=16) + + # Controlling the recall by setting ef: + # higher ef leads to better accuracy, but slower search + p.set_ef(100) + + p.set_num_threads(4) # by default using all available cores + + # We split the data in two batches: + data1 = data[:num_elements // 2] + data2 = data[num_elements // 2:] + + print("Adding first batch of %d elements" % (len(data1))) + p.add_items(data1) + + # Query the elements for themselves and measure recall: + labels, distances = p.knn_query(data1, k=1) + + items=p.get_items(labels) + + # Check the recall: + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3) + + # Check that the returned element data is correct: + diff_with_gt_labels=np.mean(np.abs(data1-items)) + self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) + + # Serializing and deleting the index. + # We need the part to check that serialization is working properly. + + index_path = 'first_half.bin' + print("Saving index to '%s'" % index_path) + p.save_index(index_path) + print("Saved. 
Deleting...") + del p + print("Deleted") + + print("\n**** Mark delete test ****\n") + # Reiniting, loading the index + print("Reiniting") + p = hnswlib.Index(space='l2', dim=dim) + + print("\nLoading index from '%s'\n" % index_path) + p.load_index(index_path) + p.set_ef(100) + + print("Adding the second batch of %d elements" % (len(data2))) + p.add_items(data2) + + # Query the elements for themselves and measure recall: + labels, distances = p.knn_query(data, k=1) + items=p.get_items(labels) + + # Check the recall: + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3) + + # Check that the returned element data is correct: + diff_with_gt_labels=np.mean(np.abs(data-items)) + self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) # deleting index. + + # Checking that all labels are returned correctly: + sorted_labels=sorted(p.get_ids_list()) + self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0) + + # Delete data1 + labels1, _ = p.knn_query(data1, k=1) + + for l in labels1: + p.mark_deleted(l[0]) + labels2, _ = p.knn_query(data2, k=1) + items=p.get_items(labels2) + diff_with_gt_labels = np.mean(np.abs(data2-items)) + self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-3) # console + + labels1_after, _ = p.knn_query(data1, k=1) + for la in labels1_after: + for lb in labels1: + if la[0] == lb[0]: + self.assertTrue(False) + print("All the data in data1 are removed") + + # checking saving/loading index with elements marked as deleted + del_index_path = "with_deleted.bin" + p.save_index(del_index_path) + p = hnswlib.Index(space='l2', dim=dim) + p.load_index(del_index_path) + p.set_ef(100) + + labels1_after, _ = p.knn_query(data1, k=1) + for la in labels1_after: + for lb in labels1: + if la[0] == lb[0]: + self.assertTrue(False) + + os.remove(index_path) + os.remove(del_index_path) diff --git a/python_bindings/tests/bindings_test_pickle.py b/python_bindings/tests/bindings_test_pickle.py index 
6c3a826a..3a42df2e 100644 --- a/python_bindings/tests/bindings_test_pickle.py +++ b/python_bindings/tests/bindings_test_pickle.py @@ -1,28 +1,30 @@ +import pickle import unittest import numpy as np + import hnswlib -import pickle def get_dist(metric, pt1, pt2): if metric == 'l2': return np.sum((pt1-pt2)**2) elif metric == 'ip': - return 1. - np.sum(np.multiply(pt1,pt2)) + return 1. - np.sum(np.multiply(pt1, pt2)) elif metric == 'cosine': - return 1. - np.sum(np.multiply(pt1,pt2)) / (np.sum(pt1**2) * np.sum(pt2**2))**.5 + return 1. - np.sum(np.multiply(pt1, pt2)) / (np.sum(pt1**2) * np.sum(pt2**2))**.5 + def brute_force_distances(metric, items, query_items, k): - dists=np.zeros((query_items.shape[0], items.shape[0])) + dists = np.zeros((query_items.shape[0], items.shape[0])) for ii in range(items.shape[0]): for jj in range(query_items.shape[0]): - dists[jj,ii]=get_dist(metric, items[ii, :], query_items[jj, :]) + dists[jj,ii] = get_dist(metric, items[ii, :], query_items[jj, :]) labels = np.argsort(dists, axis=1) # equivalent, but faster: np.argpartition(dists, range(k), axis=1) dists = np.sort(dists, axis=1) # equivalent, but faster: np.partition(dists, range(k), axis=1) - return labels[:,:k], dists[:,:k] + return labels[:, :k], dists[:, :k] def check_ann_results(self, metric, items, query_items, k, ann_l, ann_d, err_thresh=0, total_thresh=0, dists_thresh=0): @@ -36,14 +38,15 @@ def check_ann_results(self, metric, items, query_items, k, ann_l, ann_d, err_thr if err > err_thresh: err_total += 1 - self.assertLessEqual( err_total, total_thresh, f"Error: knn_query returned incorrect labels for {err_total} items (k={k})") + self.assertLessEqual(err_total, total_thresh, f"Error: knn_query returned incorrect labels for {err_total} items (k={k})") - wrong_dists=np.sum(((brute_d- ann_d)**2.)>1e-3) + wrong_dists = np.sum(((brute_d - ann_d)**2.) 
> 1e-3) if wrong_dists > 0: - dists_count=brute_d.shape[0]*brute_d.shape[1] + dists_count = brute_d.shape[0]*brute_d.shape[1] print(f"Warning: {wrong_dists} ann distance values are different from brute-force values (total # of values={dists_count}, dists_thresh={dists_thresh})") - self.assertLessEqual( wrong_dists, dists_thresh, msg=f"Error: {wrong_dists} ann distance values are different from brute-force values") + self.assertLessEqual(wrong_dists, dists_thresh, msg=f"Error: {wrong_dists} ann distance values are different from brute-force values") + def test_space_main(self, space, dim): @@ -55,16 +58,16 @@ def test_space_main(self, space, dim): p = hnswlib.Index(space=space, dim=dim) # possible options are l2, cosine or ip print(f"Running pickle tests for {p}") - p.num_threads=self.num_threads # by default using all available cores + p.num_threads = self.num_threads # by default using all available cores - p0=pickle.loads(pickle.dumps(p)) ### pickle un-initialized Index - p.init_index(max_elements = self.num_elements, ef_construction = self.ef_construction, M = self.M) - p0.init_index(max_elements = self.num_elements, ef_construction = self.ef_construction, M = self.M) + p0 = pickle.loads(pickle.dumps(p)) ### pickle un-initialized Index + p.init_index(max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M) + p0.init_index(max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M) - p.ef=self.ef - p0.ef=self.ef + p.ef = self.ef + p0.ef = self.ef - p1=pickle.loads(pickle.dumps(p)) ### pickle Index before adding items + p1 = pickle.loads(pickle.dumps(p)) ### pickle Index before adding items ### add items to ann index p,p0,p1 p.add_items(data) @@ -78,7 +81,7 @@ def test_space_main(self, space, dim): self.assertTrue(np.allclose(p1.get_items(), p2.get_items()), "items for p1 and p2 must be same") ### Test if returned distances are same - l, d = p.knn_query(test_data, k=self.k) + l, d = p.knn_query(test_data, k=self.k) l0, d0 
= p0.knn_query(test_data, k=self.k) l1, d1 = p1.knn_query(test_data, k=self.k) l2, d2 = p2.knn_query(test_data, k=self.k) @@ -90,9 +93,9 @@ def test_space_main(self, space, dim): ### check if ann results match brute-force search ### allow for 2 labels to be missing from ann results check_ann_results(self, space, data, test_data, self.k, l, d, - err_thresh = self.label_err_thresh, - total_thresh = self.item_err_thresh, - dists_thresh = self.dists_err_thresh) + err_thresh=self.label_err_thresh, + total_thresh=self.item_err_thresh, + dists_thresh=self.dists_err_thresh) check_ann_results(self, space, data, test_data, self.k, l2, d2, err_thresh=self.label_err_thresh, @@ -118,7 +121,6 @@ def test_space_main(self, space, dim): self.assertEqual(p2.ef_construction, self.ef_construction, "incorrect value of p2.ef_construction") - class PickleUnitTests(unittest.TestCase): def setUp(self): @@ -133,10 +135,10 @@ def setUp(self): self.num_threads = 4 self.k = 25 - self.label_err_thresh=5 ### max number of missing labels allowed per test item - self.item_err_thresh=5 ### max number of items allowed with incorrect labels + self.label_err_thresh = 5 ### max number of missing labels allowed per test item + self.item_err_thresh = 5 ### max number of items allowed with incorrect labels - self.dists_err_thresh=50 ### for two matrices, d1 and d2, dists_err_thresh controls max + self.dists_err_thresh = 50 ### for two matrices, d1 and d2, dists_err_thresh controls max ### number of value pairs that are allowed to be different in d1 and d2 ### i.e., number of values that are (d1-d2)**2>1e-3 @@ -148,6 +150,3 @@ def test_l2_space(self): def test_cosine_space(self): test_space_main(self, 'cosine', 512) - -if __name__ == "__main__": - unittest.main() diff --git a/python_bindings/tests/bindings_test_resize.py b/python_bindings/tests/bindings_test_resize.py index 9411af64..3c4e3e4f 100644 --- a/python_bindings/tests/bindings_test_resize.py +++ b/python_bindings/tests/bindings_test_resize.py @@ 
-1,12 +1,15 @@ import unittest +import numpy as np + +import hnswlib + class RandomSelfTestCase(unittest.TestCase): def testRandomSelf(self): for idx in range(16): print("\n**** Index resize test ****\n") - import hnswlib - import numpy as np + np.random.seed(idx) dim = 16 num_elements = 10000 @@ -25,7 +28,7 @@ def testRandomSelf(self): # M - is tightly connected with internal dimensionality of the data # stronlgy affects the memory consumption - p.init_index(max_elements = num_elements//2, ef_construction = 100, M = 16) + p.init_index(max_elements=num_elements//2, ef_construction=100, M=16) # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search @@ -43,20 +46,18 @@ def testRandomSelf(self): # Query the elements for themselves and measure recall: labels, distances = p.knn_query(data1, k=1) - items=p.get_items(list(range(len(data1)))) + items = p.get_items(list(range(len(data1)))) # Check the recall: - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3) + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3) # Check that the returned element data is correct: - diff_with_gt_labels=np.max(np.abs(data1-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) + diff_with_gt_labels = np.max(np.abs(data1-items)) + self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) print("Resizing the index") p.resize_index(num_elements) - - print("Adding the second batch of %d elements" % (len(data2))) p.add_items(data2) @@ -65,18 +66,12 @@ def testRandomSelf(self): items=p.get_items(list(range(num_elements))) # Check the recall: - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3) + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3) # Check that the returned element data is correct: diff_with_gt_labels=np.max(np.abs(data-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) + 
self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) # Checking that all labels are returned correcly: sorted_labels=sorted(p.get_ids_list()) - self.assertEqual(np.sum(~np.asarray(sorted_labels)==np.asarray(range(num_elements))),0) - - - - -if __name__ == "__main__": - unittest.main() + self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0) diff --git a/setup.py b/setup.py index 2c91c5cf..ac587cba 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,11 @@ import os -from setuptools import setup, Extension -from setuptools.command.build_ext import build_ext import sys -import setuptools -import pybind11 + import numpy as np +import pybind11 +import setuptools +from setuptools import Extension, setup +from setuptools.command.build_ext import build_ext __version__ = '0.4.0' @@ -115,6 +116,5 @@ def build_extensions(self): ext_modules=ext_modules, install_requires=['pybind11>=2.0', 'numpy'], cmdclass={'build_ext': BuildExt}, - test_suite="python_bindings.tests", zip_safe=False, ) From 2248ab4ab3311cf6677628bb1a62bb90bee4ffd3 Mon Sep 17 00:00:00 2001 From: Greg Roodt Date: Thu, 14 Jan 2021 08:14:46 +1100 Subject: [PATCH 54/58] pybind11 isn't needed at runtime, only build time --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ac587cba..929bc211 100644 --- a/setup.py +++ b/setup.py @@ -114,7 +114,7 @@ def build_extensions(self): url='https://github.com/yurymalkov/hnsw', long_description="""hnsw""", ext_modules=ext_modules, - install_requires=['pybind11>=2.0', 'numpy'], + install_requires=['numpy'], cmdclass={'build_ext': BuildExt}, zip_safe=False, ) From 8fe02c0ddc606327d8aa77d54b3a9ddbc0eebaac Mon Sep 17 00:00:00 2001 From: Greg Roodt Date: Fri, 15 Jan 2021 15:04:27 +1100 Subject: [PATCH 55/58] Support for packaging sdist --- Makefile | 3 ++- README.md | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 3e62928f..b5e8fda9 100644 --- 
a/Makefile +++ b/Makefile @@ -3,7 +3,8 @@ pypi: dist dist: -rm dist/* - python3 setup.py sdist + pip install build + python3 -m build --sdist test: python3 -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py" diff --git a/README.md b/README.md index 68729083..79b4085b 100644 --- a/README.md +++ b/README.md @@ -213,7 +213,9 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat You can install from sources: ```bash apt-get install -y python-setuptools python-pip -python3 -m pip install . +git clone git@github.com:nmslib/hnswlib.git +cd hnswlib +pip install . ``` or you can install via pip: From 73134a7521941d22a85a381886c8a6fac776588a Mon Sep 17 00:00:00 2001 From: Greg Roodt Date: Sat, 16 Jan 2021 08:21:23 +1100 Subject: [PATCH 56/58] https git clone in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 79b4085b..90105f0d 100644 --- a/README.md +++ b/README.md @@ -213,7 +213,7 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat You can install from sources: ```bash apt-get install -y python-setuptools python-pip -git clone git@github.com:nmslib/hnswlib.git +git clone https://github.com/nmslib/hnswlib.git cd hnswlib pip install . 
``` From a9153e960e0f2f72533a0f35feaa413bcd992ad6 Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Sat, 16 Jan 2021 16:54:08 +0300 Subject: [PATCH 57/58] Add license file to pypi package --- MANIFEST.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index 5a480e4f..2d71d12e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1,2 @@ -include hnswlib/*.h \ No newline at end of file +include hnswlib/*.h +include LICENSE From 14697028fc0d52b9770377a72ed63a026057bb39 Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Sun, 24 Jan 2021 23:14:45 -0800 Subject: [PATCH 58/58] bump version --- README.md | 15 +++++++++------ setup.py | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 90105f0d..8d139fdc 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,10 @@ Header-only C++ HNSW implementation with python bindings. Paper's code for the H **NEWS:** -* **Thanks to Apoorv Sharma [@apoorv-sharma](https://github.com/apoorv-sharma), hnswlib now supports true element updates (the interface remained the same, but when you the perfromance/memory should not degrade as you update the element embeddings).** + +* **hnswlib is now 0.5.0. Added support for pickling indices, support for PEP-517 and PEP-518 building, small speedups, bug and documentation fixes. 
Many thanks to [@dbespalov](https://github.com/dbespalov), [@dyashuni](https://github.com/dyashuni), [@groodt](https://github.com/groodt),[@uestc-lfs](https://github.com/uestc-lfs), [@vinnitu](https://github.com/vinnitu), [@fabiencastan](https://github.com/fabiencastan), [@JinHai-CN](https://github.com/JinHai-CN), [@js1010](https://github.com/js1010)!** + +* **Thanks to Apoorv Sharma [@apoorv-sharma](https://github.com/apoorv-sharma), hnswlib now supports true element updates (the interface remained the same, but when you the performance/memory should not degrade as you update the element embeddings).** * **Thanks to Dmitry [@2ooom](https://github.com/2ooom), hnswlib got a boost in performance for vector dimensions that are not multiple of 4** @@ -115,7 +118,7 @@ data_labels = np.arange(num_elements) # Declaring index p = hnswlib.Index(space = 'l2', dim = dim) # possible options are l2, cosine or ip -# Initing index - the maximum number of elements should be known beforehand +# Initializing index - the maximum number of elements should be known beforehand p.init_index(max_elements = num_elements, ef_construction = 200, M = 16) # Element insertion (can be called several times): @@ -129,7 +132,7 @@ labels, distances = p.knn_query(data, k = 1) # Index objects support pickling # WARNING: serialization via pickle.dumps(p) or p.__getstate__() is NOT thread-safe with p.add_items method! 
-# Note: ef parameter is included in serialization; random number generator is initialized with random_seeed on Index load +# Note: ef parameter is included in serialization; random number generator is initialized with random_seed on Index load p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p using pickle round-trip ### Index parameters are exposed as class properties: @@ -158,7 +161,7 @@ data2 = data[num_elements // 2:] # Declaring index p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip -# Initing index +# Initializing index # max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded # during insertion of an element. # The capacity can be increased by saving/loading the index, see below. @@ -192,7 +195,7 @@ print("Saving index to '%s'" % index_path) p.save_index("first_half.bin") del p -# Reiniting, loading the index +# Re-initializing, loading the index p = hnswlib.Index(space='l2', dim=dim) # the space can be changed - keeps the data, alters the distance function. print("\nLoading index from 'first_half.bin'\n") @@ -261,7 +264,7 @@ To run the test on 200M SIFT subset: ./main ``` -The size of the bigann subset (in millions) is controlled by the variable **subset_size_milllions** hardcoded in **sift_1b.cpp**. +The size of the BigANN subset (in millions) is controlled by the variable **subset_size_millions** hardcoded in **sift_1b.cpp**. ### Updates test To generate testing data (from root directory): diff --git a/setup.py b/setup.py index 929bc211..15665f31 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ from setuptools import Extension, setup from setuptools.command.build_ext import build_ext -__version__ = '0.4.0' +__version__ = '0.5.0' include_dirs = [