diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..d2cde965 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +hnswlib.egg-info/ +build/ +dist/ +tmp/ +python_bindings/tests/__pycache__/ +*.pyd +hnswlib.cpython*.so +var/ diff --git a/.travis.yml b/.travis.yml index 6b194926..2c3c9960 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,16 +1,37 @@ language: python -matrix: +jobs: include: - - python: 3.6 - - python: 3.7 + - name: Linux Python 3.6 + os: linux + python: 3.6 + + - name: Linux Python 3.7 + os: linux + python: 3.7 + + - name: Windows Python 3.6 + os: windows + language: shell # 'language: python' is an error on Travis CI Windows + before_install: + - choco install python --version 3.6.0 + - python -m pip install --upgrade pip + - python --version + env: PATH=/c/Python36:/c/Python36/Scripts:$PATH + + - name: Windows Python 3.7 + os: windows + language: shell # 'language: python' is an error on Travis CI Windows + before_install: + - choco install python --version 3.7.0 + - python -m pip install --upgrade pip + - python --version + env: PATH=/c/Python37:/c/Python37/Scripts:$PATH + install: - | - cd python_bindings - pip install -r requirements.txt - python setup.py install + python -m pip install . script: - | - cd python_bindings - python setup.py test + python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py" diff --git a/CMakeLists.txt b/CMakeLists.txt index ebee6e6c..31935e0e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,4 +23,6 @@ endif() add_executable(test_updates examples/updates_test.cpp) +add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp) + target_link_libraries(main sift_test) diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..2d71d12e --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include hnswlib/*.h +include LICENSE diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..b5e8fda9 --- /dev/null +++ b/Makefile @@ -0,0 +1,15 @@ +pypi: dist + twine upload dist/* + +dist: + -rm dist/* + pip install build + python3 -m build --sdist + +test: + python3 -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py" + +clean: + rm -rf *.egg-info build dist tmp var tests/__pycache__ hnswlib.cpython*.so + +.PHONY: dist diff --git a/README.md b/README.md index 559c5dfd..8d139fdc 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,12 @@ Header-only C++ HNSW implementation with python bindings. Paper's code for the H **NEWS:** -* **Thanks to Apoorv Sharma [@apoorv-sharma](https://github.com/apoorv-sharma), hnswlib now supports true element updates (the interface remained the same, but when you the perfromance/memory should not degrade as you update the element embeddinds).** -* **Thanks to Dmitry [@2ooom](https://github.com/2ooom), hnswlib got a boost in performance for vector dimensions that are not mutiple of 4** +* **hnswlib is now 0.5.0. Added support for pickling indices, support for PEP-517 and PEP-518 building, small speedups, bug and documentation fixes. 
Many thanks to [@dbespalov](https://github.com/dbespalov), [@dyashuni](https://github.com/dyashuni), [@groodt](https://github.com/groodt), [@uestc-lfs](https://github.com/uestc-lfs), [@vinnitu](https://github.com/vinnitu), [@fabiencastan](https://github.com/fabiencastan), [@JinHai-CN](https://github.com/JinHai-CN), [@js1010](https://github.com/js1010)!**
+
+* **Thanks to Apoorv Sharma [@apoorv-sharma](https://github.com/apoorv-sharma), hnswlib now supports true element updates (the interface remained the same, but the performance/memory should not degrade as you update the element embeddings).**
+
+* **Thanks to Dmitry [@2ooom](https://github.com/2ooom), hnswlib got a boost in performance for vector dimensions that are not a multiple of 4**

 * **Thanks to Louis Abraham ([@louisabraham](https://github.com/louisabraham)) hnswlib can now be installed via pip!**

@@ -37,7 +40,7 @@ For other spaces use the nmslib library https://github.com/nmslib/nmslib.
 #### Short API description
 * `hnswlib.Index(space, dim)` creates a non-initialized HNSW index in space `space` with integer dimension `dim`.

-Index methods:
+`hnswlib.Index` methods:
 * `init_index(max_elements, ef_construction = 200, M = 16, random_seed = 100)` initializes the index with no elements.
     * `max_elements` defines the maximum number of elements that can be stored in the structure (can be increased/shrunk).
     * `ef_construction` defines a construction time/accuracy trade-off (see [ALGO_PARAMS.md](ALGO_PARAMS.md)).
@@ -49,14 +52,14 @@ Index methods:
     * `data_labels` specifies the labels for the data. If the index already has elements with the same labels, their features will be updated. Note that the update procedure is slower than insertion of a new element, but more memory- and query-efficient.
     * Thread-safe with other `add_items` calls, but not with `knn_query`.

-* `mark_deleted(data_label)` - marks the element as deleted, so it will be ommited from search results.
+* `mark_deleted(data_label)` - marks the element as deleted, so it will be omitted from search results.

 * `resize_index(new_size)` - changes the maximum capacity of the index. Not thread safe with `add_items` and `knn_query`.

 * `set_ef(ef)` - sets the query time accuracy/speed trade-off, defined by the `ef` parameter (see [ALGO_PARAMS.md](ALGO_PARAMS.md)). Note that the parameter is currently not saved along with the index, so you need to set it manually after loading.

-* `knn_query(data, k = 1, num_threads = -1)` make a batch query for `k` closests elements for each element of the
+* `knn_query(data, k = 1, num_threads = -1)` makes a batch query for `k` closest elements for each element of the
 `data` (shape: `N*dim`). Returns two numpy arrays (labels and distances), each of shape `N*k`.
     * `num_threads` sets the number of CPU threads to use (-1 means use default).
     * Thread-safe with other `knn_query` calls, but not with `add_items`.

@@ -76,14 +79,34 @@ Index methods:
 * `get_current_count()` - returns the current number of elements stored in the index
-
-
+Read-only properties of the `hnswlib.Index` class (see the example right after this list):
+
+* `space` - name of the space (can be one of "l2", "ip", or "cosine").
+
+* `dim` - dimensionality of the space.
+
+* `M` - parameter that defines the maximum number of outgoing connections in the graph.
+
+* `ef_construction` - parameter that controls the speed/accuracy trade-off during index construction.
+
+* `max_elements` - current capacity of the index. Equivalent to `p.get_max_elements()`.
+
+* `element_count` - number of items in the index. Equivalent to `p.get_current_count()`.
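For instance, the read-only properties mirror the corresponding getter methods (a minimal sketch; the index setup follows the examples below):

```python
import hnswlib

p = hnswlib.Index(space='l2', dim=16)
p.init_index(max_elements=1000, ef_construction=200, M=16)

# Read-only properties and their getter equivalents:
assert p.max_elements == p.get_max_elements()
assert p.element_count == p.get_current_count()
print(p.space, p.dim, p.M, p.ef_construction)
```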
+ +Properties of `hnswlib.Index` that support reading and writing: + +* `ef` - parameter controlling query time/accuracy trade-off. + +* `num_threads` - default number of threads to use in `add_items` or `knn_query`. Note that calling `p.set_num_threads(3)` is equivalent to `p.num_threads=3`. + + #### Python bindings examples ```python import hnswlib import numpy as np +import pickle dim = 128 num_elements = 10000 @@ -95,7 +118,7 @@ data_labels = np.arange(num_elements) # Declaring index p = hnswlib.Index(space = 'l2', dim = dim) # possible options are l2, cosine or ip -# Initing index - the maximum number of elements should be known beforehand +# Initializing index - the maximum number of elements should be known beforehand p.init_index(max_elements = num_elements, ef_construction = 200, M = 16) # Element insertion (can be called several times): @@ -106,6 +129,18 @@ p.set_ef(50) # ef should always be > k # Query dataset, k - number of closest elements (returns 2 numpy arrays) labels, distances = p.knn_query(data, k = 1) + +# Index objects support pickling +# WARNING: serialization via pickle.dumps(p) or p.__getstate__() is NOT thread-safe with p.add_items method! +# Note: ef parameter is included in serialization; random number generator is initialized with random_seed on Index load +p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p using pickle round-trip + +### Index parameters are exposed as class properties: +print(f"Parameters passed to constructor: space={p_copy.space}, dim={p_copy.dim}") +print(f"Index construction: M={p_copy.M}, ef_construction={p_copy.ef_construction}") +print(f"Index size is {p_copy.element_count} and index capacity is {p_copy.max_elements}") +print(f"Search speed/quality trade-off parameter: ef={p_copy.ef}") + ``` An example with updates after serialization/deserialization: @@ -126,7 +161,7 @@ data2 = data[num_elements // 2:] # Declaring index p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip -# Initing index +# Initializing index # max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded # during insertion of an element. # The capacity can be increased by saving/loading the index, see below. @@ -160,7 +195,7 @@ print("Saving index to '%s'" % index_path) p.save_index("first_half.bin") del p -# Reiniting, loading the index +# Re-initializing, loading the index p = hnswlib.Index(space='l2', dim=dim) # the space can be changed - keeps the data, alters the distance function. print("\nLoading index from 'first_half.bin'\n") @@ -181,9 +216,9 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat You can install from sources: ```bash apt-get install -y python-setuptools python-pip -pip3 install pybind11 numpy setuptools -cd python_bindings -python3 setup.py install +git clone https://github.com/nmslib/hnswlib.git +cd hnswlib +pip install . 
``` or you can install via pip: @@ -191,7 +226,7 @@ or you can install via pip: ### Other implementations * Non-metric space library (nmslib) - main library(python, C++), supports exotic distances: https://github.com/nmslib/nmslib -* Faiss libary by facebook, uses own HNSW implementation for coarse quantization (python, C++): +* Faiss library by facebook, uses own HNSW implementation for coarse quantization (python, C++): https://github.com/facebookresearch/faiss * Code for the paper ["Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors"](https://arxiv.org/abs/1802.02422) @@ -203,7 +238,8 @@ https://github.com/dbaranchuk/ivf-hnsw * Python implementation (as a part of the clustering code by by Matteo Dell'Amico): https://github.com/matteodellamico/flexible-clustering * Java implementation: https://github.com/jelmerk/hnswlib * Java bindings using Java Native Access: https://github.com/stepstone-tech/hnswlib-jna -* .Net implementation: https://github.com/microsoft/HNSW.Net +* .Net implementation: https://github.com/microsoft/HNSW.Net +* CUDA implementation: https://github.com/js1010/cuhnsw ### Contributing to the repository Contributions are highly welcome! @@ -211,13 +247,15 @@ Contributions are highly welcome! Please make pull requests against the `develop` branch. ### 200M SIFT test reproduction -To download and extract the bigann dataset: +To download and extract the bigann dataset (from root directory): ```bash python3 download_bigann.py ``` To compile: ```bash -cmake . +mkdir build +cd build +cmake .. make all ``` @@ -226,7 +264,7 @@ To run the test on 200M SIFT subset: ./main ``` -The size of the bigann subset (in millions) is controlled by the variable **subset_size_milllions** hardcoded in **sift_1b.cpp**. +The size of the BigANN subset (in millions) is controlled by the variable **subset_size_millions** hardcoded in **sift_1b.cpp**. 
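For example, to test on a smaller subset you would change that constant before compiling (an illustrative sketch; the exact declaration in **sift_1b.cpp** may differ):

```cpp
// In sift_1b.cpp: size of the BigANN subset to load, in millions of vectors.
// The repository default is 200; e.g. set it to 100 for a 100M run.
const size_t subset_size_millions = 100;
```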
### Updates test
To generate testing data (from root directory):
diff --git a/examples/searchKnnCloserFirst_test.cpp b/examples/searchKnnCloserFirst_test.cpp
new file mode 100644
index 00000000..cc1392c8
--- /dev/null
+++ b/examples/searchKnnCloserFirst_test.cpp
@@ -0,0 +1,84 @@
+// This is a test file for testing the interface
+// >>> virtual std::vector<std::pair<dist_t, labeltype>>
+// >>>    searchKnnCloserFirst(const void* query_data, size_t k) const;
+// of class AlgorithmInterface
+
+#include "../hnswlib/hnswlib.h"
+
+#include <assert.h>
+
+#include <vector>
+#include <iostream>
+
+namespace
+{
+
+using idx_t = hnswlib::labeltype;
+
+void test() {
+    int d = 4;
+    idx_t n = 100;
+    idx_t nq = 10;
+    size_t k = 10;
+
+    std::vector<float> data(n * d);
+    std::vector<float> query(nq * d);
+
+    std::mt19937 rng;
+    rng.seed(47);
+    std::uniform_real_distribution<> distrib;
+
+    for (idx_t i = 0; i < n * d; ++i) {
+        data[i] = distrib(rng);
+    }
+    for (idx_t i = 0; i < nq * d; ++i) {
+        query[i] = distrib(rng);
+    }
+
+    hnswlib::L2Space space(d);
+    hnswlib::AlgorithmInterface<float>* alg_brute = new hnswlib::BruteforceSearch<float>(&space, 2 * n);
+    hnswlib::AlgorithmInterface<float>* alg_hnsw = new hnswlib::HierarchicalNSW<float>(&space, 2 * n);
+
+    for (size_t i = 0; i < n; ++i) {
+        alg_brute->addPoint(data.data() + d * i, i);
+        alg_hnsw->addPoint(data.data() + d * i, i);
+    }
+
+    // test searchKnnCloserFirst of BruteforceSearch
+    for (size_t j = 0; j < nq; ++j) {
+        const void* p = query.data() + j * d;
+        auto gd = alg_brute->searchKnn(p, k);
+        auto res = alg_brute->searchKnnCloserFirst(p, k);
+        assert(gd.size() == res.size());
+        size_t t = gd.size();
+        while (!gd.empty()) {
+            assert(gd.top() == res[--t]);
+            gd.pop();
+        }
+    }
+    for (size_t j = 0; j < nq; ++j) {
+        const void* p = query.data() + j * d;
+        auto gd = alg_hnsw->searchKnn(p, k);
+        auto res = alg_hnsw->searchKnnCloserFirst(p, k);
+        assert(gd.size() == res.size());
+        size_t t = gd.size();
+        while (!gd.empty()) {
+            assert(gd.top() == res[--t]);
+            gd.pop();
+        }
+    }
+
+    delete alg_brute;
+    delete alg_hnsw;
+}
+
+} // namespace
+
+int main() {
+    std::cout << "Testing ..."
<< std::endl;
+    test();
+    std::cout << "Test ok" << std::endl;
+
+    return 0;
+}
diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h
index 5b1bd655..24260400 100644
--- a/hnswlib/bruteforce.h
+++ b/hnswlib/bruteforce.h
@@ -111,24 +111,6 @@ namespace hnswlib {
             return topResults;
         };

-        template <typename Comp>
-        std::vector<std::pair<dist_t, labeltype>>
-        searchKnn(const void* query_data, size_t k, Comp comp) {
-            std::vector<std::pair<dist_t, labeltype>> result;
-            if (cur_element_count == 0) return result;
-
-            auto ret = searchKnn(query_data, k);
-
-            while (!ret.empty()) {
-                result.push_back(ret.top());
-                ret.pop();
-            }
-
-            std::sort(result.begin(), result.end(), comp);
-
-            return result;
-        }
-
         void saveIndex(const std::string &location) {
             std::ofstream output(location, std::ios::binary);
             std::streampos position;
diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h
index 97bdcd18..a2f72dc7 100644
--- a/hnswlib/hnswalg.h
+++ b/hnswlib/hnswalg.h
@@ -5,10 +5,10 @@
 #include <atomic>
 #include <random>
 #include <stdlib.h>
+#include <assert.h>
 #include <unordered_set>
 #include <list>
-
 namespace hnswlib {
     typedef unsigned int tableint;
     typedef unsigned int linklistsizeint;
@@ -26,7 +26,7 @@ namespace hnswlib {
         }

         HierarchicalNSW(SpaceInterface<dist_t> *s, size_t max_elements, size_t M = 16, size_t ef_construction = 200, size_t random_seed = 100) :
-                link_list_locks_(max_elements), element_levels_(max_elements), link_list_update_locks_(max_update_element_locks) {
+                link_list_locks_(max_elements), link_list_update_locks_(max_update_element_locks), element_levels_(max_elements) {
             max_elements_ = max_elements;

             has_deletions_=false;
@@ -406,7 +406,7 @@
                 top_candidates.pop();
             }

-            tableint next_closest_entry_point = selectedNeighbors[0];
+            tableint next_closest_entry_point = selectedNeighbors.back();

             {
                 linklistsizeint *ll_cur;
@@ -636,7 +636,6 @@
             if (!input.is_open())
                 throw std::runtime_error("Cannot open file");

-            // get file size:
             input.seekg(0,input.end);
             std::streampos total_filesize=input.tellg();
@@ -868,8 +867,8 @@
//                    continue;

                    std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> candidates;
-                    int size = sCand.find(neigh) == sCand.end() ? sCand.size() : sCand.size() - 1;
-                    int elementsToKeep = std::min(int(ef_construction_), size);
+                    size_t size = sCand.find(neigh) == sCand.end() ? sCand.size() : sCand.size() - 1; // sCand guaranteed to have size >= 1
+                    size_t elementsToKeep = std::min(ef_construction_, size);
                    for (auto&& cand : sCand) {
                        if (cand == neigh)
                            continue;
@@ -892,7 +891,7 @@
                        std::unique_lock<std::mutex> lock(link_list_locks_[neigh]);
                        linklistsizeint *ll_cur;
                        ll_cur = get_linklist_at_level(neigh, layer);
-                        int candSize = candidates.size();
+                        size_t candSize = candidates.size();
                        setListCount(ll_cur, candSize);
                        tableint *data = (tableint *) (ll_cur + 1);
                        for (size_t idx = 0; idx < candSize; idx++) {
@@ -1136,7 +1135,7 @@
            }

            std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> top_candidates;
-            if (has_deletions_) {
+            if (has_deletions_) {
                top_candidates=searchBaseLayerST<true,true>(
                        currObj, query_data, std::max(ef_, k));
            }
@@ -1156,24 +1155,6 @@
            return result;
        };

-        template <typename Comp>
-        std::vector<std::pair<dist_t, labeltype>>
-        searchKnn(const void* query_data, size_t k, Comp comp) {
-            std::vector<std::pair<dist_t, labeltype>> result;
-            if (cur_element_count == 0) return result;
-
-            auto ret = searchKnn(query_data, k);
-
-            while (!ret.empty()) {
-                result.push_back(ret.top());
-                ret.pop();
-            }
-
-            std::sort(result.begin(), result.end(), comp);
-
-            return result;
-        }
-
        void checkIntegrity(){
            int connections_checked=0;
            std::vector<int> inbound_connections_num(cur_element_count,0);
@@ -1185,19 +1166,19 @@
                std::unordered_set<tableint> s;
                for (int j=0; j<size; j++){
                    assert(data[j] > 0);
-                    assert(data[j] < cur_element_count);
+                    assert(data[j] < cur_element_count);
                    assert (data[j] != i);
                    inbound_connections_num[data[j]]++;
                    s.insert(data[j]);
                    connections_checked++;
-
+
                }
                assert(s.size() == size);
            }
        }
        if(cur_element_count > 1){
            int min1=inbound_connections_num[0], max1=inbound_connections_num[0];
-            for(int i=0; i < cur_element_count; i++){
+            for(int i=0; i < cur_element_count; i++){
                assert(inbound_connections_num[i] > 0);
                min1=std::min(inbound_connections_num[i],min1);
                max1=std::max(inbound_connections_num[i],max1);
@@ -1205,7 +1186,7 @@
            std::cout << "Min inbound: " << min1 << ", Max inbound:" << max1 << "\n";
        }
        std::cout << "integrity ok, checked " << connections_checked << " connections\n";
-
+
    }
};
diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h
index c26f80b5..9409c388 100644
--- a/hnswlib/hnswlib.h
+++ b/hnswlib/hnswlib.h
@@ -71,14 +71,34 @@ namespace hnswlib {
    public:
        virtual void addPoint(const void *datapoint, labeltype label)=0;
        virtual std::priority_queue<std::pair<dist_t, labeltype>> searchKnn(const void *, size_t) const = 0;
-        template <typename Comp>
-        std::vector<std::pair<dist_t, labeltype>> searchKnn(const void*, size_t, Comp) {
-        }
+
+        // Returns the k nearest neighbors in the order of closest first
+        virtual std::vector<std::pair<dist_t, labeltype>>
+            searchKnnCloserFirst(const void* query_data, size_t k) const;
+
        virtual void saveIndex(const std::string &location)=0;
        virtual ~AlgorithmInterface(){
        }
    };

+    template<typename dist_t>
+    std::vector<std::pair<dist_t, labeltype>>
+    AlgorithmInterface<dist_t>::searchKnnCloserFirst(const void* query_data, size_t k) const {
+        std::vector<std::pair<dist_t, labeltype>> result;
+
+        // here searchKnn returns the result in the order of farthest first
+        auto ret = searchKnn(query_data, k);
+        {
+            size_t sz = ret.size();
+            result.resize(sz);
+            while (!ret.empty()) {
+                result[--sz] = ret.top();
+                ret.pop();
+            }
+        }
+
+        return result;
+    }
}
diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h
index bc00af72..e86e13b0 100644
--- a/hnswlib/space_l2.h
+++ b/hnswlib/space_l2.h
@@ -204,7 +204,7 @@
    };

    static int
-    L2SqrI(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) {
+    L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) {
        size_t
qty = *((size_t *) qty_ptr); int res = 0; @@ -226,12 +226,23 @@ namespace hnswlib { res += ((*a) - (*b)) * ((*a) - (*b)); a++; b++; + } + return (res); + } + static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) { + size_t qty = *((size_t*)qty_ptr); + int res = 0; + unsigned char* a = (unsigned char*)pVect1; + unsigned char* b = (unsigned char*)pVect2; + for(size_t i = 0; i < qty; i++) + { + res += ((*a) - (*b)) * ((*a) - (*b)); + a++; + b++; } - return (res); - } class L2SpaceI : public SpaceInterface { @@ -241,7 +252,12 @@ namespace hnswlib { size_t dim_; public: L2SpaceI(size_t dim) { - fstdistfunc_ = L2SqrI; + if(dim % 4 == 0) { + fstdistfunc_ = L2SqrI4x; + } + else { + fstdistfunc_ = L2SqrI; + } dim_ = dim; data_size_ = dim * sizeof(unsigned char); } diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..e00b3fb8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,9 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel", + "numpy>=1.10.0", + "pybind11>=2.0", +] + +build-backend = "setuptools.build_meta" diff --git a/python_bindings/MANIFEST.in b/python_bindings/MANIFEST.in deleted file mode 100644 index 5a480e4f..00000000 --- a/python_bindings/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -include hnswlib/*.h \ No newline at end of file diff --git a/python_bindings/Makefile b/python_bindings/Makefile deleted file mode 100644 index 02ec523b..00000000 --- a/python_bindings/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -pypi: dist - twine upload dist/* - -dist: - -rm dist/* - python3 setup.py sdist - -test: - python3 setup.py test - -clean: - rm -rf *.egg-info build dist var first_half.bin tests/__pycache__ hnswlib.cpython-36m-darwin.so - -.PHONY: dist \ No newline at end of file diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 1b88ca23..87e0c054 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -2,18 +2,21 @@ #include #include #include -#include "hnswlib/hnswlib.h" +#include "hnswlib.h" #include #include +#include +#include namespace py = pybind11; +using namespace pybind11::literals; // needed to bring in _a literal /* * replacement for the openmp '#pragma omp parallel for' directive * only handles a subset of functionality (no reductions etc) * Process ids from start (inclusive) to end (EXCLUSIVE) * - * The method is borrowed from nmslib + * The method is borrowed from nmslib */ template inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn) { @@ -71,27 +74,58 @@ inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn } + inline void assert_true(bool expr, const std::string & msg) { + if (expr == false) + throw std::runtime_error("Unpickle Error: "+msg); + return; + } + + + template class Index { public: - Index(const std::string &space_name, const int dim) : - space_name(space_name), dim(dim) { - normalize=false; - if(space_name=="l2") { - l2space = new hnswlib::L2Space(dim); - } - else if(space_name=="ip") { - l2space = new hnswlib::InnerProductSpace(dim); - } - else if(space_name=="cosine") { - l2space = new hnswlib::InnerProductSpace(dim); - normalize=true; - } - appr_alg = NULL; - ep_added = true; - index_inited = false; - num_threads_default = std::thread::hardware_concurrency(); + Index(const std::string &space_name, const int dim) : + space_name(space_name), dim(dim) { + normalize=false; + if(space_name=="l2") { + l2space = new hnswlib::L2Space(dim); + } + else if(space_name=="ip") { + l2space 
= new hnswlib::InnerProductSpace(dim); + } + else if(space_name=="cosine") { + l2space = new hnswlib::InnerProductSpace(dim); + normalize=true; } + appr_alg = NULL; + ep_added = true; + index_inited = false; + num_threads_default = std::thread::hardware_concurrency(); + + default_ef=10; + } + + static const int ser_version = 1; // serialization version + + std::string space_name; + int dim; + size_t seed; + size_t default_ef; + + bool index_inited; + bool ep_added; + bool normalize; + int num_threads_default; + hnswlib::labeltype cur_l; + hnswlib::HierarchicalNSW *appr_alg; + hnswlib::SpaceInterface *l2space; + + ~Index() { + delete l2space; + if (appr_alg) + delete appr_alg; + } void init_new_index(const size_t maxElements, const size_t M, const size_t efConstruction, const size_t random_seed) { if (appr_alg) { @@ -101,19 +135,17 @@ class Index { appr_alg = new hnswlib::HierarchicalNSW(l2space, maxElements, M, efConstruction, random_seed); index_inited = true; ep_added = false; + appr_alg->ef_ = default_ef; + seed=random_seed; } + void set_ef(size_t ef) { + default_ef=ef; + if (appr_alg) appr_alg->ef_ = ef; } - size_t get_ef_construction() { - return appr_alg->ef_construction_; - } - - size_t get_M() { - return appr_alg->M_; - } void set_num_threads(int num_threads) { this->num_threads_default = num_threads; @@ -124,21 +156,22 @@ class Index { } void loadIndex(const std::string &path_to_index, size_t max_elements) { - if (appr_alg) { - std::cerr<<"Warning: Calling load_index for an already inited index. Old index is being deallocated."; - delete appr_alg; - } - appr_alg = new hnswlib::HierarchicalNSW(l2space, path_to_index, false, max_elements); - cur_l = appr_alg->cur_element_count; + if (appr_alg) { + std::cerr<<"Warning: Calling load_index for an already inited index. Old index is being deallocated."; + delete appr_alg; + } + appr_alg = new hnswlib::HierarchicalNSW(l2space, path_to_index, false, max_elements); + cur_l = appr_alg->cur_element_count; + } + + void normalize_vector(float *data, float *norm_array){ + float norm=0.0f; + for(int i=0;i items(input); @@ -162,7 +195,6 @@ class Index { throw std::runtime_error("wrong dimensionality of the vectors"); // avoid using threads when the number of searches is small: - if(rows<=num_threads*4){ num_threads=1; } @@ -189,20 +221,19 @@ class Index { { - int start = 0; - if (!ep_added) { - size_t id = ids.size() ? ids.at(0) : (cur_l); - float *vector_data=(float *) items.data(0); - std::vector norm_array(dim); - if(normalize){ - normalize_vector(vector_data, norm_array.data()); - vector_data = norm_array.data(); - - } - appr_alg->addPoint((void *) vector_data, (size_t) id); - start = 1; - ep_added = true; + int start = 0; + if (!ep_added) { + size_t id = ids.size() ? ids.at(0) : (cur_l); + float *vector_data=(float *) items.data(0); + std::vector norm_array(dim); + if(normalize){ + normalize_vector(vector_data, norm_array.data()); + vector_data = norm_array.data(); } + appr_alg->addPoint((void *) vector_data, (size_t) id); + start = 1; + ep_added = true; + } py::gil_scoped_release l; if(normalize==false) { @@ -214,7 +245,7 @@ class Index { std::vector norm_array(num_threads * dim); ParallelFor(start, rows, num_threads, [&](size_t row, size_t threadId) { // normalize vector: - size_t start_idx = threadId * dim; + size_t start_idx = threadId * dim; normalize_vector((float *) items.data(row), (norm_array.data()+start_idx)); size_t id = ids.size() ? 
ids.at(row) : (cur_l+row); @@ -254,6 +285,255 @@ class Index { return ids; } + + py::dict getAnnData() const { /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + + + + std::unique_lock templock(appr_alg->global); + + unsigned int level0_npy_size = appr_alg->cur_element_count * appr_alg->size_data_per_element_; + unsigned int link_npy_size = 0; + std::vector link_npy_offsets(appr_alg->cur_element_count); + + for (size_t i = 0; i < appr_alg->cur_element_count; i++){ + unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; + link_npy_offsets[i]=link_npy_size; + if (linkListSize) + link_npy_size += linkListSize; + } + + char* data_level0_npy = (char *) malloc(level0_npy_size); + char* link_list_npy = (char *) malloc(link_npy_size); + int* element_levels_npy = (int *) malloc(appr_alg->element_levels_.size()*sizeof(int)); + + hnswlib::labeltype* label_lookup_key_npy = (hnswlib::labeltype *) malloc(appr_alg->label_lookup_.size()*sizeof(hnswlib::labeltype)); + hnswlib::tableint* label_lookup_val_npy = (hnswlib::tableint *) malloc(appr_alg->label_lookup_.size()*sizeof(hnswlib::tableint)); + + memset(label_lookup_key_npy, -1, appr_alg->label_lookup_.size()*sizeof(hnswlib::labeltype)); + memset(label_lookup_val_npy, -1, appr_alg->label_lookup_.size()*sizeof(hnswlib::tableint)); + + size_t idx=0; + for ( auto it = appr_alg->label_lookup_.begin(); it != appr_alg->label_lookup_.end(); ++it ){ + label_lookup_key_npy[idx]= it->first; + label_lookup_val_npy[idx]= it->second; + idx++; + } + + memset(link_list_npy, 0, link_npy_size); + + memcpy(data_level0_npy, appr_alg->data_level0_memory_, level0_npy_size); + memcpy(element_levels_npy, appr_alg->element_levels_.data(), appr_alg->element_levels_.size() * sizeof(int)); + + for (size_t i = 0; i < appr_alg->cur_element_count; i++){ + unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? 
appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; + if (linkListSize){ + memcpy(link_list_npy+link_npy_offsets[i], appr_alg->linkLists_[i], linkListSize); + } + } + + py::capsule free_when_done_l0(data_level0_npy, [](void *f) { + delete[] f; + }); + py::capsule free_when_done_lvl(element_levels_npy, [](void *f) { + delete[] f; + }); + py::capsule free_when_done_lb(label_lookup_key_npy, [](void *f) { + delete[] f; + }); + py::capsule free_when_done_id(label_lookup_val_npy, [](void *f) { + delete[] f; + }); + py::capsule free_when_done_ll(link_list_npy, [](void *f) { + delete[] f; + }); + + /* TODO: serialize state of random generators appr_alg->level_generator_ and appr_alg->update_probability_generator_ */ + /* for full reproducibility / to avoid re-initializing generators inside Index::createFromParams */ + + return py::dict( + "offset_level0"_a=appr_alg->offsetLevel0_, + "max_elements"_a=appr_alg->max_elements_, + "cur_element_count"_a=appr_alg->cur_element_count, + "size_data_per_element"_a=appr_alg->size_data_per_element_, + "label_offset"_a=appr_alg->label_offset_, + "offset_data"_a=appr_alg->offsetData_, + "max_level"_a=appr_alg->maxlevel_, + "enterpoint_node"_a=appr_alg->enterpoint_node_, + "max_M"_a=appr_alg->maxM_, + "max_M0"_a=appr_alg->maxM0_, + "M"_a=appr_alg->M_, + "mult"_a=appr_alg->mult_, + "ef_construction"_a=appr_alg->ef_construction_, + "ef"_a=appr_alg->ef_, + "has_deletions"_a=appr_alg->has_deletions_, + "size_links_per_element"_a=appr_alg->size_links_per_element_, + + "label_lookup_external"_a=py::array_t( + {appr_alg->label_lookup_.size()}, // shape + {sizeof(hnswlib::labeltype)}, // C-style contiguous strides for double + label_lookup_key_npy, // the data pointer + free_when_done_lb), + + "label_lookup_internal"_a=py::array_t( + {appr_alg->label_lookup_.size()}, // shape + {sizeof(hnswlib::tableint)}, // C-style contiguous strides for double + label_lookup_val_npy, // the data pointer + free_when_done_id), + + "element_levels"_a=py::array_t( + {appr_alg->element_levels_.size()}, // shape + {sizeof(int)}, // C-style contiguous strides for double + element_levels_npy, // the data pointer + free_when_done_lvl), + + // linkLists_,element_levels_,data_level0_memory_ + "data_level0"_a=py::array_t( + {level0_npy_size}, // shape + {sizeof(char)}, // C-style contiguous strides for double + data_level0_npy, // the data pointer + free_when_done_l0), + + "link_lists"_a=py::array_t( + {link_npy_size}, // shape + {sizeof(char)}, // C-style contiguous strides for double + link_list_npy, // the data pointer + free_when_done_ll) + + ); + + + } + + + py::dict getIndexParams() const { /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + auto params = py::dict( + "ser_version"_a=py::int_(Index::ser_version), //serialization version + "space"_a=space_name, + "dim"_a=dim, + "index_inited"_a=index_inited, + "ep_added"_a=ep_added, + "normalize"_a=normalize, + "num_threads"_a=num_threads_default, + "seed"_a=seed + ); + + if(index_inited == false) + return py::dict( **params, "ef"_a=default_ef); + + auto ann_params = getAnnData(); + + return py::dict(**params, **ann_params); + } + + + static Index * createFromParams(const py::dict d) { + + // check serialization version + assert_true(((int)py::int_(Index::ser_version)) >= d["ser_version"].cast(), "Invalid serialization version!"); + + auto space_name_=d["space"].cast(); + auto dim_=d["dim"].cast(); + auto index_inited_=d["index_inited"].cast(); + + Index *new_index = new Index(space_name_, dim_); + + 
/* TODO: deserialize state of random generators into new_index->level_generator_ and new_index->update_probability_generator_ */ + /* for full reproducibility / state of generators is serialized inside Index::getIndexParams */ + new_index->seed = d["seed"].cast(); + + if (index_inited_){ + new_index->appr_alg = new hnswlib::HierarchicalNSW(new_index->l2space, d["max_elements"].cast(), d["M"].cast(), d["ef_construction"].cast(), new_index->seed); + new_index->cur_l = d["cur_element_count"].cast(); + } + + new_index->index_inited = index_inited_; + new_index->ep_added=d["ep_added"].cast(); + new_index->num_threads_default=d["num_threads"].cast(); + new_index->default_ef=d["ef"].cast(); + + if (index_inited_) + new_index->setAnnData(d); + + return new_index; + } + + static Index * createFromIndex(const Index & index) { + return createFromParams(index.getIndexParams()); + } + + void setAnnData(const py::dict d) { /* WARNING: Index::setAnnData is not thread-safe with Index::addItems */ + + + std::unique_lock templock(appr_alg->global); + + assert_true(appr_alg->offsetLevel0_ == d["offset_level0"].cast(), "Invalid value of offsetLevel0_ "); + assert_true(appr_alg->max_elements_ == d["max_elements"].cast(), "Invalid value of max_elements_ "); + + appr_alg->cur_element_count = d["cur_element_count"].cast(); + + assert_true(appr_alg->size_data_per_element_ == d["size_data_per_element"].cast(), "Invalid value of size_data_per_element_ "); + assert_true(appr_alg->label_offset_ == d["label_offset"].cast(), "Invalid value of label_offset_ "); + assert_true(appr_alg->offsetData_ == d["offset_data"].cast(), "Invalid value of offsetData_ "); + + appr_alg->maxlevel_ = d["max_level"].cast(); + appr_alg->enterpoint_node_ = d["enterpoint_node"].cast(); + + assert_true(appr_alg->maxM_ == d["max_M"].cast(), "Invalid value of maxM_ "); + assert_true(appr_alg->maxM0_ == d["max_M0"].cast(), "Invalid value of maxM0_ "); + assert_true(appr_alg->M_ == d["M"].cast(), "Invalid value of M_ "); + assert_true(appr_alg->mult_ == d["mult"].cast(), "Invalid value of mult_ "); + assert_true(appr_alg->ef_construction_ == d["ef_construction"].cast(), "Invalid value of ef_construction_ "); + + appr_alg->ef_ = d["ef"].cast(); + appr_alg->has_deletions_=d["has_deletions"].cast(); + + assert_true(appr_alg->size_links_per_element_ == d["size_links_per_element"].cast(), "Invalid value of size_links_per_element_ "); + + auto label_lookup_key_npy = d["label_lookup_external"].cast >(); + auto label_lookup_val_npy = d["label_lookup_internal"].cast >(); + auto element_levels_npy = d["element_levels"].cast >(); + auto data_level0_npy = d["data_level0"].cast >(); + auto link_list_npy = d["link_lists"].cast >(); + + for (size_t i = 0; i < appr_alg->cur_element_count; i++){ + if (label_lookup_val_npy.data()[i] < 0){ + throw std::runtime_error("internal id cannot be negative!"); + } + else{ + appr_alg->label_lookup_.insert(std::make_pair(label_lookup_key_npy.data()[i], label_lookup_val_npy.data()[i])); + } + } + + memcpy(appr_alg->element_levels_.data(), element_levels_npy.data(), element_levels_npy.nbytes()); + + unsigned int link_npy_size = 0; + std::vector link_npy_offsets(appr_alg->cur_element_count); + + for (size_t i = 0; i < appr_alg->cur_element_count; i++){ + unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? 
appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; + link_npy_offsets[i]=link_npy_size; + if (linkListSize) + link_npy_size += linkListSize; + } + + memcpy(appr_alg->data_level0_memory_, data_level0_npy.data(), data_level0_npy.nbytes()); + + for (size_t i = 0; i < appr_alg->max_elements_; i++) { + unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; + if (linkListSize == 0) { + appr_alg->linkLists_[i] = nullptr; + } else { + appr_alg->linkLists_[i] = (char *) malloc(linkListSize); + if (appr_alg->linkLists_[i] == nullptr) + throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklist"); + + memcpy(appr_alg->linkLists_[i], link_list_npy.data()+link_npy_offsets[i], linkListSize); + + } + } +} + py::object knnQuery_return_numpy(py::object input, size_t k = 1, int num_threads = -1) { py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); @@ -310,7 +590,7 @@ class Index { float *data= (float *) items.data(row); size_t start_idx = threadId * dim; - normalize_vector((float *) items.data(row), (norm_array.data()+start_idx)); + normalize_vector((float *) items.data(row), (norm_array.data()+start_idx)); std::priority_queue> result = appr_alg->searchKnn( (void *) (norm_array.data()+start_idx), k); @@ -367,50 +647,69 @@ class Index { return appr_alg->cur_element_count; } - std::string space_name; - int dim; - +}; - bool index_inited; - bool ep_added; - bool normalize; - int num_threads_default; - hnswlib::labeltype cur_l; - hnswlib::HierarchicalNSW *appr_alg; - hnswlib::SpaceInterface *l2space; - ~Index() { - delete l2space; - if (appr_alg) - delete appr_alg; - } -}; PYBIND11_PLUGIN(hnswlib) { py::module m("hnswlib"); py::class_>(m, "Index") + .def(py::init(&Index::createFromParams), py::arg("params")) + /* WARNING: Index::createFromIndex is not thread-safe with Index::addItems */ + .def(py::init(&Index::createFromIndex), py::arg("index")) .def(py::init(), py::arg("space"), py::arg("dim")) - .def("init_index", &Index::init_new_index, py::arg("max_elements"), py::arg("M")=16, - py::arg("ef_construction")=200, py::arg("random_seed")=100) + .def("init_index", &Index::init_new_index, py::arg("max_elements"), py::arg("M")=16, py::arg("ef_construction")=200, py::arg("random_seed")=100) .def("knn_query", &Index::knnQuery_return_numpy, py::arg("data"), py::arg("k")=1, py::arg("num_threads")=-1) .def("add_items", &Index::addItems, py::arg("data"), py::arg("ids") = py::none(), py::arg("num_threads")=-1) .def("get_items", &Index::getDataReturnList, py::arg("ids") = py::none()) .def("get_ids_list", &Index::getIdsList) .def("set_ef", &Index::set_ef, py::arg("ef")) - .def("get_ef_construction", &Index::get_ef_construction) - .def("get_M", &Index::get_M) .def("set_num_threads", &Index::set_num_threads, py::arg("num_threads")) .def("save_index", &Index::saveIndex, py::arg("path_to_index")) .def("load_index", &Index::loadIndex, py::arg("path_to_index"), py::arg("max_elements")=0) .def("mark_deleted", &Index::markDeleted, py::arg("label")) .def("resize_index", &Index::resizeIndex, py::arg("new_size")) - .def("get_max_elements", &Index::getMaxElements) - .def("get_current_count", &Index::getCurrentCount) - .def("__repr__", - [](const Index &a) { - return ""; - } - ); + .def_readonly("space", &Index::space_name) + .def_readonly("dim", &Index::dim) + .def_readwrite("num_threads", &Index::num_threads_default) + .def_property("ef", + [](const Index & index) { + return 
index.index_inited ? index.appr_alg->ef_ : index.default_ef;
+          },
+          [](Index<float> & index, const size_t ef_) {
+            index.default_ef=ef_;
+            if (index.appr_alg)
+                index.appr_alg->ef_ = ef_;
+        })
+        .def_property_readonly("max_elements", [](const Index<float> & index) {
+            return index.index_inited ? index.appr_alg->max_elements_ : 0;
+        })
+        .def_property_readonly("element_count", [](const Index<float> & index) {
+            return index.index_inited ? index.appr_alg->cur_element_count : 0;
+        })
+        .def_property_readonly("ef_construction", [](const Index<float> & index) {
+            return index.index_inited ? index.appr_alg->ef_construction_ : 0;
+        })
+        .def_property_readonly("M", [](const Index<float> & index) {
+            return index.index_inited ? index.appr_alg->M_ : 0;
+        })
+
+        .def(py::pickle(
+            [](const Index<float> &ind) { // __getstate__
+                return py::make_tuple(ind.getIndexParams()); /* Return dict (wrapped in a tuple) that fully encodes state of the Index object */
+            },
+            [](py::tuple t) { // __setstate__
+                if (t.size() != 1)
+                    throw std::runtime_error("Invalid state!");
+
+                return Index<float>::createFromParams(t[0].cast<py::dict>());
+            }
+        ))
+
+        .def("__repr__", [](const Index<float> &a) {
+            return "<hnswlib.Index(space='" + a.space_name + "', dim=" + std::to_string(a.dim) + ")>";
+        });
+
        return m.ptr();
}
diff --git a/python_bindings/hnswlib b/python_bindings/hnswlib
deleted file mode 120000
index 236d6575..00000000
--- a/python_bindings/hnswlib
+++ /dev/null
@@ -1 +0,0 @@
-../hnswlib
\ No newline at end of file
diff --git a/python_bindings/requirements.txt b/python_bindings/requirements.txt
deleted file mode 100644
index 81fbf192..00000000
--- a/python_bindings/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-numpy>=1.10.0
-pybind11>=2.0
\ No newline at end of file
diff --git a/python_bindings/setup.py b/python_bindings/setup.py
deleted file mode 100644
index a6dfb81b..00000000
--- a/python_bindings/setup.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import os
-from setuptools import setup, Extension
-from setuptools.command.build_ext import build_ext
-import sys
-import setuptools
-
-__version__ = '0.4.0'
-
-
-source_files = ['bindings.cpp']
-
-libraries = []
-extra_objects = []
-
-
-ext_modules = [
-    Extension(
-        'hnswlib',
-        source_files,
-        # include_dirs=[os.path.join(libdir, "include")],
-        libraries=libraries,
-        language='c++',
-        extra_objects=extra_objects,
-    ),
-]
-
-
-# As of Python 3.6, CCompiler has a `has_flag` method.
-# cf http://bugs.python.org/issue26689
-def has_flag(compiler, flagname):
-    """Return a boolean indicating whether a flag name is supported on
-    the specified compiler.
-    """
-    import tempfile
-    with tempfile.NamedTemporaryFile('w', suffix='.cpp') as f:
-        f.write('int main (int argc, char **argv) { return 0; }')
-        try:
-            compiler.compile([f.name], extra_postargs=[flagname])
-        except setuptools.distutils.errors.CompileError:
-            return False
-    return True
-
-
-def cpp_flag(compiler):
-    """Return the -std=c++[11/14] compiler flag.
-    The c++14 is prefered over c++11 (when it is available).
- """ - if has_flag(compiler, '-std=c++14'): - return '-std=c++14' - elif has_flag(compiler, '-std=c++11'): - return '-std=c++11' - else: - raise RuntimeError('Unsupported compiler -- at least C++11 support ' - 'is needed!') - - -class BuildExt(build_ext): - """A custom build extension for adding compiler-specific options.""" - c_opts = { - 'msvc': ['/EHsc', '/openmp', '/O2'], - 'unix': ['-O3', '-march=native'], # , '-w' - } - link_opts = { - 'unix': [], - 'msvc': [], - } - - if sys.platform == 'darwin': - c_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] - link_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] - else: - c_opts['unix'].append("-fopenmp") - link_opts['unix'].extend(['-fopenmp', '-pthread']) - - def build_extensions(self): - ct = self.compiler.compiler_type - opts = self.c_opts.get(ct, []) - if ct == 'unix': - opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version()) - opts.append(cpp_flag(self.compiler)) - if has_flag(self.compiler, '-fvisibility=hidden'): - opts.append('-fvisibility=hidden') - elif ct == 'msvc': - opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) - - # extend include dirs here (don't assume numpy/pybind11 are installed when first run, since - # pip could have installed them as part of executing this script - import pybind11 - import numpy as np - for ext in self.extensions: - ext.extra_compile_args.extend(opts) - ext.extra_link_args.extend(self.link_opts.get(ct, [])) - ext.include_dirs.extend([ - # Path to pybind11 headers - pybind11.get_include(), - pybind11.get_include(True), - - # Path to numpy headers - np.get_include() - ]) - - build_ext.build_extensions(self) - - -setup( - name='hnswlib', - version=__version__, - description='hnswlib', - author='Yury Malkov and others', - url='https://github.com/yurymalkov/hnsw', - long_description="""hnsw""", - ext_modules=ext_modules, - install_requires=['pybind11>=2.0', 'numpy'], - cmdclass={'build_ext': BuildExt}, - test_suite="tests", - zip_safe=False, -) diff --git a/python_bindings/setup.py b/python_bindings/setup.py new file mode 120000 index 00000000..f8f80fc2 --- /dev/null +++ b/python_bindings/setup.py @@ -0,0 +1 @@ +../setup.py \ No newline at end of file diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py index afc663af..d718bc3b 100644 --- a/python_bindings/tests/bindings_test.py +++ b/python_bindings/tests/bindings_test.py @@ -1,10 +1,13 @@ +import os import unittest +import numpy as np + +import hnswlib + class RandomSelfTestCase(unittest.TestCase): def testRandomSelf(self): - import hnswlib - import numpy as np dim = 16 num_elements = 10000 @@ -40,19 +43,19 @@ def testRandomSelf(self): # Query the elements for themselves and measure recall: labels, distances = p.knn_query(data1, k=1) - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3) + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3) # Serializing and deleting the index: - index_path='first_half.bin' + index_path = 'first_half.bin' print("Saving index to '%s'" % index_path) - p.save_index("first_half.bin") + p.save_index(index_path) del p # Reiniting, loading the index p = hnswlib.Index(space='l2', dim=dim) # you can change the sa - print("\nLoading index from 'first_half.bin'\n") - p.load_index("first_half.bin") + print("\nLoading index from '%s'\n" % index_path) + p.load_index(index_path) print("Adding the second batch of %d elements" % (len(data2))) p.add_items(data2) @@ 
-60,8 +63,6 @@ def testRandomSelf(self): # Query the elements for themselves and measure recall: labels, distances = p.knn_query(data, k=1) - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3) + + os.remove(index_path) diff --git a/python_bindings/tests/bindings_test_getdata.py b/python_bindings/tests/bindings_test_getdata.py index 3e234518..8655d7f8 100644 --- a/python_bindings/tests/bindings_test_getdata.py +++ b/python_bindings/tests/bindings_test_getdata.py @@ -1,11 +1,13 @@ import unittest +import numpy as np + +import hnswlib + class RandomSelfTestCase(unittest.TestCase): def testGettingItems(self): print("\n**** Getting the data by label test ****\n") - import hnswlib - import numpy as np dim = 16 num_elements = 10000 @@ -42,6 +44,3 @@ def testGettingItems(self): # After adding them, all labels should be retrievable returned_items = p.get_items(labels) self.assertSequenceEqual(data.tolist(), returned_items) - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/python_bindings/tests/bindings_test_labels.py b/python_bindings/tests/bindings_test_labels.py index c1887bef..5c13e198 100644 --- a/python_bindings/tests/bindings_test_labels.py +++ b/python_bindings/tests/bindings_test_labels.py @@ -1,126 +1,127 @@ +import os import unittest +import numpy as np -class RandomSelfTestCase(unittest.TestCase): - def testRandomSelf(self): - for idx in range(16): - print("\n**** Index save-load test ****\n") - import hnswlib - import numpy as np - - np.random.seed(idx) - dim = 16 - num_elements = 10000 - - # Generating sample data - data = np.float32(np.random.random((num_elements, dim))) - - # Declaring index - p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip - - # Initing index - # max_elements - the maximum number of elements, should be known beforehand - # (probably will be made optional in the future) - # - # ef_construction - controls index search speed/build speed tradeoff - # M - is tightly connected with internal dimensionality of the data - # stronlgy affects the memory consumption - - p.init_index(max_elements = num_elements, ef_construction = 100, M = 16) - - # Controlling the recall by setting ef: - # higher ef leads to better accuracy, but slower search - p.set_ef(100) - - p.set_num_threads(4) # by default using all available cores - - # We split the data in two batches: - data1 = data[:num_elements // 2] - data2 = data[num_elements // 2:] - - print("Adding first batch of %d elements" % (len(data1))) - p.add_items(data1) - - # Query the elements for themselves and measure recall: - labels, distances = p.knn_query(data1, k=1) - - items=p.get_items(labels) - - # Check the recall: - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3) - - # Check that the returned element data is correct: - diff_with_gt_labels=np.mean(np.abs(data1-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) - - # Serializing and deleting the index. - # We need the part to check that serialization is working properly. - - index_path='first_half.bin' - print("Saving index to '%s'" % index_path) - p.save_index("first_half.bin") - print("Saved. 
Deleting...") - del p - print("Deleted") +import hnswlib - print("\n**** Mark delete test ****\n") - # Reiniting, loading the index - print("Reiniting") - p = hnswlib.Index(space='l2', dim=dim) - print("\nLoading index from 'first_half.bin'\n") - p.load_index("first_half.bin") - p.set_ef(100) - - print("Adding the second batch of %d elements" % (len(data2))) - p.add_items(data2) - - # Query the elements for themselves and measure recall: - labels, distances = p.knn_query(data, k=1) - items=p.get_items(labels) - - # Check the recall: - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3) - - # Check that the returned element data is correct: - diff_with_gt_labels=np.mean(np.abs(data-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) # deleting index. - - # Checking that all labels are returned correctly: - sorted_labels=sorted(p.get_ids_list()) - self.assertEqual(np.sum(~np.asarray(sorted_labels)==np.asarray(range(num_elements))),0) - - # Delete data1 - labels1, _ = p.knn_query(data1, k=1) - - for l in labels1: - p.mark_deleted(l[0]) - labels2, _ = p.knn_query(data2, k=1) - items=p.get_items(labels2) - diff_with_gt_labels=np.mean(np.abs(data2-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-3) # console - - - labels1_after, _ = p.knn_query(data1, k=1) - for la in labels1_after: - for lb in labels1: - if la[0] == lb[0]: - self.assertTrue(False) - print("All the data in data1 are removed") - - # checking saving/loading index with elements marked as deleted - p.save_index("with_deleted.bin") - p = hnswlib.Index(space='l2', dim=dim) - p.load_index("with_deleted.bin") - p.set_ef(100) - - labels1_after, _ = p.knn_query(data1, k=1) - for la in labels1_after: - for lb in labels1: - if la[0] == lb[0]: - self.assertTrue(False) +class RandomSelfTestCase(unittest.TestCase): + def testRandomSelf(self): + for idx in range(16): + print("\n**** Index save-load test ****\n") + np.random.seed(idx) + dim = 16 + num_elements = 10000 + # Generating sample data + data = np.float32(np.random.random((num_elements, dim))) -if __name__ == "__main__": - unittest.main() + # Declaring index + p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip + + # Initing index + # max_elements - the maximum number of elements, should be known beforehand + # (probably will be made optional in the future) + # + # ef_construction - controls index search speed/build speed tradeoff + # M - is tightly connected with internal dimensionality of the data + # stronlgy affects the memory consumption + + p.init_index(max_elements=num_elements, ef_construction=100, M=16) + + # Controlling the recall by setting ef: + # higher ef leads to better accuracy, but slower search + p.set_ef(100) + + p.set_num_threads(4) # by default using all available cores + + # We split the data in two batches: + data1 = data[:num_elements // 2] + data2 = data[num_elements // 2:] + + print("Adding first batch of %d elements" % (len(data1))) + p.add_items(data1) + + # Query the elements for themselves and measure recall: + labels, distances = p.knn_query(data1, k=1) + + items=p.get_items(labels) + + # Check the recall: + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3) + + # Check that the returned element data is correct: + diff_with_gt_labels=np.mean(np.abs(data1-items)) + self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) + + # Serializing and deleting the index. 
+ # We need the part to check that serialization is working properly. + + index_path = 'first_half.bin' + print("Saving index to '%s'" % index_path) + p.save_index(index_path) + print("Saved. Deleting...") + del p + print("Deleted") + + print("\n**** Mark delete test ****\n") + # Reiniting, loading the index + print("Reiniting") + p = hnswlib.Index(space='l2', dim=dim) + + print("\nLoading index from '%s'\n" % index_path) + p.load_index(index_path) + p.set_ef(100) + + print("Adding the second batch of %d elements" % (len(data2))) + p.add_items(data2) + + # Query the elements for themselves and measure recall: + labels, distances = p.knn_query(data, k=1) + items=p.get_items(labels) + + # Check the recall: + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3) + + # Check that the returned element data is correct: + diff_with_gt_labels=np.mean(np.abs(data-items)) + self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) # deleting index. + + # Checking that all labels are returned correctly: + sorted_labels=sorted(p.get_ids_list()) + self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0) + + # Delete data1 + labels1, _ = p.knn_query(data1, k=1) + + for l in labels1: + p.mark_deleted(l[0]) + labels2, _ = p.knn_query(data2, k=1) + items=p.get_items(labels2) + diff_with_gt_labels = np.mean(np.abs(data2-items)) + self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-3) # console + + labels1_after, _ = p.knn_query(data1, k=1) + for la in labels1_after: + for lb in labels1: + if la[0] == lb[0]: + self.assertTrue(False) + print("All the data in data1 are removed") + + # checking saving/loading index with elements marked as deleted + del_index_path = "with_deleted.bin" + p.save_index(del_index_path) + p = hnswlib.Index(space='l2', dim=dim) + p.load_index(del_index_path) + p.set_ef(100) + + labels1_after, _ = p.knn_query(data1, k=1) + for la in labels1_after: + for lb in labels1: + if la[0] == lb[0]: + self.assertTrue(False) + + os.remove(index_path) + os.remove(del_index_path) diff --git a/python_bindings/tests/bindings_test_pickle.py b/python_bindings/tests/bindings_test_pickle.py new file mode 100644 index 00000000..3a42df2e --- /dev/null +++ b/python_bindings/tests/bindings_test_pickle.py @@ -0,0 +1,152 @@ +import pickle +import unittest + +import numpy as np + +import hnswlib + + +def get_dist(metric, pt1, pt2): + if metric == 'l2': + return np.sum((pt1-pt2)**2) + elif metric == 'ip': + return 1. - np.sum(np.multiply(pt1, pt2)) + elif metric == 'cosine': + return 1. 
- np.sum(np.multiply(pt1, pt2)) / (np.sum(pt1**2) * np.sum(pt2**2))**.5 + + +def brute_force_distances(metric, items, query_items, k): + dists = np.zeros((query_items.shape[0], items.shape[0])) + for ii in range(items.shape[0]): + for jj in range(query_items.shape[0]): + dists[jj,ii] = get_dist(metric, items[ii, :], query_items[jj, :]) + + labels = np.argsort(dists, axis=1) # equivalent, but faster: np.argpartition(dists, range(k), axis=1) + dists = np.sort(dists, axis=1) # equivalent, but faster: np.partition(dists, range(k), axis=1) + + return labels[:, :k], dists[:, :k] + + +def check_ann_results(self, metric, items, query_items, k, ann_l, ann_d, err_thresh=0, total_thresh=0, dists_thresh=0): + brute_l, brute_d = brute_force_distances(metric, items, query_items, k) + err_total = 0 + for jj in range(query_items.shape[0]): + err = np.sum(np.isin(brute_l[jj, :], ann_l[jj, :], invert=True)) + if err > 0: + print(f"Warning: {err} labels are missing from ann results (k={k}, err_thresh={err_thresh})") + + if err > err_thresh: + err_total += 1 + + self.assertLessEqual(err_total, total_thresh, f"Error: knn_query returned incorrect labels for {err_total} items (k={k})") + + wrong_dists = np.sum(((brute_d - ann_d)**2.) > 1e-3) + if wrong_dists > 0: + dists_count = brute_d.shape[0]*brute_d.shape[1] + print(f"Warning: {wrong_dists} ann distance values are different from brute-force values (total # of values={dists_count}, dists_thresh={dists_thresh})") + + self.assertLessEqual(wrong_dists, dists_thresh, msg=f"Error: {wrong_dists} ann distance values are different from brute-force values") + + +def test_space_main(self, space, dim): + + # Generating sample data + data = np.float32(np.random.random((self.num_elements, dim))) + test_data = np.float32(np.random.random((self.num_test_elements, dim))) + + # Declaring index + p = hnswlib.Index(space=space, dim=dim) # possible options are l2, cosine or ip + print(f"Running pickle tests for {p}") + + p.num_threads = self.num_threads # by default using all available cores + + p0 = pickle.loads(pickle.dumps(p)) ### pickle un-initialized Index + p.init_index(max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M) + p0.init_index(max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M) + + p.ef = self.ef + p0.ef = self.ef + + p1 = pickle.loads(pickle.dumps(p)) ### pickle Index before adding items + + ### add items to ann index p,p0,p1 + p.add_items(data) + p1.add_items(data) + p0.add_items(data) + + p2=pickle.loads(pickle.dumps(p)) ### pickle Index before adding items + + self.assertTrue(np.allclose(p.get_items(), p0.get_items()), "items for p and p0 must be same") + self.assertTrue(np.allclose(p0.get_items(), p1.get_items()), "items for p0 and p1 must be same") + self.assertTrue(np.allclose(p1.get_items(), p2.get_items()), "items for p1 and p2 must be same") + + ### Test if returned distances are same + l, d = p.knn_query(test_data, k=self.k) + l0, d0 = p0.knn_query(test_data, k=self.k) + l1, d1 = p1.knn_query(test_data, k=self.k) + l2, d2 = p2.knn_query(test_data, k=self.k) + + self.assertLessEqual(np.sum(((d-d0)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p and p0 must match") + self.assertLessEqual(np.sum(((d0-d1)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p0 and p1 must match") + self.assertLessEqual(np.sum(((d1-d2)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p1 and p2 must match") + + ### check if ann results match brute-force search + ### 
+    check_ann_results(self, space, data, test_data, self.k, l, d,
+                      err_thresh=self.label_err_thresh,
+                      total_thresh=self.item_err_thresh,
+                      dists_thresh=self.dists_err_thresh)
+
+    check_ann_results(self, space, data, test_data, self.k, l2, d2,
+                      err_thresh=self.label_err_thresh,
+                      total_thresh=self.item_err_thresh,
+                      dists_thresh=self.dists_err_thresh)
+
+    ### Check ef parameter value
+    self.assertEqual(p.ef, self.ef, "incorrect value of p.ef")
+    self.assertEqual(p0.ef, self.ef, "incorrect value of p0.ef")
+    self.assertEqual(p1.ef, self.ef, "incorrect value of p1.ef")
+    self.assertEqual(p2.ef, self.ef, "incorrect value of p2.ef")
+
+    ### Check M parameter value
+    self.assertEqual(p.M, self.M, "incorrect value of p.M")
+    self.assertEqual(p0.M, self.M, "incorrect value of p0.M")
+    self.assertEqual(p1.M, self.M, "incorrect value of p1.M")
+    self.assertEqual(p2.M, self.M, "incorrect value of p2.M")
+
+    ### Check ef_construction parameter value
+    self.assertEqual(p.ef_construction, self.ef_construction, "incorrect value of p.ef_construction")
+    self.assertEqual(p0.ef_construction, self.ef_construction, "incorrect value of p0.ef_construction")
+    self.assertEqual(p1.ef_construction, self.ef_construction, "incorrect value of p1.ef_construction")
+    self.assertEqual(p2.ef_construction, self.ef_construction, "incorrect value of p2.ef_construction")
+
+
+class PickleUnitTests(unittest.TestCase):
+
+    def setUp(self):
+
+        self.ef_construction = 725
+        self.M = 64
+        self.ef = 725
+
+        self.num_elements = 5000
+        self.num_test_elements = 200
+
+        self.num_threads = 4
+        self.k = 25
+
+        self.label_err_thresh = 5  ### max number of missing labels allowed per test item
+        self.item_err_thresh = 5  ### max number of items allowed with incorrect labels
+
+        self.dists_err_thresh = 50  ### for two distance matrices, d1 and d2, dists_err_thresh controls the max
+                                    ### number of value pairs that are allowed to differ between d1 and d2,
+                                    ### i.e., the number of values for which (d1-d2)**2 > 1e-3
+
+    def test_inner_product_space(self):
+        test_space_main(self, 'ip', 48)
+
+    def test_l2_space(self):
+        test_space_main(self, 'l2', 153)
+
+    def test_cosine_space(self):
+        test_space_main(self, 'cosine', 512)
diff --git a/python_bindings/tests/bindings_test_resize.py b/python_bindings/tests/bindings_test_resize.py
index 9411af64..3c4e3e4f 100644
--- a/python_bindings/tests/bindings_test_resize.py
+++ b/python_bindings/tests/bindings_test_resize.py
@@ -1,12 +1,15 @@
 import unittest
 
+import numpy as np
+
+import hnswlib
+
 
 class RandomSelfTestCase(unittest.TestCase):
     def testRandomSelf(self):
         for idx in range(16):
             print("\n**** Index resize test ****\n")
-            import hnswlib
-            import numpy as np
+            np.random.seed(idx)
 
             dim = 16
             num_elements = 10000
@@ -25,7 +28,7 @@ def testRandomSelf(self):
 
             # M - is tightly connected with internal dimensionality of the data
             # stronlgy affects the memory consumption
-            p.init_index(max_elements = num_elements//2, ef_construction = 100, M = 16)
+            p.init_index(max_elements=num_elements//2, ef_construction=100, M=16)
 
             # Controlling the recall by setting ef:
             # higher ef leads to better accuracy, but slower search
@@ -43,20 +46,18 @@ def testRandomSelf(self):
 
             # Query the elements for themselves and measure recall:
             labels, distances = p.knn_query(data1, k=1)
-            items=p.get_items(list(range(len(data1))))
+            items = p.get_items(list(range(len(data1))))
 
             # Check the recall:
-            self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3)
+            self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3)
 
             # Check that the returned element data is correct:
-            diff_with_gt_labels=np.max(np.abs(data1-items))
-            self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4)
+            diff_with_gt_labels = np.max(np.abs(data1-items))
+            self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)
 
             print("Resizing the index")
             p.resize_index(num_elements)
-
-
             print("Adding the second batch of %d elements" % (len(data2)))
             p.add_items(data2)
@@ -65,18 +66,12 @@ def testRandomSelf(self):
             items=p.get_items(list(range(num_elements)))
 
             # Check the recall:
-            self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3)
+            self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3)
 
             # Check that the returned element data is correct:
             diff_with_gt_labels=np.max(np.abs(data-items))
-            self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4)
+            self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)
 
             # Checking that all labels are returned correcly:
             sorted_labels=sorted(p.get_ids_list())
-            self.assertEqual(np.sum(~np.asarray(sorted_labels)==np.asarray(range(num_elements))),0)
-
-
-
-
-if __name__ == "__main__":
-    unittest.main()
+            self.assertEqual(np.sum(np.asarray(sorted_labels) != np.asarray(range(num_elements))), 0)
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..15665f31
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,120 @@
+import os
+import sys
+
+import numpy as np
+import pybind11
+import setuptools
+from setuptools import Extension, setup
+from setuptools.command.build_ext import build_ext
+
+__version__ = '0.5.0'
+
+
+include_dirs = [
+    pybind11.get_include(),
+    np.get_include(),
+]
+
+# compatibility when run from within python_bindings
+bindings_dir = 'python_bindings'
+if bindings_dir in os.path.basename(os.getcwd()):
+    source_files = ['./bindings.cpp']
+    include_dirs.extend(['../hnswlib/'])
+else:
+    source_files = ['./python_bindings/bindings.cpp']
+    include_dirs.extend(['./hnswlib/'])
+
+
+libraries = []
+extra_objects = []
+
+
+ext_modules = [
+    Extension(
+        'hnswlib',
+        source_files,
+        include_dirs=include_dirs,
+        libraries=libraries,
+        language='c++',
+        extra_objects=extra_objects,
+    ),
+]
+
+
+# As of Python 3.6, CCompiler has a `has_flag` method.
+# cf http://bugs.python.org/issue26689
+def has_flag(compiler, flagname):
+    """Return a boolean indicating whether a flag name is supported on
+    the specified compiler.
+    """
+    import tempfile
+    with tempfile.NamedTemporaryFile('w', suffix='.cpp') as f:
+        f.write('int main (int argc, char **argv) { return 0; }')
+        try:
+            compiler.compile([f.name], extra_postargs=[flagname])
+        except setuptools.distutils.errors.CompileError:
+            return False
+    return True
+
+
+def cpp_flag(compiler):
+    """Return the -std=c++[11/14] compiler flag.
+    C++14 is preferred over C++11 (when it is available).
+ """ + if has_flag(compiler, '-std=c++14'): + return '-std=c++14' + elif has_flag(compiler, '-std=c++11'): + return '-std=c++11' + else: + raise RuntimeError('Unsupported compiler -- at least C++11 support ' + 'is needed!') + + +class BuildExt(build_ext): + """A custom build extension for adding compiler-specific options.""" + c_opts = { + 'msvc': ['/EHsc', '/openmp', '/O2'], + 'unix': ['-O3', '-march=native'], # , '-w' + } + link_opts = { + 'unix': [], + 'msvc': [], + } + + if sys.platform == 'darwin': + c_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] + link_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] + else: + c_opts['unix'].append("-fopenmp") + link_opts['unix'].extend(['-fopenmp', '-pthread']) + + def build_extensions(self): + ct = self.compiler.compiler_type + opts = self.c_opts.get(ct, []) + if ct == 'unix': + opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version()) + opts.append(cpp_flag(self.compiler)) + if has_flag(self.compiler, '-fvisibility=hidden'): + opts.append('-fvisibility=hidden') + elif ct == 'msvc': + opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) + + for ext in self.extensions: + ext.extra_compile_args.extend(opts) + ext.extra_link_args.extend(self.link_opts.get(ct, [])) + + build_ext.build_extensions(self) + + +setup( + name='hnswlib', + version=__version__, + description='hnswlib', + author='Yury Malkov and others', + url='https://github.com/yurymalkov/hnsw', + long_description="""hnsw""", + ext_modules=ext_modules, + install_requires=['numpy'], + cmdclass={'build_ext': BuildExt}, + zip_safe=False, +) diff --git a/sift_1b.cpp b/sift_1b.cpp index 273c9828..2739490c 100644 --- a/sift_1b.cpp +++ b/sift_1b.cpp @@ -242,11 +242,11 @@ void sift_test1B() { size_t vecdim = 128; char path_index[1024]; char path_gt[1024]; - char *path_q = "bigann/bigann_query.bvecs"; - char *path_data = "bigann/bigann_base.bvecs"; + char *path_q = "../bigann/bigann_query.bvecs"; + char *path_data = "../bigann/bigann_base.bvecs"; sprintf(path_index, "sift1b_%dm_ef_%d_M_%d.bin", subset_size_milllions, efConstruction, M); - sprintf(path_gt, "bigann/gnd/idx_%dM.ivecs", subset_size_milllions); + sprintf(path_gt, "../bigann/gnd/idx_%dM.ivecs", subset_size_milllions); unsigned char *massb = new unsigned char[vecdim];