Skip to content

Commit

Permalink
Added support for result hdf file, and added big knn recall
Browse files Browse the repository at this point in the history
  • Loading branch information
randersenYB committed Jul 20, 2024
1 parent 5bc8daf commit 3b056fb
Show file tree
Hide file tree
Showing 8 changed files with 304 additions and 65 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ venv
.idea
aerospike/data/*.hdf5
aerospike/data/*
aerospike/results/*
17 changes: 16 additions & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@
"program": "${workspaceFolder}/aerospike/hdf_query.py",
"cwd": "${workspaceFolder}/aerospike",
"args": [
"--hdf", "random-xs-20-angular",
"--hdf", "${input:enterDataset}",
"--logfile", "./hdfquery.log",
"-r", "10"
],
Expand Down Expand Up @@ -193,6 +193,21 @@
],
"console": "integratedTerminal"
},
{
"name": "Python Debugger: hdf_query (gist 960)",
"type": "debugpy",
"request": "launch",
"program": "${workspaceFolder}/aerospike/hdf_query.py",
"cwd": "${workspaceFolder}/aerospike",
"args": [
"--dataset", "gist-960-euclidean",
"--logfile", "./hdfquery-gist1.log",
"-r", "1",
"--limit", "10",
"--idxname", "ANN-data_euclidean_SQUARED_EUCLIDEAN_960_16_100_100_Idx"
],
"console": "integratedTerminal"
},
{
"name": "Python Debugger: hdf_create_dataset (prompt)",
"type": "debugpy",
Expand Down
213 changes: 175 additions & 38 deletions aerospike/aerospikehdf.py

Large diffs are not rendered by default.

12 changes: 8 additions & 4 deletions aerospike/baseaerospike.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ def __init__(self, runtimeArgs: argparse.Namespace, logger: logging.Logger):
self._remainingquerynbrs : int = None
self._query_current_run : int = None
self._query_metric_value : float = None
self._query_metric_big_value : float = None
self._aerospike_metric_value : float = None
self._query_metric : dict[str,any] = None
self._canchecknbrs : bool = False
Expand Down Expand Up @@ -393,6 +394,7 @@ def prometheus_status(self, i:int, done:bool = False) -> None:
"querymetric": None if self._query_metric is None else self._query_metric["type"],
"querymetricvalue": self._query_metric_value,
"querymetricaerospikevalue": self._aerospike_metric_value,
"querymetricbigvalue": self._query_metric_big_value,
"hnswparams": self.hnswstr(),
"queryef": queryef
})
Expand Down Expand Up @@ -434,8 +436,10 @@ def print_log(self, msg :str, logLevel :int = logging.INFO) -> None:
async def shutdown(self, waitforcompletion:bool):

if waitforcompletion and self._sleepexit > 0:
self.prometheus_status(0, True)
self.print_log(f'existing sleeping {self._sleepexit}')
self.prometheus_status(0, True)
self.print_log(f'existing sleeping {self._sleepexit}')
self._prometheus_meter_provider.force_flush()
self._prometheus_metric_reader.force_flush()
await asyncio.sleep(self._sleepexit)

self.print_log(f'done: {self}')
Expand All @@ -447,8 +451,8 @@ async def shutdown(self, waitforcompletion:bool):
self._heartbeat_thread.join(timeout=hbt+1)
self._logger.info(f"Shutdown Heartbeat...")

self._prometheus_meter_provider.force_flush()
self._prometheus_metric_reader.force_flush()
self._prometheus_meter_provider.force_flush(1000)
self._prometheus_metric_reader.force_flush(1000)
self._prometheus_meter_provider.shutdown()
#self._prometheus_metric_reader.shutdown()
self._prometheus_http_server[0].shutdown()
Expand Down
64 changes: 64 additions & 0 deletions aerospike/bigann/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import numpy as np

def compute_recall_without_distance_ties(true_ids, run_ids, count):
    """Return how many distinct ids appear in both ``true_ids`` and ``run_ids``.

    Args:
        true_ids: Iterable of hashable ground-truth neighbor ids.
        run_ids: Iterable of hashable ids returned by the run under test.
        count: Unused here; kept so the signature parallels
            ``compute_recall_with_distance_ties``.

    Returns:
        int: Size of the intersection of the two id sets (duplicates within
        either input are counted once).
    """
    expected = set(true_ids)
    return len(expected.intersection(run_ids))

def compute_recall_with_distance_ties(true_ids, true_dists, run_ids, count):
    """Count matching ids for one query, accepting ground-truth distance ties.

    Ground-truth entries past position ``count - 1`` whose distance is within
    ``1e-6`` of the distance at ``count - 1`` are treated as equally valid
    answers, so the set of accepted ground-truth ids is extended to cover them.

    This function assumes ``true_dists`` is monotonic, either increasing or
    decreasing; the tie scan stops at the first entry that is not close.

    Args:
        true_ids: Sliceable sequence of ground-truth neighbor ids.
        true_dists: Sequence of ground-truth distances, index-aligned with
            ``true_ids``.
        run_ids: Iterable of ids returned by the run under test.
        count: Number of neighbors (k) the recall is measured at.

    Returns:
        tuple[int, bool]: ``(recall, found_tie)`` where ``recall`` is the
        number of distinct ids shared by the accepted ground-truth prefix and
        ``run_ids``, and ``found_tie`` tells whether the prefix was extended
        beyond ``count`` because of a tie.
    """
    gt_size = np.shape(true_dists)[0]

    if gt_size <= count:
        # The ground truth holds nothing beyond ``count``, so there is no
        # candidate for a tie. Using ``<=`` (rather than ``==``) also covers a
        # ground truth shorter than ``count``, which would otherwise index
        # ``true_dists[count - 1]`` out of range.
        return len(set(true_ids[:count]) & set(run_ids)), False

    # Tie check is anchored at the last in-range ground-truth distance.
    dist_tie_check = true_dists[count - 1]

    # Default: every remaining entry ties with the anchor.
    set_end = gt_size
    for i in range(count, gt_size):
        is_close = abs(dist_tie_check - true_dists[i]) < 1e-6
        if not is_close:
            set_end = i
            break

    found_tie = set_end > count
    recall = len(set(true_ids[:set_end]) & set(run_ids))
    return recall, found_tie

def get_recall_values(true_nn, run_nn, count, count_ties=True):
    """Compute per-query recall and its normalized mean / standard deviation.

    Args:
        true_nn: Pair ``(true_ids, true_dists)`` of ground-truth ids and
            distances, indexed by query.
        run_nn: Per-query ids returned by the run under test.
        count: Number of neighbors (k); also the normalizer for mean and std.
        count_ties: When true, ground-truth distance ties are accepted via
            ``compute_recall_with_distance_ties``; otherwise the ground truth
            is truncated to ``count`` columns and compared directly.

    Returns:
        tuple: ``(mean / count, std / count, recalls, queries_with_ties)``
        where ``recalls`` is the array of raw per-query match counts and
        ``queries_with_ties`` is the number of queries for which a tie was
        reported (always 0 when ``count_ties`` is false).
    """
    gt_ids, gt_dists = true_nn
    if not count_ties:
        gt_ids = gt_ids[:, :count]
        # NOTE(review): the shape check is taken to apply only on this
        # truncated path (otherwise a ground truth wider than ``count`` could
        # never pass) -- confirm against the original indentation.
        assert gt_ids.shape == run_nn.shape

    num_queries = len(run_nn)
    recalls = np.zeros(num_queries)
    queries_with_ties = 0

    # TODO probably not very efficient
    for q in range(num_queries):
        if not count_ties:
            recalls[q] = compute_recall_without_distance_ties(gt_ids[q], run_nn[q], count)
            continue
        recalls[q], tied = compute_recall_with_distance_ties(gt_ids[q], gt_dists[q], run_nn[q], count)
        if tied:
            queries_with_ties += 1

    normalizer = float(count)
    mean_recall = np.mean(recalls) / normalizer
    std_recall = np.std(recalls) / normalizer
    return (mean_recall, std_recall, recalls, queries_with_ties)

def knn(true_nn, run_nn, count, metrics):
    """Return the ``'knn'`` entry of ``metrics``, computing it if absent.

    Args:
        true_nn: Ground truth, passed through to ``get_recall_values``.
        run_nn: Per-query run results, passed through to ``get_recall_values``.
        count: Number of neighbors (k) the recall is measured at.
        metrics: Container supporting ``in``, item access and
            ``create_group`` (presumably an h5py group -- confirm with the
            caller).

    Returns:
        ``metrics['knn']``, either the previously stored entry or the group
        created here with ``mean`` / ``std`` attributes and a ``recalls``
        member.
    """
    if 'knn' in metrics:
        print("Found cached result")
        return metrics['knn']

    print('Computing knn metrics')
    # The group is created before the recall computation, as in the original.
    group = metrics.create_group('knn')
    mean, std, recalls, queries_with_ties = get_recall_values(true_nn, run_nn, count)
    if queries_with_ties > 0:
        print("Warning: %d/%d queries contained ties accounted for in recall" % (queries_with_ties, len(run_nn)))
    group.attrs['mean'] = mean
    group.attrs['std'] = std
    group['recalls'] = recalls
    return metrics['knn']
10 changes: 5 additions & 5 deletions aerospike/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def download(source_url: str, destination_path: str) -> None:
urlretrieve(source_url, destination_path)


def get_dataset_fn(dataset_name: str) -> Tuple[str,str]:
def get_dataset_fn(dataset_name: str, folder:str = "data") -> Tuple[str,str]:
"""
Returns the full file path for a given dataset name in the data directory.
Expand All @@ -31,8 +31,8 @@ def get_dataset_fn(dataset_name: str) -> Tuple[str,str]:
Returns:
str: The full file path of the dataset and the dataset name.
"""
if not os.path.exists("data"):
os.mkdir("data")
if not os.path.exists(folder):
os.mkdir(folder)

filename, fileext = os.path.splitext(dataset_name)
filenamewext : str = dataset_name
Expand All @@ -41,12 +41,12 @@ def get_dataset_fn(dataset_name: str) -> Tuple[str,str]:
filenamewext = f"{filename}.hdf5"

if (filenamewext[0] == os.path.sep
or filenamewext.startswith(f"data{os.path.sep}")
or filenamewext.startswith(f"{folder}{os.path.sep}")
or filenamewext.startswith(f".{os.path.sep}")):
splitpath = os.path.split(filename)
return filenamewext, splitpath[1]

return os.path.join("data", filenamewext), filename
return os.path.join(folder, filenamewext), filename

def get_dataset(dataset_name: str, hdfpath : str = None) -> Tuple[h5py.File, int]:
"""
Expand Down
18 changes: 1 addition & 17 deletions ann_benchmarks/algorithms/aerospike/config.yml
Original file line number Diff line number Diff line change
@@ -1,20 +1,4 @@
float:
angular:
- base_args: ['@metric', '@dimension']
constructor: Aerospike
disabled: false
docker_tag: ann-benchmarks-aerospike
module: ann_benchmarks.algorithms.aerospike
name: aerospike
run_groups:
cosine:
args: [
[cosine],
[{m: 16, ef_construction: 100, ef: 100}]
]
query_args: [
[]
]
float:
euclidean:
- base_args: ['@metric', '@dimension']
constructor: Aerospike
Expand Down
34 changes: 34 additions & 0 deletions ann_benchmarks/algorithms/aerospike/configangulareuclidean.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
float:
angular:
- base_args: ['@metric', '@dimension']
constructor: Aerospike
disabled: false
docker_tag: ann-benchmarks-aerospike
module: ann_benchmarks.algorithms.aerospike
name: aerospike
run_groups:
cosine:
args: [
[cosine],
[{m: 16, ef_construction: 100, ef: 100}]
]
query_args: [
[]
]
euclidean:
- base_args: ['@metric', '@dimension']
constructor: Aerospike
disabled: false
docker_tag: ann-benchmarks-aerospike
module: ann_benchmarks.algorithms.aerospike
name: aerospike
run_groups:
SQUARED_EUCLIDEAN:
args: [
[SQUARED_EUCLIDEAN], #Idx Type
[{m: 16, ef_construction: 100, ef: 100}]
]
query_args: [
[]
]

0 comments on commit 3b056fb

Please sign in to comment.