From 43e08622b6830cc78ce0b17e686ac9ba0c51e8cd Mon Sep 17 00:00:00 2001 From: randersenyb Date: Sat, 20 Jul 2024 21:55:27 -0700 Subject: [PATCH] Updated bashboard, fixed command line arguments to reflect the associated commands, added support for sklearn recall... --- .vscode/launch.json | 43 ++- aerospike/AerospikeHDFDashboard.json | 514 ++++++++++++++++++++++----- aerospike/aerospikehdf.py | 248 ++++++++++--- aerospike/baseaerospike.py | 207 +++++------ aerospike/bigann/metrics.py | 21 +- aerospike/metrics.py | 94 ++--- 6 files changed, 785 insertions(+), 342 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 9753e984..e5347da6 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -62,7 +62,7 @@ "console": "integratedTerminal" }, { - "name": "Python Debugger: hdf_import", + "name": "Python Debugger: hdf_import (prompt DS)", "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/aerospike/hdf_import.py", @@ -78,7 +78,7 @@ "console": "integratedTerminal" }, { - "name": "Python Debugger: hdf_import (prompt DS)", + "name": "Python Debugger: hdf_import (prompt HDF)", "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/aerospike/hdf_import.py", @@ -137,7 +137,20 @@ "console": "integratedTerminal" }, { - "name": "Python Debugger: hdf_query", + "name": "Python Debugger: hdf_query (prompt DS)", + "type": "debugpy", + "request": "launch", + "program": "${workspaceFolder}/aerospike/hdf_query.py", + "cwd": "${workspaceFolder}/aerospike", + "args": [ + "--dataset", "${input:enterDataset}", + "--logfile", "./hdfquery.log", + "-r", "10" + ], + "console": "integratedTerminal" + }, + { + "name": "Python Debugger: hdf_query (proimpt HDF)", "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/aerospike/hdf_query.py", @@ -150,7 +163,7 @@ "console": "integratedTerminal" }, { - "name": "Python Debugger: hdf_query (check prompt)", + "name": "Python Debugger: hdf_query (check prompt HDF)", "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/aerospike/hdf_query.py", @@ -171,7 +184,7 @@ "cwd": "${workspaceFolder}/aerospike", "args": [ "--dataset", "glove-100-angular", - "--logfile", "./hdfquery.log", + "--logfile", "./hdfquery-glove.log", "--check", "-r", "10", "--searchparams", "{\"ef\":10}" @@ -186,7 +199,7 @@ "cwd": "${workspaceFolder}/aerospike", "args": [ "--dataset", "glove-100-angular", - "--logfile", "./hdfquery.log", + "--logfile", "./hdfquery-glove.log", "--check", "-r", "10", "--limit", "10" @@ -202,14 +215,14 @@ "args": [ "--dataset", "gist-960-euclidean", "--logfile", "./hdfquery-gist1.log", - "-r", "1", - "--limit", "10", + "-r", "10", + "--limit", "100", "--idxname", "ANN-data_euclidean_SQUARED_EUCLIDEAN_960_16_100_100_Idx" ], "console": "integratedTerminal" }, { - "name": "Python Debugger: hdf_create_dataset (prompt)", + "name": "Python Debugger: hdf_create_dataset (prompt HDF)", "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/aerospike/hdf_create_dataset.py", @@ -247,14 +260,14 @@ "console": "integratedTerminal" }, { - "name": "Python Debugger: bigann_convert", + "name": "Python Debugger: bigann_convert (prompt DS)", "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/aerospike/bigann_convert_hdf.py", "cwd": "${workspaceFolder}/aerospike", "args": [ "--dataset", "${input:enterBigDataset}", - "--hdf", "bigtest.hdf5" + "--hdf", "${input:enterBigHDFFile}" ], "console": "integratedTerminal" }, @@ -271,12 +284,18 @@ "type": "promptString", "description": 
"Enter Big Dataset", "default": "random-xs" - }, + }, { "id": "enterHDFFile", "type": "promptString", "description": "Enter HDF Path", "default": "test" + }, + { + "id": "enterBigHDFFile", + "type": "promptString", + "description": "Enter HDF Path", + "default": "bigtest" } ] } \ No newline at end of file diff --git a/aerospike/AerospikeHDFDashboard.json b/aerospike/AerospikeHDFDashboard.json index 7f278fc2..1c90073c 100644 --- a/aerospike/AerospikeHDFDashboard.json +++ b/aerospike/AerospikeHDFDashboard.json @@ -70,7 +70,7 @@ } ] }, - "description": "Aerospike HDF ", + "description": "Aerospike HDF", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, @@ -132,10 +132,10 @@ "orientation": "auto", "reduceOptions": { "calcs": [ - "firstNotNull" + "lastNotNull" ], - "fields": "/^dataset$/", - "values": false + "fields": "dataset", + "values": true }, "showPercentChange": false, "textMode": "value", @@ -151,7 +151,7 @@ "disableTextWrap": false, "editorMode": "builder", "exemplar": false, - "expr": "aerospike_hdf_heartbeat{instance=~\"$client\", ns=\"$namespace\", set=\"$set\"}", + "expr": "aerospike_hdf_heartbeat{instance=~\"$client\"}", "format": "table", "fullMetaSearch": false, "includeNullMetadata": true, @@ -207,10 +207,10 @@ "orientation": "auto", "reduceOptions": { "calcs": [ - "firstNotNull" + "lastNotNull" ], - "fields": "/^idxdist$/", - "values": false + "fields": "idxdist", + "values": true }, "showPercentChange": false, "textMode": "value", @@ -226,7 +226,7 @@ "disableTextWrap": false, "editorMode": "builder", "exemplar": false, - "expr": "aerospike_hdf_heartbeat{instance=~\"$client\", idxns=\"$idxns\", idx=\"$idxname\"}", + "expr": "aerospike_hdf_heartbeat{instance=~\"$client\"}", "format": "table", "fullMetaSearch": false, "includeNullMetadata": true, @@ -282,10 +282,10 @@ "orientation": "auto", "reduceOptions": { "calcs": [ - "firstNotNull" + "lastNotNull" ], - "fields": "/^poprecs$/", - "values": false + "fields": "poprecs", + "values": true }, "showPercentChange": false, "textMode": "value", @@ -301,7 +301,7 @@ "disableTextWrap": false, "editorMode": "builder", "exemplar": false, - "expr": "aerospike_hdf_heartbeat{instance=~\"$client\", ns=\"$namespace\", set=\"$set\"}", + "expr": "aerospike_hdf_heartbeat{instance=~\"$client\"}", "format": "table", "fullMetaSearch": false, "includeNullMetadata": true, @@ -357,10 +357,10 @@ "orientation": "auto", "reduceOptions": { "calcs": [ - "firstNotNull" + "lastNotNull" ], - "fields": "/^dims$/", - "values": false + "fields": "dims", + "values": true }, "showPercentChange": false, "textMode": "value", @@ -376,7 +376,7 @@ "disableTextWrap": false, "editorMode": "builder", "exemplar": false, - "expr": "aerospike_hdf_heartbeat{instance=~\"$client\", ns=\"$namespace\", set=\"$set\"}", + "expr": "aerospike_hdf_heartbeat{instance=~\"$client\"}", "format": "table", "fullMetaSearch": false, "includeNullMetadata": true, @@ -433,8 +433,8 @@ "calcs": [ "lastNotNull" ], - "fields": "/^hnswparams$/", - "values": false + "fields": "hnswparams", + "values": true }, "showPercentChange": false, "textMode": "value", @@ -450,7 +450,7 @@ "disableTextWrap": false, "editorMode": "builder", "exemplar": false, - "expr": "aerospike_hdf_heartbeat{instance=~\"$client\", ns=\"$namespace\", set=\"$set\"}", + "expr": "aerospike_hdf_heartbeat{instance=~\"$client\"}", "format": "table", "fullMetaSearch": false, "includeNullMetadata": true, @@ -464,6 +464,255 @@ "title": "HNSW Params", "type": "stat" }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "red", + "mode": "thresholds" + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#6ED0E0", + "value": 100 + }, + { + "color": "#EAB839", + "value": 500 + }, + { + "color": "#EF843C", + "value": 750 + }, + { + "color": "#E24D42", + "value": 1000 + } + ] + }, + "unit": "locale" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 1 + }, + "id": 662, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [], + "fields": "", + "values": true + }, + "showPercentChange": false, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "sum(aerospike_hdf_exception{instance=~\"$client\"})", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Errors", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The name of the Aerospike Namespace ans Set where the vector values are stored", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "semi-dark-blue", + "mode": "fixed" + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "string" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 7, + "x": 0, + "y": 6 + }, + "id": 663, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [], + "fields": "/^fullname$/", + "values": true + }, + "showPercentChange": false, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "label_join(aerospike_hdf_heartbeat{instance=~\"$client\"}, \"fullname\", \".\", \"ns\",\"set\")", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Aerospike Set", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The location of where the vector index is stored within the Aerospike DB. 
", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "string" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 9, + "x": 7, + "y": 6 + }, + "id": 664, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [], + "fields": "/^fullname$/", + "values": true + }, + "showPercentChange": false, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "label_join(aerospike_hdf_heartbeat{instance=~\"$client\"}, \"fullname\", \".\", \"idxns\",\"idx\")", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Vector Index", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 660, + "panels": [], + "title": "Exceptions", + "type": "row" + }, { "datasource": { "type": "prometheus", @@ -527,7 +776,7 @@ "h": 13, "w": 24, "x": 0, - "y": 6 + "y": 10 }, "id": 640, "options": { @@ -592,7 +841,7 @@ "h": 1, "w": 24, "x": 0, - "y": 19 + "y": 23 }, "id": 651, "panels": [], @@ -655,7 +904,7 @@ "h": 5, "w": 3, "x": 0, - "y": 20 + "y": 24 }, "id": 649, "options": { @@ -664,9 +913,11 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": [], - "fields": "/^paused$/", - "values": false + "calcs": [ + "lastNotNull" + ], + "fields": "paused", + "values": true }, "showPercentChange": false, "textMode": "auto", @@ -730,7 +981,7 @@ "h": 5, "w": 3, "x": 3, - "y": 20 + "y": 24 }, "id": 626, "options": { @@ -740,7 +991,7 @@ "reduceOptions": { "calcs": [], "fields": "", - "values": false + "values": true }, "showThresholdLabels": false, "showThresholdMarkers": false, @@ -906,7 +1157,7 @@ "h": 9, "w": 24, "x": 0, - "y": 25 + "y": 29 }, "id": 638, "options": { @@ -970,7 +1221,7 @@ "h": 1, "w": 24, "x": 0, - "y": 34 + "y": 38 }, "id": 652, "panels": [], @@ -1009,7 +1260,7 @@ "h": 5, "w": 3, "x": 0, - "y": 35 + "y": 39 }, "id": 646, "options": { @@ -1021,8 +1272,8 @@ "calcs": [ "firstNotNull" ], - "fields": "/^queryruns$/", - "values": false + "fields": "queryruns", + "values": true }, "showPercentChange": false, "textMode": "value", @@ -1084,7 +1335,7 @@ "h": 5, "w": 3, "x": 3, - "y": 35 + "y": 39 }, "id": 650, "options": { @@ -1096,8 +1347,8 @@ "calcs": [ "firstNotNull" ], - "fields": "/^queries$/", - "values": false + "fields": "queries", + "values": true }, "showPercentChange": false, "textMode": "value", @@ -1159,7 +1410,7 @@ "h": 5, "w": 3, "x": 6, - "y": 35 + "y": 39 }, "id": 647, "options": { @@ -1171,8 +1422,8 @@ "calcs": [ "firstNotNull" ], - "fields": "/^querynbrlmt$/", - "values": false + "fields": "querynbrlmt", + "values": true }, "showPercentChange": false, "textMode": "value", @@ -1233,7 +1484,7 @@ "h": 5, "w": 2, "x": 9, - "y": 35 + "y": 39 }, "id": 659, "options": { @@ -1242,11 +1493,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "/^queryef$/", - "values": false + "calcs": [], + "fields": "queryef", + 
"values": true }, "showPercentChange": false, "textMode": "value", @@ -1262,7 +1511,7 @@ "disableTextWrap": false, "editorMode": "builder", "exemplar": false, - "expr": "aerospike_hdf_heartbeat{instance=~\"$client\", ns=\"$namespace\", set=\"$set\"}", + "expr": "aerospike_hdf_heartbeat{instance=~\"$client\", ns=\"$idxns\", idx=\"$idxname\"}", "format": "table", "fullMetaSearch": false, "includeNullMetadata": true, @@ -1309,7 +1558,7 @@ "h": 5, "w": 3, "x": 11, - "y": 35 + "y": 39 }, "id": 648, "options": { @@ -1319,7 +1568,7 @@ "reduceOptions": { "calcs": [], "fields": "", - "values": false + "values": true }, "showThresholdLabels": false, "showThresholdMarkers": false, @@ -1429,7 +1678,7 @@ "h": 5, "w": 2, "x": 14, - "y": 35 + "y": 39 }, "id": 653, "options": { @@ -1438,9 +1687,11 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": [], - "fields": "/^querymetric$/", - "values": false + "calcs": [ + "firstNotNull" + ], + "fields": "querymetric", + "values": true }, "showPercentChange": false, "text": {}, @@ -1477,7 +1728,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "The recall as calculated based on the ANN neighbors result set", + "description": "This recall is calculated based on the distance between neighbors based on the ANN formulas.", "fieldConfig": { "defaults": { "color": { @@ -1516,9 +1767,9 @@ }, "gridPos": { "h": 5, - "w": 3, + "w": 2, "x": 16, - "y": 35 + "y": 39 }, "id": 654, "options": { @@ -1527,9 +1778,102 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": [], - "fields": "/^querymetricvalue$/", - "values": false + "calcs": [ + "firstNotNull" + ], + "fields": "querymetricvalue", + "values": true + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "aerospike_hdf_heartbeat{idx=\"$idxname\", instance=~\"$client\", idxns=\"$idxns\"}", + "format": "table", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "B", + "useBackend": false + } + ], + "title": "Recall (Distance)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "This recall is calculated based on the distance as returned by the Aerospike API using the ANN formulas.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.5 + }, + { + "color": "yellow", + "value": 0.6 + }, + { + "color": "green", + "value": 0.7 + }, + { + "color": "dark-green", + "value": 0.8 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 18, + "y": 39 + }, + "id": 661, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "firstNotNull" + ], + "fields": "querymetricaerospikevalue", + "values": true }, "showPercentChange": false, "text": {}, @@ -1558,7 +1902,7 @@ "useBackend": false } ], - "title": "Recall (ANN Calculated)", + "title": "Recall (Aerospike Distance)", "type": "stat" }, { @@ -1566,7 +1910,7 @@ 
"type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "The recall based on the distance returned from the Aerospike Vector client API", + "description": "This recall is calculated using the Big ANN's KNN neighbor formulas (distance is not used).", "fieldConfig": { "defaults": { "color": { @@ -1606,8 +1950,8 @@ "gridPos": { "h": 5, "w": 3, - "x": 19, - "y": 35 + "x": 21, + "y": 39 }, "id": 657, "options": { @@ -1616,9 +1960,11 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": [], - "fields": "/^querymetricaerospikevalue$/", - "values": false + "calcs": [ + "firstNotNull" + ], + "fields": "/^querymetricbigvalue$/", + "values": true }, "showPercentChange": false, "text": {}, @@ -1647,7 +1993,7 @@ "useBackend": false } ], - "title": "Recall (Aerospike)", + "title": "KNN Recall (Nbors)", "type": "stat" }, { @@ -1713,7 +2059,7 @@ "h": 13, "w": 24, "x": 0, - "y": 40 + "y": 44 }, "id": 610, "options": { @@ -1810,7 +2156,7 @@ "h": 15, "w": 24, "x": 0, - "y": 53 + "y": 57 }, "id": 656, "options": { @@ -1873,8 +2219,8 @@ { "current": { "selected": true, - "text": "default", - "value": "default" + "text": "prometheus", + "value": "edj97jcoudpfkd" }, "hide": 0, "includeAll": false, @@ -1907,7 +2253,7 @@ "query": "label_values(aerospike_hdf_heartbeat,instance)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, @@ -1931,7 +2277,7 @@ "query": "label_values(aerospike_hdf_heartbeat,ns)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, @@ -1943,7 +2289,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "definition": "label_values(aerospike_hdf_heartbeat,set)", + "definition": "label_values(aerospike_hdf_heartbeat{ns=\"$namespace\"},set)", "hide": 0, "includeAll": true, "label": "Set", @@ -1952,10 +2298,10 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(aerospike_hdf_heartbeat,set)", + "query": "label_values(aerospike_hdf_heartbeat{ns=\"$namespace\"},set)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, @@ -1979,7 +2325,7 @@ "query": "label_values(aerospike_hdf_heartbeat,idxns)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, @@ -1991,7 +2337,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "definition": "label_values(aerospike_hdf_heartbeat,idx)", + "definition": "label_values(aerospike_hdf_heartbeat{idxns=\"$idxns\"},idx)", "hide": 0, "includeAll": true, "label": "Index name", @@ -2000,10 +2346,10 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(aerospike_hdf_heartbeat,idx)", + "query": "label_values(aerospike_hdf_heartbeat{idxns=\"$idxns\"},idx)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, @@ -2015,7 +2361,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "definition": "label_values(aerospike_hdf_query_total,run)", + "definition": "label_values(aerospike_hdf_query_total{idx=\"$idxname\", ns=\"$idxns\"},run)", "hide": 0, "includeAll": true, "label": "Query Run", @@ -2024,10 +2370,10 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(aerospike_hdf_query_total,run)", + "query": "label_values(aerospike_hdf_query_total{idx=\"$idxname\", 
ns=\"$idxns\"},run)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 3, @@ -2036,7 +2382,7 @@ ] }, "time": { - "from": "now-3h", + "from": "now-5m", "to": "now" }, "timepicker": { @@ -2067,6 +2413,6 @@ "timezone": "", "title": "Aerospike HDF", "uid": "fzUPYeJIkhdf", - "version": 60, + "version": 12, "weekStart": "" } \ No newline at end of file diff --git a/aerospike/aerospikehdf.py b/aerospike/aerospikehdf.py index da85712f..8d00849a 100644 --- a/aerospike/aerospikehdf.py +++ b/aerospike/aerospikehdf.py @@ -4,6 +4,7 @@ import logging import argparse import statistics +import json from enum import Flag, auto from typing import Iterable, List, Union, Any @@ -29,21 +30,21 @@ def parse_arguments_population(parser: argparse.ArgumentParser) -> None: ''' Adds the arguments required to populate an index. ''' - if len([x.dest for x in parser._actions if "dataset" == x.dest]) == 0: - parser.add_argument( - '-d', "--dataset", - metavar="DS", - help="the dataset to load training points from", - default="glove-100-angular", - choices=DATASETS.keys(), - ) - parser.add_argument( - "--hdf", - metavar="HDFFILE", - help="A HDF file that will be the dataset to load training points from... Defaults to 'data' folder", - default=None, - type=str - ) + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument( + '-d', "--dataset", + metavar="DS", + help="the dataset to load training points from", + default="glove-100-angular", + choices=DATASETS.keys(), + ) + group.add_argument( + "--hdf", + metavar="HDFFILE", + help="A HDF file that will be the dataset to load training points from... Defaults to 'data' folder", + default=None, + type=str + ) parser.add_argument( '-c', "--concurrency", metavar="N", @@ -57,7 +58,58 @@ def parse_arguments_population(parser: argparse.ArgumentParser) -> None: - > 1 -- The number of records upserted, concurrently (async), before the app waits for the upserts to complete. ''', default=500, + ) + parser.add_argument( + '-n', "--namespace", + metavar="NS", + help="The Aerospike Namespace", + default="test", + ) + parser.add_argument( + '-s', "--setname", + metavar="SET", + help="The Aerospike Set Name. Will default to the dataset name", + default=None, + ) + parser.add_argument( + '-N', "--idxnamespace", + metavar="NS", + help="Aerospike Namespace where the Vector Idx will be located. Defaults to --Namespace", + default=None, + type=str + ) + parser.add_argument( + '-I', "--idxname", + metavar="IDX", + help="The Vector Index Name. Defaults to the DataSet Name with the suffix of '_idx'", + default=None, + ) + parser.add_argument( + '-b', "--vectorbinname", + metavar="BIN", + help="The Aerospike Bin Name where the Vector is stored", + default="HDF_embedding", + ) + parser.add_argument( + '-g', "--generatedetailsetname", + help="Generates a Set name based on distance type, dimensions, index params, etc.", + action='store_true' ) + parser.add_argument( + '-D', "--distancetype", + metavar="DIST", + help="The Vector's Index Distance Type. The default is to select the type based on the dataset", + type=vectorTypes.VectorDistanceMetric, + choices=list(vectorTypes.VectorDistanceMetric), + default=None + ) + parser.add_argument( + '-P', "--indexparams", + metavar="PARM", + type=json.loads, + help="The Vector's Index Params (HnswParams)", + default='{"m": 16, "ef_construction": 100, "ef": 100}' + ) parser.add_argument( "--idxdrop", help="If the Vector Index existence, it will be dropped. 
Otherwise is is updated.", @@ -97,21 +149,41 @@ def parse_arguments_query(parser: argparse.ArgumentParser) -> None: ''' Adds the arguments required to query an index. ''' - if len([x.dest for x in parser._actions if "dataset" == x.dest]) == 0: - parser.add_argument( - '-d', "--dataset", - metavar="DS", - help="the dataset to load the search points from", - default="glove-100-angular", - choices=DATASETS.keys(), - ) - parser.add_argument( - "--hdf", - metavar="HDFFILE", - help="A HDF file that will be the dataset to load search points from... Defaults to 'data' folder", - default=None, - type=str - ) + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument( + '-d', "--dataset", + metavar="DS", + help="the dataset to load the search points from", + default="glove-100-angular", + choices=DATASETS.keys(), + ) + group.add_argument( + "--hdf", + metavar="HDFFILE", + help="A HDF file that will be the dataset to load search points from... Defaults to 'data' folder", + default=None, + type=str + ) + parser.add_argument( + '-N', "--idxnamespace", + metavar="NS", + help="Aerospike Namespace where the Vector Idx will be located. Defaults to 'test'", + default="test", + type=str + ) + parser.add_argument( + '-I', "--idxname", + metavar="IDX", + help="The Vector Index Name. If not provided, it defaults to the dataset name with a suffix of '_idx'", + default=None + ) + parser.add_argument( + '-S', "--searchparams", + metavar="PARM", + type=json.loads, + help="The Vector's Search Params (HnswSearchParams)", + default=None + ) parser.add_argument( '-r', "--runs", metavar="RUNS", @@ -149,9 +221,9 @@ def parse_arguments_query(parser: argparse.ArgumentParser) -> None: type=str, choices=METRICS.keys(), ) - + BaseAerospike.parse_arguments(parser) - + def __init__(self, runtimeArgs: argparse.Namespace, actions: OperationActions): super().__init__(runtimeArgs, logger) @@ -180,8 +252,38 @@ def __init__(self, runtimeArgs: argparse.Namespace, actions: OperationActions): self._idx_resource_event = runtimeArgs.exhaustedevt self._idx_resource_cnt = 0 self._idx_maxrecs = runtimeArgs.maxrecs - + self._namespace = runtimeArgs.namespace + if runtimeArgs.idxnamespace is None or not runtimeArgs.idxnamespace: + self._idx_namespace = self._namespace + else: + self._idx_namespace = runtimeArgs.idxnamespace + + if runtimeArgs.setname is None or not runtimeArgs.setname: + self._setName = self._datasetname + else: + self._setName = runtimeArgs.setname + self._paramsetname = runtimeArgs.generatedetailsetname + if runtimeArgs.idxname is None or not runtimeArgs.idxname: + self._idx_name = f'{self._datasetname}_Idx' + else: + self._idx_name = runtimeArgs.idxname + self._idx_binName = runtimeArgs.vectorbinname + + self._idx_distance = runtimeArgs.distancetype + if runtimeArgs.indexparams is None or len(runtimeArgs.indexparams) == 0: + self._idx_hnswparams = None + else: + self._idx_hnswparams = BaseAerospike.set_hnsw_params_attrs( + vectorTypes.HnswParams(), + runtimeArgs.indexparams + ) + if OperationActions.QUERY in actions: + self._idx_namespace = runtimeArgs.idxnamespace + if runtimeArgs.idxname is None or not runtimeArgs.idxname: + self._idx_name = f'{self._datasetname}_Idx' + else: + self._idx_name = runtimeArgs.idxname self._query_runs = runtimeArgs.runs self._query_parallel = runtimeArgs.parallel self._query_check = runtimeArgs.check @@ -195,6 +297,14 @@ def __init__(self, runtimeArgs: argparse.Namespace, actions: OperationActions): self._query_distance_aerospike : List = None self._query_latencies : 
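A note on the reworked dataset options above: because --dataset and --hdf now sit in a mutually exclusive group created with required=True, one of the two must be given explicitly, and the default on --dataset no longer applies when neither flag is passed. A minimal, standalone sketch of that argparse behavior (only these options, no project imports; the HDF path is illustrative):

    import argparse, json

    p = argparse.ArgumentParser()
    g = p.add_mutually_exclusive_group(required=True)
    g.add_argument('-d', '--dataset', default='glove-100-angular')
    g.add_argument('--hdf', default=None)
    p.add_argument('-P', '--indexparams', type=json.loads,
                   default='{"m": 16, "ef_construction": 100, "ef": 100}')

    args = p.parse_args(['--hdf', 'data/test.hdf5'])
    print(args.indexparams)   # {'m': 16, ...} -- a string default is itself run through json.loads
    # p.parse_args([])        # SystemExit: one of the arguments -d/--dataset --hdf is required
    # p.parse_args(['-d', 'glove-100-angular', '--hdf', 'x'])   # SystemExit: not allowed together
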
List[float] = None self._query_produce_resultfile : bool = not runtimeArgs.noresultfile + + if runtimeArgs.searchparams is None or len(runtimeArgs.searchparams) == 0: + self._query_hnswparams = None + else: + self._query_hnswparams = BaseAerospike.set_hnsw_params_attrs( + vectorTypes.HnswSearchParams(), + runtimeArgs.searchparams + ) async def __aenter__(self): return self @@ -224,6 +334,12 @@ async def get_dataset(self) -> None: if self._idx_distance is None or not self._idx_distance: raise ValueError(f"Distance Map '{distance}' was not found.") + if self._dataset.attrs.get('sourceidxname', None) is not None: + if (self._idx_name != self._dataset.attrs['sourceidxname'] + or self._idx_namespace != self._dataset.attrs['sourceisxnamespace']): + self.print_log(f"Index Name doesn't match source idx name in HDF file. Names are: {self._idx_namespace}.{self._idx_name} != {self._dataset.attrs['sourceisxnamespace']}.{ self._dataset.attrs['sourceidxname']}", + logging.WARN) + if self._paramsetname: if self._idx_distance.casefold() == distance.casefold(): setNameType = self._idx_distance @@ -232,8 +348,8 @@ async def get_dataset(self) -> None: self._setName = f'{self._setName}_{setNameType}_{self._dimensions}_{self._idx_hnswparams.m}_{self._idx_hnswparams.ef_construction}_{self._idx_hnswparams.ef}' self._idx_name = f'{self._setName}_Idx' - self._canchecknbrs = self._neighbors is not None and len(self._neighbors) > 0 - if self._canchecknbrs and (self._query_nbrlimit is None or self._query_nbrlimit <= 0 or self._query_nbrlimit > len(self._neighbors[0])): + self._canchecknbors = self._neighbors is not None and len(self._neighbors) > 0 + if self._canchecknbors and (self._query_nbrlimit is None or self._query_nbrlimit <= 0 or self._query_nbrlimit > len(self._neighbors[0])): self._query_nbrlimit = len(self._neighbors[0]) self._remainingrecs = 0 @@ -256,8 +372,9 @@ async def get_dataset(self) -> None: else: self._pk_consecutivenbrs = False - self.prometheus_status(0) - self.print_log(f'get_dataset Exit: {self}, {self._ann_distance}, {self._idx_distance}, Train Array: {len(self._trainarray)}, Query Array: {len(self._queryarray)}, Distance: {distance}, Dimensions: {self._dimensions}, Neighbors: {0 if self._neighbors is None else len(self._neighbors)}, Distances: {0 if self._distances is None else len(self._distances)}, PK array: {0 if self._pks is None else len(self._pks)}, PK consistence: {self._pk_consecutivenbrs}, Neighbors Check: {self._canchecknbrs}') + self._heartbeat_stage = 1 + self.prometheus_status() + self.print_log(f'get_dataset Exit: {self}, {self._ann_distance}, {self._idx_distance}, Train Array: {len(self._trainarray)}, Query Array: {len(self._queryarray)}, Distance: {distance}, Dimensions: {self._dimensions}, Neighbors: {0 if self._neighbors is None else len(self._neighbors)}, Distances: {0 if self._distances is None else len(self._distances)}, PK array: {0 if self._pks is None else len(self._pks)}, PK consistence: {self._pk_consecutivenbrs}, Neighbors Check: {self._canchecknbors}') async def drop_index(self, adminClient: vectorASyncAdminClient) -> None: self.print_log(f'Dropping Index {self._idx_namespace}.{self._idx_name}') @@ -407,7 +524,10 @@ async def populate(self) -> None: self._trainarray = self._trainarray.astype(np.float32) self.print_log(f'populate: {self} Shape: {self._trainarray.shape}') - + + self._heartbeat_stage = 2 + self.prometheus_status() + async with vectorASyncAdminClient(seeds=self._host, listener_name=self._listern, is_loadbalancer=self._useloadbalancer @@ -607,12 
+727,20 @@ async def query(self) -> None: self.print_log(f'Found Index {self._idx_namespace}.{self._idx_name} with Info {idxinfo}') self._idx_hnswparams = BaseAerospike.set_hnsw_params_attrs(vectorTypes.HnswParams(), - idxinfo) + idxinfo['hnsw_params']) + self._idx_binName = idxinfo["field"] + self._setName = idxinfo["setFilter"] + self._namespace = idxinfo["id"]["namespace"] + if self._query_hnswparams is None and self._idx_hnswparams is not None: + self._query_hnswparams = vectorTypes.HnswSearchParams(ef=self._idx_hnswparams.ef) + + self._heartbeat_stage = 2 + self.prometheus_status() self.print_log(f'Starting Query Runs ({self._query_runs}) on {self._idx_namespace}.{self._idx_name}') metricfunc = None distancemetric : DistanceMetric= None - if self._canchecknbrs: + if self._canchecknbors: metricfunc = None if self._query_metric is None else self._query_metric["function"] distancemetric = DISTANCES[self._ann_distance] @@ -649,12 +777,12 @@ async def query(self) -> None: self._aerospike_metric_value = metricfunc(self._distances, self._query_distance_aerospike, self._query_metric_aerospike_result, i-1, len(self._query_distance_aerospike[0])) metricValuesAS.append(self._aerospike_metric_value) - if len(self._query_neighbors) == 0: + if len(self._query_neighbors) == 0 or not self._canchecknbors: self._query_metric_big_value = None else: self._query_metric_big_value = bigknn((self._neighbors,self._distances), self._query_neighbors, len(self._query_neighbors[0]), self._query_metric_bigann_result).attrs["mean"] metricValuesBig.append(self._query_metric_big_value) - + self._logger.info(f"Run: {i}, Neighbors: {len(self._query_neighbors)}, {self._query_metric['type']}: {self._query_metric_value}, aerospike recall: {self._aerospike_metric_value}, Big: {self._query_metric_big_value}") i += 1 @@ -664,11 +792,12 @@ async def query(self) -> None: i = 0 totalquerytime = 0.0 for rundist, runasdist, runnns, times in queries: - if len(rundist) > 0: - metricValues.append(metricfunc(self._distances, rundist, self._query_metric_result, i, len(rundist[0]))) - if len(runnns) > 0: - metricValuesBig.append(bigknn((self._neighbors,self._distances), runnns, len(runnns[0]), self._query_metric_bigann_result).attrs["mean"]) - metricValuesAS.append(metricfunc(self._distances, runasdist, self._query_metric_aerospike_result, i, len(runasdist[0]))) + if metricfunc is not None: + if len(rundist) > 0: + metricValues.append(metricfunc(self._distances, rundist, self._query_metric_result, i, len(rundist[0]))) + metricValuesAS.append(metricfunc(self._distances, runasdist, self._query_metric_aerospike_result, i, len(runasdist[0]))) + if len(runnns) > 0 and self._canchecknbors: + metricValuesBig.append(bigknn((self._neighbors,self._distances), runnns, len(runnns[0]), self._query_metric_bigann_result).attrs["mean"]) self._query_distance = rundist self._query_distance_aerospike = runasdist self._query_neighbors = runnns @@ -729,13 +858,22 @@ async def _check_query_distances(self, distances:List[float], distances_aerospik def _get_orginal_vector_from_pk(self, pk : any) -> Union[np.ndarray, List]: self._logger.debug(f"_get_orginal_vector_from_pk: pk:{pk}") - if self._pk_consecutivenbrs or self._pks is None: - return self._trainarray[pk] + try: + if self._pk_consecutivenbrs or self._pks is None: + return self._trainarray[pk] + + fndidx = self._pks.index(pk) + self._logger.debug(f"_get_orginal_vector_from_pk: {pk} idx:{fndidx}") + return self._trainarray[fndidx] + except IndexError as e: + self._logger.exception(f"pk: {pk}") + 
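+            # NOTE: the exception counter below records the failed lookup and the
+            # caller receives an all-zero vector of the training dimensionality,
+            # so a single bad PK does not abort the whole query run.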
self._exception_counter.add(1, {"exception_type":f"PK index lookup Failed", "handled_by_user":True,"ns":self._idx_namespace,"set":self._idx_name}) + return np.zeros(len(self._trainarray[0])) + except ValueError as e: + self._logger.exception(f"pk: {pk}") + self._exception_counter.add(1, {"exception_type":f"PK value error lookup Failed", "handled_by_user":True,"ns":self._idx_namespace,"set":self._idx_name}) + return np.zeros(len(self._trainarray[0])) - fndidx = self._pks.index(pk) - self._logger.debug(f"_get_orginal_vector_from_pk: pk idx:{fndidx}") - return self._trainarray[fndidx] - async def query_run(self, client:vectorASyncClient, runNbr:int, distancemetric : DistanceMetric) -> tuple[List, List, List, List[float]]: ''' Returns a tuple of calculated distances, aerospike distances, neighbors, latency times @@ -772,7 +910,7 @@ async def query_run(self, client:vectorASyncClient, runNbr:int, distancemetric : self._exception_counter.add(1, {"exception_type":"No Query Results", "handled_by_user":False,"ns":self._idx_namespace,"set":self._idx_name,"run":runNbr}) logger.warn(f'No Query Results for {self._idx_namespace}.{self._idx_name}', logging.WARNING) msg = "Warn: No Results" - elif self._canchecknbrs and len(self._neighbors[len(rundistance)]) > 0: + elif self._canchecknbors and len(self._neighbors[len(rundistance)]) > 0: if not await self._check_query_neighbors(result_ids, len(rundistance), runNbr): msg = "Warn: Neighbor Compare Failed" else: @@ -791,7 +929,7 @@ async def query_run(self, client:vectorASyncClient, runNbr:int, distancemetric : msg += ", Distances don't match" rundistance.append(distances) except Exception as e: - msg = "Distance Calculation Failed: {e}" + msg = f"Distance Calculation Failed: {e}" self._logger.exception(f"Distance Calculation Failed Run: {runNbr}") self._exception_counter.add(1, {"exception_type":f"Distance Calculation Failed", "handled_by_user":True,"ns":self._idx_namespace,"set":self._idx_name,"run":runNbr}) rundistance.append([]) diff --git a/aerospike/baseaerospike.py b/aerospike/baseaerospike.py index 37a82176..7b3ce35b 100644 --- a/aerospike/baseaerospike.py +++ b/aerospike/baseaerospike.py @@ -3,7 +3,6 @@ import numpy as np import time import logging -import json import argparse from enum import Flag, auto @@ -75,65 +74,7 @@ def parse_arguments(parser: argparse.ArgumentParser) -> None: '-T', "--vectortls", help="Use TLS to connect to the Vector DB Server", action='store_true' - ) - parser.add_argument( - '-n', "--namespace", - metavar="NS", - help="The Aerospike Namespace", - default="test", - ) - parser.add_argument( - '-N', "--idxnamespace", - metavar="NS", - help="Aerospike Namespace where the Vector Idx will be located. Defaults to --Namespace", - default=None, - type=str - ) - parser.add_argument( - '-s', "--setname", - metavar="SET", - help="The Aerospike Set Name", - default="HDF-data", - ) - parser.add_argument( - '-I', "--idxname", - metavar="IDX", - help="The Vector Index Name. Defaults to the Set Name with the suffix of '_idx'", - default=None, - ) - parser.add_argument( - '-g', "--generatedetailsetname", - help="Generates a Set name based on distance type, dimensions, index params, etc.", - action='store_true' - ) - parser.add_argument( - '-b', "--vectorbinname", - metavar="BIN", - help="The Aerospike Bin Name where the Vector is stored", - default="HDF_embedding", - ) - parser.add_argument( - '-D', "--distancetype", - metavar="DIST", - help="The Vector's Index Distance Type. 
The default is to select the type based on the dataset", - type=vectorTypes.VectorDistanceMetric, - choices=list(vectorTypes.VectorDistanceMetric), - default=None - ) - parser.add_argument( - '-P', "--indexparams", - metavar="PARM", - type=json.loads, - help="The Vector's Index Params (HnswParams)", - default='{"m": 16, "ef_construction": 100, "ef": 100}' - ) - parser.add_argument( - '-S', "--searchparams", - metavar="PARM", - type=json.loads, - help="The Vector's Search Params (HnswSearchParams)", - default=None - ) + ) parser.add_argument( '-L', "--logfile", metavar="LOG", @@ -205,36 +146,17 @@ def __init__(self, runtimeArgs: argparse.Namespace, logger: logging.Logger): self._listern = None self._useloadbalancer = runtimeArgs.vectorloadbalancer - self._namespace = runtimeArgs.namespace - if runtimeArgs.idxnamespace is None or not runtimeArgs.idxnamespace: - self._idx_namespace = self._namespace - else: - self._idx_namespace = runtimeArgs.idxnamespace - self._setName = runtimeArgs.setname - self._paramsetname = runtimeArgs.generatedetailsetname - if runtimeArgs.idxname is None or not runtimeArgs.idxname: - self._idx_name = f'{self._setName}_Idx' - else: - self._idx_name = runtimeArgs.idxname - self._idx_binName = runtimeArgs.vectorbinname - - self._idx_distance = runtimeArgs.distancetype - if runtimeArgs.indexparams is None or len(runtimeArgs.indexparams) == 0: - self._idx_hnswparams = None - else: - self._idx_hnswparams = BaseAerospike.set_hnsw_params_attrs( - vectorTypes.HnswParams(), - runtimeArgs.indexparams - ) + self._namespace : str = None + self._idx_namespace : str = None + self._setName : str = None + self._paramsetname = None + self._idx_name : str = None + self._idx_binName : str = None - if runtimeArgs.searchparams is None or len(runtimeArgs.searchparams) == 0: - self._query_hnswparams = None - else: - self._query_hnswparams = BaseAerospike.set_hnsw_params_attrs( - vectorTypes.HnswSearchParams(), - runtimeArgs.searchparams - ) - + self._idx_distance = None + self._idx_hnswparams : vectorTypes.HnswParams = None + self._query_hnswparams : vectorTypes.HnswSearchParams = None + self._sleepexit = runtimeArgs.exitdelay self._actions : OperationActions = None self._waitidx : bool = None @@ -254,11 +176,12 @@ def __init__(self, runtimeArgs: argparse.Namespace, logger: logging.Logger): self._query_metric_big_value : float = None self._aerospike_metric_value : float = None self._query_metric : dict[str,any] = None - self._canchecknbrs : bool = False + self._canchecknbors : bool = False self._logging_init(runtimeArgs, logger) - - self._start_prometheus_heartbeat() + + self._heartbeat_stage = 0 + self._start_prometheus_heartbeat() def _prometheus_init(self, runtimeArgs: argparse.Namespace) -> None: @@ -355,7 +278,37 @@ def set_hnsw_params_attrs(__obj :object, __dict: dict) -> object: setattr(__obj, key, __dict[key]) return __obj - def prometheus_status(self, i:int, done:bool = False) -> None: + def prometheus_status(self, done:bool = False) -> None: + + self.__cnthb__ += 1 + + if self._heartbeat_stage == 0: + self._prometheus_heartbeat_gauge.set(self.__cnthb__, + {"paused": "Starting" + }) + return + if self._heartbeat_stage == 1: + attrs = {"dims": self._dimensions, + "poprecs": None if self._trainarray is None else len(self._trainarray), + "queries": None if self._queryarray is None else len(self._queryarray), + "querynbrlmt": self._query_nbrlimit, + "queryruns": self._query_runs, + "dataset":self._datasetname, + "paused":"Cellecting", + "action": None if self._actions is None else 
self._actions.name, + "hnswparams": self.hnswstr() + } + if self._namespace is not None: + attrs["ns"] = self._namespace + attrs["set"] = self._setName + if self._idx_namespace is not None: + attrs["idxns"] = self._idx_namespace + attrs["idx"] = self._idx_name + + self._prometheus_heartbeat_gauge.set(self.__cnthb__, + attrs) + return + pausestate : str = None if done: pausestate = "Done" @@ -368,45 +321,49 @@ def prometheus_status(self, i:int, done:bool = False) -> None: pausestate = "Running" else: pausestate = "Idle" + elif self._actions is not None and OperationActions.QUERY in self._actions: + pausestate = "Query" if self._query_hnswparams is None: queryef = '' if self._idx_hnswparams is None else str(self._idx_hnswparams.ef) else: queryef = self._query_hnswparams.ef - - self._prometheus_heartbeat_gauge.set(i, {"ns":self._namespace, - "set":self._setName, - "idxns":self._idx_namespace, - "idx":self._idx_name, - "idxbin":self._idx_binName, - "idxdist": None if self._idx_distance is None else self._idx_distance.name, - "dims": self._dimensions, - "poprecs": None if self._trainarray is None else len(self._trainarray), - "queries": None if self._queryarray is None else len(self._queryarray), - "querynbrlmt": self._query_nbrlimit, - "queryruns": self._query_runs, - "querycurrun": self._query_current_run, - "dataset":self._datasetname, - "paused": pausestate, - "action": None if self._actions is None else self._actions.name, - "remainingRecs" : self._remainingrecs, - "remainingquerynbrs" : self._remainingquerynbrs, - "querymetric": None if self._query_metric is None else self._query_metric["type"], - "querymetricvalue": self._query_metric_value, - "querymetricaerospikevalue": self._aerospike_metric_value, - "querymetricbigvalue": self._query_metric_big_value, - "hnswparams": self.hnswstr(), - "queryef": queryef - }) + + self._prometheus_heartbeat_gauge.set(self.__cnthb__, + {"ns":self._namespace, + "set":self._setName, + "idxns":self._idx_namespace, + "idx":self._idx_name, + "idxbin":self._idx_binName, + "idxdist": None if self._idx_distance is None else self._idx_distance.name, + "dims": self._dimensions, + "poprecs": None if self._trainarray is None else len(self._trainarray), + "queries": None if self._queryarray is None else len(self._queryarray), + "querynbrlmt": self._query_nbrlimit, + "queryruns": self._query_runs, + "querycurrun": self._query_current_run, + "dataset":self._datasetname, + "paused": pausestate, + "action": None if self._actions is None else self._actions.name, + "remainingRecs" : self._remainingrecs, + "remainingquerynbrs" : self._remainingquerynbrs, + "querymetric": None if self._query_metric is None else self._query_metric["type"], + "querymetricvalue": self._query_metric_value, + "querymetricaerospikevalue": self._aerospike_metric_value, + "querymetricbigvalue": self._query_metric_big_value, + "hnswparams": self.hnswstr(), + "queryef": queryef + }) def _prometheus_heartbeat(self) -> None: from time import sleep self._logger.debug(f"Heartbeating Start") i : int = 0 + self.__cnthb__ : int = 0 while self._prometheus_hb > 0: i += 1 - self.prometheus_status(i) + self.prometheus_status() sleep(self._prometheus_hb) self._logger.debug(f"Heartbeating Ended") @@ -436,7 +393,7 @@ def print_log(self, msg :str, logLevel :int = logging.INFO) -> None: async def shutdown(self, waitforcompletion:bool): if waitforcompletion and self._sleepexit > 0: - self.prometheus_status(0, True) + self.prometheus_status(True) self.print_log(f'existing sleeping {self._sleepexit}') 
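+        # flush the OpenTelemetry meter provider and metric reader so the final "Done"
+        # heartbeat reported by prometheus_status(True) above is exported before the
+        # process idles for the configured exit delay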
self._prometheus_meter_provider.force_flush() self._prometheus_metric_reader.force_flush() @@ -484,8 +441,10 @@ def basestring(self) -> str: if self._idx_namespace == self._namespace: fullName = f"{self._namespace}.{self._setName}.{self._idx_name}" - else: - fullName = f"{self._namespace}.{self._setName}; {self._idx_namespace}.{self._idx_name}" + elif self._namespace is None: + fullName = f"{self._idx_namespace}.{self._idx_name}" + else: + fullName = f"{self._namespace}.{self._setName}.{self._idx_namespace}.{self._idx_name}" if self._host is None: hosts = "NoHosts" @@ -497,7 +456,9 @@ def basestring(self) -> str: def __str__(self): if self._idx_namespace == self._namespace: fullName = f"{self._namespace}.{self._setName}.{self._idx_name}" - else: + elif self._namespace is None: + fullName = f"{self._idx_namespace}.{self._idx_name}" + else: fullName = f"{self._namespace}.{self._setName}.{self._idx_namespace}.{self._idx_name}" return f"{fullName}({self._datasetname})" \ No newline at end of file diff --git a/aerospike/bigann/metrics.py b/aerospike/bigann/metrics.py index 7126ae18..53511f98 100644 --- a/aerospike/bigann/metrics.py +++ b/aerospike/bigann/metrics.py @@ -50,15 +50,12 @@ def get_recall_values(true_nn, run_nn, count, count_ties=True): queries_with_ties) def knn(true_nn, run_nn, count, metrics): - if 'knn' not in metrics: - print('Computing knn metrics') - knn_metrics = metrics.create_group('knn') - mean, std, recalls, queries_with_ties = get_recall_values(true_nn, run_nn, count) - if queries_with_ties>0: - print("Warning: %d/%d queries contained ties accounted for in recall" % (queries_with_ties, len(run_nn))) - knn_metrics.attrs['mean'] = mean - knn_metrics.attrs['std'] = std - knn_metrics['recalls'] = recalls - else: - print("Found cached result") - return metrics['knn'] + knn_metrics = metrics.create_group('knn') + mean, std, recalls, queries_with_ties = get_recall_values(true_nn, run_nn, count) + if queries_with_ties>0: + print("knn Warning: %d/%d queries contained ties accounted for in recall" % (queries_with_ties, len(run_nn))) + knn_metrics.attrs['mean'] = mean + knn_metrics.attrs['std'] = std + knn_metrics['recalls'] = recalls + + return metrics['knn'] \ No newline at end of file diff --git a/aerospike/metrics.py b/aerospike/metrics.py index 9eb2170b..ca33b4d5 100644 --- a/aerospike/metrics.py +++ b/aerospike/metrics.py @@ -1,7 +1,7 @@ from __future__ import absolute_import import numpy as np - +from typing import Iterable, List, Union, Any def knn_threshold(data, count, epsilon): return data[count - 1] + epsilon @@ -22,56 +22,42 @@ def get_recall_values(dataset_distances, run_distances, count, threshold, epsilo recalls[i] = actual return (np.mean(recalls) / float(count), np.std(recalls) / float(count), recalls) - def knn(dataset_distances, run_distances, count, metrics, epsilon=1e-3): - if "knn" not in metrics: - knn_metrics = metrics.create_group("knn") - mean, std, recalls = get_recall_values(dataset_distances, run_distances, count, knn_threshold, epsilon) - knn_metrics.attrs["mean"] = mean - knn_metrics.attrs["std"] = std - knn_metrics["recalls"] = recalls - else: - print("Found cached result") + knn_metrics = metrics.create_group("knn") + mean, std, recalls = get_recall_values(dataset_distances, run_distances, count, knn_threshold, epsilon) + knn_metrics.attrs["mean"] = mean + knn_metrics.attrs["std"] = std + knn_metrics["recalls"] = recalls + return metrics["knn"] +def sklearn_recall(true_neighbors : np.ndarray, run_neighbors : Union[np.ndarray, List]) -> float: + 
from sklearn.metrics import recall_score + from statistics import mean + + recallsores = [] + + for pos, truenbr in enumerate(true_neighbors): + runnbr = run_neighbors[pos] + recallsores.append(recall_score(truenbr, runnbr, average='weighted', zero_division=1)) + + return mean(recallsores) + + def epsilon(dataset_distances, run_distances, count, metrics, epsilon=0.01): s = "eps" + str(epsilon) - if s not in metrics: - epsilon_metrics = metrics.create_group(s) - mean, std, recalls = get_recall_values(dataset_distances, run_distances, count, epsilon_threshold, epsilon) - epsilon_metrics.attrs["mean"] = mean - epsilon_metrics.attrs["std"] = std - epsilon_metrics["recalls"] = recalls - else: - print("Found cached result") + epsilon_metrics = metrics.create_group(s) + mean, std, recalls = get_recall_values(dataset_distances, run_distances, count, epsilon_threshold, epsilon) + epsilon_metrics.attrs["mean"] = mean + epsilon_metrics.attrs["std"] = std + epsilon_metrics["recalls"] = recalls + return metrics[s] - -def rel(dataset_distances, run_distances, metrics): - if "rel" not in metrics.attrs: - total_closest_distance = 0.0 - total_candidate_distance = 0.0 - for true_distances, found_distances in zip(dataset_distances, run_distances): - total_closest_distance += np.sum(true_distances) - total_candidate_distance += np.sum(found_distances) - if total_closest_distance < 0.01: - metrics.attrs["rel"] = float("inf") - else: - metrics.attrs["rel"] = total_candidate_distance / total_closest_distance - else: - print("Found cached result") - return metrics.attrs["rel"] - - -def queries_per_second(queries, attrs): - return 1.0 / attrs["best_search_time"] - - def percentile_50(times): return np.percentile(times, 50.0) * 1000.0 - def percentile_95(times): return np.percentile(times, 95.0) * 1000.0 @@ -83,22 +69,18 @@ def percentile_99(times): def percentile_999(times): return np.percentile(times, 99.9) * 1000.0 - -def index_size(queries, attrs): - # TODO(erikbern): should replace this with peak memory usage or something - return attrs.get("index_size", 0) - - -def build_time(queries, attrs): - return attrs["build_time"] - - -def candidates(queries, attrs): - return attrs["candidates"] - - -def dist_computations(queries, attrs): - return attrs.get("dist_comps", 0) / (attrs["run_count"] * len(queries)) +def rel(dataset_distances, run_distances, metrics): + total_closest_distance = 0.0 + total_candidate_distance = 0.0 + for true_distances, found_distances in zip(dataset_distances, run_distances): + total_closest_distance += np.sum(true_distances) + total_candidate_distance += np.sum(found_distances) + if total_closest_distance < 0.01: + metrics.attrs["rel"] = float("inf") + else: + metrics.attrs["rel"] = total_candidate_distance / total_closest_distance + + return metrics.attrs["rel"] class DummyMetric: def __init__(self):
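The sklearn-based helper above treats each query's ground-truth and returned neighbor-id lists as label sequences and averages a weighted recall_score over the queries. A self-contained toy illustration of the same computation (made-up neighbor ids, not project data):

    from statistics import mean
    from sklearn.metrics import recall_score

    true_neighbors = [[1, 2, 3], [4, 5, 6]]   # ground-truth neighbor ids (illustrative)
    run_neighbors  = [[1, 2, 9], [4, 5, 6]]   # ids returned by a query run
    scores = [recall_score(t, r, average='weighted', zero_division=1)
              for t, r in zip(true_neighbors, run_neighbors)]
    print(mean(scores))   # ~0.83 for this toy input

With the "Found cached result" branches removed, knn, epsilon and rel now recompute and repopulate their metrics group on every call.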