Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add size to index-stats API response #492

Open
wants to merge 2 commits into
base: mainline
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/marqo/tensor_search/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,6 @@
# Set of extra characters kept alongside the other Lucene-related constants;
# in this view it holds a single whitespace character.
# NOTE(review): presumably these are characters that need special handling in
# Lucene query strings even though they are not in Lucene's official list of
# special characters -- confirm against the call sites.
NON_OFFICIAL_LUCENE_SPECIAL_CHARS = {
' '
}

# Scaling base for byte-size formatting: one kilobyte is 1024 bytes.
NUM_BYTES_IN_KB = 2 ** 10

# Unit suffixes for index-stats sizes, listed from smallest to largest.
SUPPORTED_SIZES_FOR_STATS = [
    "B",
    "KB",
    "MB",
    "GB",
    "TB",
    "PB",
    "EB",
    "ZB",
]
11 changes: 10 additions & 1 deletion src/marqo/tensor_search/tensor_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,8 +269,17 @@ def _autofill_index_settings(index_settings: dict):

def get_stats(config: Config, index_name: str):
    """Return the document count and human-readable storage size of an index.

    Args:
        config: configuration object handed to ``HttpRequests``.
        index_name: name of the index to report on.

    Returns:
        A dict with two keys: ``numberOfDocuments`` (the ``count`` field of
        the ``{index_name}/_count`` response) and ``size`` (the index's
        ``total.store.size_in_bytes`` from the ``{index_name}/_stats``
        response, formatted by ``utils.convert_bytes_to_human_readable_format``).

    Raises:
        errors.IndexNotFoundError: if the ``_stats`` response carries no
            store-size entry for ``index_name``.
    """
    doc_count = HttpRequests(config).post(path=F"{index_name}/_count")["count"]
    index_stats = HttpRequests(config).get(path=F"{index_name}/_stats")["indices"]
    try:
        size_in_bytes = index_stats[index_name]["total"]["store"]["size_in_bytes"]
    except (AttributeError, KeyError, TypeError) as err:
        # A missing key in nested dicts raises KeyError (TypeError when a
        # level is None); catching only AttributeError left this translation
        # unreachable and leaked a raw KeyError to the caller.
        raise errors.IndexNotFoundError(
            message="Tried to get a non-existent index: {}".format(index_name)
        ) from err

    formatted_size = utils.convert_bytes_to_human_readable_format(size_in_bytes)
    return {
        "numberOfDocuments": doc_count,
        "size": formatted_size
    }


Expand Down
7 changes: 7 additions & 0 deletions src/marqo/tensor_search/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import typing
import functools
import json
import math
from timeit import default_timer as timer
import torch
from marqo import errors
Expand Down Expand Up @@ -349,3 +350,9 @@ def is_tensor_field(field: str,
return field in tensor_fields
else:
return field not in non_tensor_fields


def convert_bytes_to_human_readable_format(size_in_bytes: int) -> str:
    """Format a byte count as a two-decimal string with a unit suffix.

    The value is scaled by successive powers of
    ``constants.NUM_BYTES_IN_KB`` and paired with the matching entry of
    ``constants.SUPPORTED_SIZES_FOR_STATS`` (for example ``16121`` becomes
    ``"15.74 KB"``).

    Args:
        size_in_bytes: non-negative size in bytes.

    Returns:
        A string such as ``"0.00 B"``, ``"1000.00 B"`` or ``"9.42 MB"``.
        Sizes beyond the largest supported unit are expressed in that
        largest unit instead of failing.

    Raises:
        ValueError: if ``size_in_bytes`` is negative.
    """
    if size_in_bytes < 0:
        raise ValueError(f"size_in_bytes must be non-negative, got {size_in_bytes}")

    units = constants.SUPPORTED_SIZES_FOR_STATS
    base = constants.NUM_BYTES_IN_KB

    # Step through the units with exact integer thresholds rather than a
    # floating-point logarithm: log(0) is undefined, values below 1 would give
    # a negative index, and float rounding can misplace exact powers of the
    # base. The loop also stops at the last unit so huge sizes stay in range.
    size_factor = 0
    threshold = base
    while size_in_bytes >= threshold and size_factor < len(units) - 1:
        size_factor += 1
        threshold *= base

    processed_size = size_in_bytes / (base ** size_factor)
    return f"{processed_size:.2f} {units[size_factor]}"
8 changes: 6 additions & 2 deletions tests/tensor_search/test_get_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ def test_get_stats_empty(self):
except IndexNotFoundError as s:
pass
tensor_search.create_vector_index(config=self.config, index_name=self.index_name_1)
assert tensor_search.get_stats(config=self.config, index_name=self.index_name_1)["numberOfDocuments"] == 0
index_stats = tensor_search.get_stats(config=self.config, index_name=self.index_name_1)
assert index_stats["numberOfDocuments"] == 0
assert len(index_stats["size"]) != 0

def test_get_stats_non_empty(self):
try:
Expand All @@ -35,4 +37,6 @@ def test_get_stats_non_empty(self):
auto_refresh=True, device="cpu"
)
)
assert tensor_search.get_stats(config=self.config, index_name=self.index_name_1)["numberOfDocuments"] == 3
index_stats = tensor_search.get_stats(config=self.config, index_name=self.index_name_1)
assert index_stats["numberOfDocuments"] == 3
assert len(index_stats["size"]) != 0
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I saw that the computed size of the documents uploaded in this test changed on each run, so I figured a length check would be a good assertion here. I also added unit tests for the bytes-to-human-readable-format function.

19 changes: 19 additions & 0 deletions tests/tensor_search/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,3 +398,22 @@ def test_is_tensor_field_providing_one_empty(self):
non_tensor_fields = []
with self.assertRaises(errors.InternalError):
utils.is_tensor_field('field1', tensor_fields=tensor_fields, non_tensor_fields=non_tensor_fields)

def test_convert_bytes_to_human_readable_format(self):
    """Check the formatted output for one sample size in each unit from B to PB."""
    # (raw byte count, expected formatted string)
    expected_by_size = (
        (1000, "1000.00 B"),
        (16121, "15.74 KB"),
        (9874321, "9.42 MB"),
        (10000000000, "9.31 GB"),
        (712893712304234, "648.37 TB"),
        (6212893712323224, "5.52 PB"),
    )
    for raw_bytes, expected in expected_by_size:
        assert utils.convert_bytes_to_human_readable_format(raw_bytes) == expected