Skip to content

Commit

Permalink
Initial vector bulk hdf5 implementation (fix conflicts)
Browse files Browse the repository at this point in the history
Signed-off-by: Finn Roblin <[email protected]>
  • Loading branch information
finnroblin committed Sep 11, 2024
1 parent 5403700 commit d2cfd72
Show file tree
Hide file tree
Showing 7 changed files with 870 additions and 34 deletions.
15 changes: 14 additions & 1 deletion osbenchmark/utils/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from abc import ABC, ABCMeta, abstractmethod
from enum import Enum
from typing import cast

import h5py
import numpy as np

Expand Down Expand Up @@ -159,6 +158,20 @@ def parse_context(context: Context) -> str:

raise Exception("Unsupported context")

def context_string_to_context(context_string: str) -> Context:
    """Translate a context name into the matching ``Context`` member.

    Recognized names and the member each one yields:

    * ``"neighbors"`` -> ``Context.NEIGHBORS``
    * ``"train"`` -> ``Context.INDEX``
    * ``"test"`` -> ``Context.QUERY``
    * ``"max_distance_neighbors"`` -> ``Context.MAX_DISTANCE_NEIGHBORS``
    * ``"min_score_neighbors"`` -> ``Context.MIN_SCORE_NEIGHBORS``

    Args:
        context_string: The context name to translate.

    Returns:
        The ``Context`` member paired with ``context_string``.

    Raises:
        ValueError: If ``context_string`` equals none of the recognized names.
    """
    known_contexts = (
        ("neighbors", Context.NEIGHBORS),
        ("train", Context.INDEX),
        ("test", Context.QUERY),
        ("max_distance_neighbors", Context.MAX_DISTANCE_NEIGHBORS),
        ("min_score_neighbors", Context.MIN_SCORE_NEIGHBORS),
    )
    # Scan with ``==`` in declaration order instead of hashing into a dict, so
    # any input that matches no name (even an unhashable one) ends in ValueError.
    for label, member in known_contexts:
        if context_string == label:
            return member
    raise ValueError(f"Invalid context string: {context_string}")


class BigANNDataSet(DataSet):

Expand Down
1 change: 0 additions & 1 deletion osbenchmark/worker_coordinator/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,6 @@ async def __call__(self, opensearch, params):
if not detailed_results:
opensearch.return_raw_response()
request_context_holder.on_client_request_start()

if with_action_metadata:
api_kwargs.pop("index", None)
# only half of the lines are documents
Expand Down
15 changes: 15 additions & 0 deletions osbenchmark/workload/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1477,6 +1477,12 @@ def _create_corpora(self, corpora_specs, indices, data_streams):
default_value=workload.Documents.SOURCE_FORMAT_BULK)
default_action_and_meta_data = self._r(corpus_spec, "includes-action-and-meta-data", mandatory=False,
default_value=False)
default_generate_increasing_vector_ids = self._r(corpus_spec, "generate-increasing-vector-ids", mandatory=False,
default_value=False)
default_id_field_name = self._r(corpus_spec, "id-field-name", mandatory=False,
default_value=None)
default_vector_field_name = self._r(corpus_spec, "vector-field-name", mandatory=False,
default_value=None)
corpus_target_idx = None
corpus_target_ds = None
corpus_target_type = None
Expand Down Expand Up @@ -1518,6 +1524,12 @@ def _create_corpora(self, corpora_specs, indices, data_streams):

includes_action_and_meta_data = self._r(doc_spec, "includes-action-and-meta-data", mandatory=False,
default_value=default_action_and_meta_data)
generate_increasing_vector_ids = self._r(doc_spec, "generate-increasing-vector-ids", mandatory=False,
default_value=default_generate_increasing_vector_ids)
id_field_name = self._r(doc_spec, "id-field-name", mandatory=False,
default_value=default_id_field_name)
vector_field_name = self._r(doc_spec, "vector-field-name", mandatory=False,
default_value=default_vector_field_name)
if includes_action_and_meta_data:
target_idx = None
target_type = None
Expand Down Expand Up @@ -1558,6 +1570,9 @@ def _create_corpora(self, corpora_specs, indices, data_streams):
base_url=base_url,
source_url=source_url,
includes_action_and_meta_data=includes_action_and_meta_data,
generate_increasing_vector_ids=generate_increasing_vector_ids,
id_field_name = id_field_name,
vector_field_name = vector_field_name,
number_of_documents=num_docs,
compressed_size_in_bytes=compressed_bytes,
uncompressed_size_in_bytes=uncompressed_bytes,
Expand Down
Loading

0 comments on commit d2cfd72

Please sign in to comment.