diff --git a/dataset_reader/ann_h5_reader.py b/dataset_reader/ann_h5_reader.py index 1bc984ac..950270c3 100644 --- a/dataset_reader/ann_h5_reader.py +++ b/dataset_reader/ann_h5_reader.py @@ -1,3 +1,4 @@ +import itertools from typing import Iterator import h5py @@ -14,9 +15,10 @@ def __init__(self, path, normalize=False): def read_queries(self) -> Iterator[Query]: data = h5py.File(self.path) + distances = data["distances"] if "distances" in data else itertools.repeat(None) for vector, expected_result, expected_scores in zip( - data["test"], data["neighbors"], data["distances"] + data["test"], data["neighbors"], distances ): if self.normalize: vector /= np.linalg.norm(vector) @@ -24,7 +26,7 @@ def read_queries(self) -> Iterator[Query]: vector=vector.tolist(), meta_conditions=None, expected_result=expected_result.tolist(), - expected_scores=expected_scores.tolist(), + expected_scores=expected_scores.tolist() if expected_scores is not None else None, ) def read_data(self, *args, **kwargs) -> Iterator[Record]: diff --git a/datasets/datasets.json b/datasets/datasets.json index 784cab9b..a18d466e 100644 --- a/datasets/datasets.json +++ b/datasets/datasets.json @@ -1219,5 +1219,21 @@ "type": "tar", "link": "https://storage.googleapis.com/ann-filtered-benchmark/datasets/random_keywords_1m_vocab_10_no_filters.tgz", "path": "random-100-match-kw-small-vocab/random_keywords_1m_vocab_10_no_filters" + }, + { + "name": "cohere-768-1M", + "vector_size": 768, + "distance": "dot", + "type": "h5", + "path": "cohere-768-1M/cohere-768-1M.hdf5", + "link": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/vectorsearch/cohere-wikipedia-22-12-en-embeddings/documents-1m.hdf5.bz2" + }, + { + "name": "cohere-768-10M", + "vector_size": 768, + "distance": "dot", + "type": "h5", + "path": "cohere-768-10M/cohere-768-10M.hdf5", + "link": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/vectorsearch/cohere-wikipedia-22-12-en-embeddings/documents-10m.hdf5.bz2" } ] diff --git a/experiments/configurations/cohere-calibration.json b/experiments/configurations/cohere-calibration.json new file mode 100644 index 00000000..8f3574a0 --- /dev/null +++ b/experiments/configurations/cohere-calibration.json @@ -0,0 +1,456 @@ +[ + { + "name": "cohere-cal-hnsw-float16", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "hnsw", + "data_type": "FLOAT16", + "hnsw_config": { + "M": 32, + "DISTANCE_METRIC": "IP", + "EF_CONSTRUCTION": 200 + } + }, + "search_params": [ + { + "parallel": 100, + "top": 100, + "calibration_param": "ef", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT16" + } + } + ], + "upload_params": { + "parallel": 100, + "algorithm": "hnsw", + "data_type": "FLOAT16" + } + }, + { + "name": "cohere-cal-hnsw-float32", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "hnsw", + "data_type": "FLOAT32", + "hnsw_config": { + "M": 32, + "DISTANCE_METRIC": "IP", + "EF_CONSTRUCTION": 200 + } + }, + "search_params": [ + { + "parallel": 100, + "top": 100, + "calibration_param": "ef", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT32" + } + } + ], + "upload_params": { + "parallel": 100, + "algorithm": "hnsw", + "data_type": "FLOAT32" + } + }, +{ + "name": "cohere-cal-svs-noquant-float16", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT16", + "svs-vamana_config": { + "NUM_THREADS": 16, + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200 + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "WS_SEARCH", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT16" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT16", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-noquant-float32", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT32", + "svs-vamana_config": { + "NUM_THREADS": 16, + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200 + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "WS_SEARCH", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT32" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT32", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-SQ8-float16", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT16", + "svs-vamana_config": { + "NUM_THREADS": 16, + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "GlobalSQ8" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "WS_SEARCH", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT16" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT16", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-SQ8-float32", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT32", + "svs-vamana_config": { + "NUM_THREADS": 16, + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "GlobalSQ8" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "WS_SEARCH", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT32" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT32", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-LVQ8-float16", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT16", + "svs-vamana_config": { + "NUM_THREADS": 16, + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "LVQ8" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "WS_SEARCH", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT16" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT16", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-LVQ8-float32", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT32", + "svs-vamana_config": { + "NUM_THREADS": 16, + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "LVQ8" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "WS_SEARCH", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT32" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT32", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-LVQ4X8-float16", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT16", + "svs-vamana_config": { + "NUM_THREADS": 16, + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "LVQ4X8" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "WS_SEARCH", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT16" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT16", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-LVQ4X8-float32", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT32", + "svs-vamana_config": { + "NUM_THREADS": 16, + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "LVQ4X8" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "WS_SEARCH", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT32" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT32", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-LVQ4X4-float32", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT32", + "svs-vamana_config": { + "NUM_THREADS": 16, + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "LVQ4X4" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "WS_SEARCH", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT32" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT32", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-LVQ4X4-float16", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT16", + "svs-vamana_config": { + "NUM_THREADS": 16, + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "LVQ4X4" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "WS_SEARCH", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT16" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT16", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-LeanVec4x8-float16", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT16", + "svs-vamana_config": { + "NUM_THREADS": 16, + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "LeanVec4x8" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "WS_SEARCH", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT16" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT16", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-LeanVec4x8-float32", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT32", + "svs-vamana_config": { + "NUM_THREADS": 16, + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "LeanVec4x8" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "WS_SEARCH", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT32" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT32", + "algorithm": "svs-vamana" + } + } +]