benwtrent commented on issue #12342: URL: https://github.com/apache/lucene/issues/12342#issuecomment-1659123368
I found another dataset, Yandex Text-to-image: https://research.yandex.com/blog/benchmarks-for-billion-scale-similarity-search I tested against the first 500_000 values in the 1M dataset. It utilizes inner-product, looking at magnitudes, they are all < 1. So this dataset might not be that useful :/. The magnitudes range from 0.79 - 0.99. I am just looking for more realistic inner-product search data sets and found this one.

# Yandex Text-to-Image

Reversed | Ordered | Random
:-------------------------:|:-------------------------:|:-------------------------:
 ||

<details>
<summary>code for transforming yandex fbin</summary>

```python
import numpy as np


def read_fbin(filename, start_idx=0, chunk_size=None):
    """Read a *.fbin file that contains float32 vectors.

    Args:
        filename (str): path to the *.fbin file
        start_idx (int): start reading vectors from this index
        chunk_size (int): number of vectors to read; if None, read all vectors

    Returns:
        numpy.ndarray: float32 vectors with shape (nvecs, dim)
    """
    with open(filename, "rb") as f:
        # Header: two int32 values — total vector count and dimensionality.
        nvecs, dim = np.fromfile(f, count=2, dtype=np.int32)
        nvecs = (nvecs - start_idx) if chunk_size is None else chunk_size
        # offset is relative to the current position, i.e. just past the header.
        arr = np.fromfile(
            f, count=nvecs * dim, dtype=np.float32, offset=start_idx * 4 * dim
        )
    return arr.reshape(nvecs, dim)


def transform_queries(Q):
    """Append a zero component to every query vector (MIP -> cosine reduction)."""
    n, _ = Q.shape
    return np.concatenate([Q, np.zeros((n, 1))], axis=-1, dtype=np.float32)


def transform_docs(D, norms):
    """Append sqrt(max_norm^2 - ||d||^2) to every document vector.

    Args:
        D: (n, d) float32 document vectors
        norms: (n,) per-row Euclidean norms of D

    FIX: this previously read the module-level ``magnitudes`` instead of the
    ``norms`` parameter, so the function only worked because of the call-site's
    global; it now uses its own argument.
    """
    n, _ = D.shape
    max_norm = norms.max()
    flipped_norms = np.copy(norms).reshape(n, 1)
    return np.concatenate(
        [D, np.sqrt(max_norm**2 - flipped_norms**2)], axis=-1, dtype=np.float32
    )


def validate_array_match_upto_dim(arr1, arr2, dim_eq_upto):
    """Assert the first ``dim_eq_upto`` components of two vectors match."""
    assert np.allclose(arr1[:dim_eq_upto], arr2[:dim_eq_upto]), "data sets are different"


def validate_dataset_match_upto_dim(arr1, arr2, dim_eq_upto):
    """Assert two datasets agree on the first ``dim_eq_upto`` dimensions, row by row."""
    n1, d1 = arr1.shape
    n2, d2 = arr2.shape
    assert n1 == n2, f"Shape does not map [{arr1.shape}] vs [{arr2.shape}]"
    for i in range(n1):
        validate_array_match_upto_dim(arr1[i], arr2[i], dim_eq_upto)


name = "yandex"
np_flat_ds = read_fbin("base.1M.fbin")[0:500_000]
queries = read_fbin("query.public.100k.fbin")[0:10_000]

transformed_queries = transform_queries(queries)
validate_dataset_match_upto_dim(transformed_queries, queries, 200)
# FIX: ndarray.tofile requires a binary-mode file object — use "wb", not "w".
with open(f"{name}-transform.test", "wb") as out_f:
    transformed_queries.tofile(out_f)
with open(f"{name}.test", "wb") as out_f:
    queries.tofile(out_f)

magnitudes = np.linalg.norm(np_flat_ds, axis=1)
indices = np.argsort(magnitudes)
transformed_np_flat_ds = transform_docs(np_flat_ds, magnitudes)
validate_dataset_match_upto_dim(transformed_np_flat_ds, np_flat_ds, 200)
transformed_np_flat_ds_sorted = transformed_np_flat_ds[indices]
np_flat_ds_sorted = np_flat_ds[indices]

with open(f"{name}.random-transform.train", "wb") as out_f:
    transformed_np_flat_ds.tofile(out_f)
with open(f"{name}.ordered-transform.train", "wb") as out_f:
    transformed_np_flat_ds_sorted.tofile(out_f)
with open(f"{name}.reversed-transform.train", "wb") as out_f:
    np.flip(transformed_np_flat_ds_sorted, axis=0).tofile(out_f)
with open(f"{name}.random.train", "wb") as out_f:
    np_flat_ds.tofile(out_f)
with open(f"{name}.reversed.train", "wb") as out_f:
    np.flip(np_flat_ds_sorted, axis=0).tofile(out_f)
with open(f"{name}.ordered.train", "wb") as out_f:
    np_flat_ds_sorted.tofile(out_f)
```
</details>

-- 
This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org