benwtrent commented on issue #12342: URL: https://github.com/apache/lucene/issues/12342#issuecomment-1656112434
OK, I ran on two more datasets; I only ran over the first 100k documents of each. I think this exhausts our testing for Cohere; we need to find additional data if this is not considered enough.

# EN

*(recall vs. latency plots: Reversed | Ordered | Random)*

# JA

*(recall vs. latency plots: Reversed | Ordered | Random)*

<details>
<summary>updating data creation (after download)</summary>

```python
import numpy as np
import pyarrow.parquet as pq

DATA_SETS = [
    {"name": "wiki768", "files": [
        "train-00000-of-00004-1a1932c9ca1c7152.parquet",
        "train-00001-of-00004-f4a4f5540ade14b4.parquet",
        "train-00002-of-00004-ff770df3ab420d14.parquet",
        "train-00003-of-00004-85b3dbbc960e92ec.parquet",
    ]},
    {"name": "wiki768en", "files": [
        "0-en.parquet",
        "1-en.parquet",
        "2-en.parquet",
        "3-en.parquet",
    ]},
    {"name": "wiki768ja", "files": [
        "0-ja.parquet",
        "1-ja.parquet",
        "2-ja.parquet",
        "3-ja.parquet",
    ]},
]


def transform_queries(Q):
    n, _ = Q.shape
    return np.concatenate([Q, np.zeros((n, 1))], axis=-1, dtype=np.float32)


def transform_docs(D, norms):
    n, d = D.shape
    max_norm = norms.max()
    flipped_norms = np.copy(norms).reshape(n, 1)
    transformed_data = np.concatenate(
        [D, np.sqrt(max_norm**2 - flipped_norms**2)], axis=-1, dtype=np.float32
    )
    return transformed_data


def validate_array_match_upto_dim(arr1, arr2, dim_eq_upto):
    assert np.allclose(arr1[:dim_eq_upto], arr2[:dim_eq_upto]), "data sets are different"


def validate_dataset_match_upto_dim(arr1, arr2, dim_eq_upto):
    n1, d1 = arr1.shape
    n2, d2 = arr2.shape
    assert n1 == n2, f"Shape does not map [{arr1.shape}] vs [{arr2.shape}]"
    for i in range(n1):
        validate_array_match_upto_dim(arr1[i], arr2[i], dim_eq_upto)


for ds in DATA_SETS:
    name = ds["name"]
    tb1 = pq.read_table(ds["files"][0], columns=['emb'])
    tb2 = pq.read_table(ds["files"][1], columns=['emb'])
    tb3 = pq.read_table(ds["files"][2], columns=['emb'])
    tb4 = pq.read_table(ds["files"][3], columns=['emb'])
    np1 = tb1[0].to_numpy()
    np2 = tb2[0].to_numpy()
    np3 = tb3[0].to_numpy()
    np4 = tb4[0].to_numpy()
    np_total = np.concatenate((np1, np2, np3, np4))

    # Have to convert to a list here to get
    # the numpy ndarray's shape correct later
    # There's probably a better way...
    flat_ds = list()
    for vec in np_total:
        flat_ds.append(vec)
    np_flat_ds = np.array(flat_ds)

    row_count = np_flat_ds.shape[0]
    query_count = 10_000
    training_rows = row_count - query_count
    print(f"{name} num rows: {training_rows}")

    # Held-out query vectors: written transformed (zero column appended) and untransformed.
    transformed_queries = transform_queries(np_flat_ds[training_rows:-1])
    validate_dataset_match_upto_dim(transformed_queries, np_flat_ds[training_rows:-1], 768)
    with open(f"{name}-transform.test", "w") as out_f:
        transformed_queries.tofile(out_f)
    with open(f"{name}.test", "w") as out_f:
        np_flat_ds[training_rows:-1].tofile(out_f)
        transformed_queries.tofile(out_f)

    # Document vectors: sort by norm so the ordered/reversed/random variants can be
    # written, both transformed and untransformed.
    magnitudes = np.linalg.norm(np_flat_ds[0:training_rows], axis=1)
    indices = np.argsort(magnitudes)
    transformed_np_flat_ds = transform_docs(np_flat_ds[0:training_rows], magnitudes)
    validate_dataset_match_upto_dim(transformed_np_flat_ds, np_flat_ds[0:training_rows], 768)
    transformed_np_flat_ds_sorted = transformed_np_flat_ds[indices]
    np_flat_ds_sorted = np_flat_ds[indices]

    with open(f"{name}.random-transform.train", "w") as out_f:
        transformed_np_flat_ds.tofile(out_f)
    with open(f"{name}.ordered-transform.train", "w") as out_f:
        transformed_np_flat_ds_sorted.tofile(out_f)
    with open(f"{name}.reversed-transform.train", "w") as out_f:
        np.flip(transformed_np_flat_ds_sorted, axis=0).tofile(out_f)
    with open(f"{name}.random.train", "w") as out_f:
        np.flip(np_flat_ds[0:training_rows], axis=0).tofile(out_f)
    with open(f"{name}.reversed.train", "w") as out_f:
        np.flip(np_flat_ds_sorted, axis=0).tofile(out_f)
    with open(f"{name}.ordered.train", "w") as out_f:
        np_flat_ds_sorted.tofile(out_f)
```

</details>
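For anyone reading along: the extra column is the usual reduction from maximum-inner-product search to Euclidean nearest-neighbor search. Queries get a `0` appended and documents get `sqrt(max_norm^2 - ||d||^2)` appended, so inner-product order on the original vectors matches distance order on the transformed vectors. A minimal self-contained check of that property on synthetic random vectors (not the Cohere data):

```python
import numpy as np

rng = np.random.default_rng(42)
docs = rng.normal(size=(1000, 8))
queries = rng.normal(size=(10, 8))

# Same transform as the script above, on toy data.
norms = np.linalg.norm(docs, axis=1)
max_norm = norms.max()
docs_t = np.concatenate([docs, np.sqrt(max_norm**2 - norms**2).reshape(-1, 1)], axis=-1)
queries_t = np.concatenate([queries, np.zeros((queries.shape[0], 1))], axis=-1)

for q, q_t in zip(queries, queries_t):
    by_ip = np.argsort(-(docs @ q))                           # max inner product, original vectors
    by_l2 = np.argsort(np.linalg.norm(docs_t - q_t, axis=1))  # min Euclidean distance, transformed vectors
    assert np.array_equal(by_ip, by_l2)
```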
<details>
<summary>Useful parsing & plotting functions</summary>

```python
import re

import matplotlib.pyplot as plt


def parse_console_output(terminal_output):
    # Regular expression patterns to extract recall and latency values
    recall_pattern = r"(?:\n\d+\.\d+)"
    latency_pattern = r"([\t, ]\d+\.\d+\t\d)"
    recall_values = [float(match.strip()) for match in re.findall(recall_pattern, terminal_output)]
    latency_values = [float(match.split()[0]) for match in re.findall(latency_pattern, terminal_output)]
    return (recall_values, latency_values)


def plot_things(name, baseline_recall, baseline_latency, transformed_recall, transform_latency):
    # Plotting series one: transformed_recall vs transform_latency
    plt.plot(transform_latency, transformed_recall, marker='o', label='transformed')
    # Plotting series two: baseline_recall vs baseline_latency
    plt.plot(baseline_latency, baseline_recall, marker='o', label='original (baseline)')

    # Add labels and title
    plt.xlabel('Latency')
    plt.ylabel('Recall')
    plt.title(f"{name} Transformed vs Baseline recall & latency")
    plt.legend()

    # Show the plot
    plt.grid(True)
    plt.show()
```

To use them:

```python
transformed_terminal_output = """
WARNING: Gnuplot module not present; will not make charts
recall  latency nDoc    fanout  maxConn beamWidth       visited index ms
WARNING: Using incubator modules: jdk.incubator.vector
Jul 28, 2023 12:01:58 PM org.apache.lucene.internal.vectorization.PanamaVectorizationProvider <init>
INFO: Java vector incubator API enabled; uses preferredBitSize=128
0.863    0.32   100000  0       48      200     10      0       1.00    post-filter
...
"""

baseline_terminal_output = """
WARNING: Gnuplot module not present; will not make charts
recall  latency nDoc    fanout  maxConn beamWidth       visited index ms
WARNING: Using incubator modules: jdk.incubator.vector
Jul 28, 2023 12:34:59 PM org.apache.lucene.internal.vectorization.PanamaVectorizationProvider <init>
INFO: Java vector incubator API enabled; uses preferredBitSize=128
0.816    0.23   100000  0       48      200     10      0       1.00    post-filter
...
"""

name = "WikiEN-Reversed"
transformed_recall, transform_latency = parse_console_output(transformed_terminal_output)
baseline_recall, baseline_latency = parse_console_output(baseline_terminal_output)
plot_things(name, baseline_recall, baseline_latency, transformed_recall, transform_latency)
```

</details>
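In case it helps to sanity-check the regexes: `parse_console_output` keys off tab-separated result rows, so a quick standalone example (made-up numbers, same column layout as the luceneutil output above) looks like this:

```python
# Hypothetical tab-separated rows in the layout parse_console_output expects.
sample = (
    "recall\tlatency\tnDoc\tfanout\tmaxConn\tbeamWidth\tvisited\tindex ms\n"
    "0.863\t0.32\t100000\t0\t48\t200\t10\t0\n"
    "0.901\t0.45\t100000\t10\t48\t200\t20\t0\n"
)
recall, latency = parse_console_output(sample)
print(recall)   # [0.863, 0.901]
print(latency)  # [0.32, 0.45]
```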
""" name = "WikiEN-Reversed" transformed_recall, transform_latency = parse_console_output(transformed_terminal_output) baseline_recall, baseline_latency = parse_console_output(baseline_terminal_output) plot_things(name, baseline_recall, baseline_latency, transformed_recall, transform_latency) ``` </details> <details> <summary>data downloading script</summary> ```sh #!/bin/sh # Japanese curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-ja-embeddings/parquet/Cohere--wikipedia-22-12-ja-embeddings/train/0.parquet -o 0-ja.parquet curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-ja-embeddings/parquet/Cohere--wikipedia-22-12-ja-embeddings/train/1.parquet -o 1-ja.parquet curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-ja-embeddings/parquet/Cohere--wikipedia-22-12-ja-embeddings/train/33.parquet -o 2-ja.parquet curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-ja-embeddings/parquet/Cohere--wikipedia-22-12-ja-embeddings/train/34.parquet -o 3-ja.parquet # English curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-en-embeddings/parquet/Cohere--wikipedia-22-12-en-embeddings/train/0.parquet -o 0-en.parquet curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-en-embeddings/parquet/Cohere--wikipedia-22-12-en-embeddings/train/1.parquet -o 1-en.parquet curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-en-embeddings/parquet/Cohere--wikipedia-22-12-en-embeddings/train/251.parquet -o 2-en.parquet curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-en-embeddings/parquet/Cohere--wikipedia-22-12-en-embeddings/train/252.parquet -o 3-en.parquet ``` </details> -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org