benwtrent commented on issue #12342: URL: https://github.com/apache/lucene/issues/12342#issuecomment-1668237698
Here are my results for the en-ja mixed dataset. Max inner product is used in the baseline, euclidean in the transformed runs. # EN-Ja rerun Reversed | Ordered | Random :-------------------------:|:-------------------------:|:-------------------------:  || <details> <summary>Updated knnPerfTest.py </summary> For my testing run, I do the following: - Run all files & tests with just `fanout:0` but with `-reindex` passed as an arg. This builds the indices. - Then I remove the `-reindex` param and run with all the fanout parameters. I found this to be the quickest way to test. ```python i#!/usr/bin/env/python import subprocess import benchUtil import constants LUCENE_CHECKOUT = 'lucene_candidate' # test parameters. This script will run KnnGraphTester on every combination of these parameters VALUES = { 'ndoc': (100000,), 'maxConn': (48, ), 'beamWidthIndex': (200,), 'fanout': (0, 10, 50, 90, 190, 490, 590, 990), 'topK': (10,), } def advance(ix, values): for i in reversed(range(len(ix))): param = list(values.keys())[i] #print("advance " + param) if ix[i] == len(values[param]) - 1: ix[i] = 0 else: ix[i] += 1 return True return False def run_knn_benchmark(checkout, values, training_file, testing_file, dims, metric): indexes = [0] * len(values.keys()) indexes[-1] = -1 args = [] print(f"\n\n\nNow running{training_file}\n\n\n") dim = dims #768 doc_vectors = training_file query_vectors = testing_file cp = benchUtil.classPathToString(benchUtil.getClassPath(checkout)) JAVA_EXE = '/Users/benjamintrent/Library/Java/JavaVirtualMachines/jdk-20.0.1.jdk/Contents/Home/bin/java' cmd = [JAVA_EXE, '-cp', cp, '--add-modules', 'jdk.incubator.vector', '-Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false', 'KnnGraphTester'] print("recall\tlatency\tnDoc\tfanout\tmaxConn\tbeamWidth\tvisited\tindex ms") while advance(indexes, values): pv = {} args = [] for (i, p) in enumerate(list(values.keys())): if p in values: if values[p]: value = values[p][indexes[i]] pv[p] = value else: args += ['-' + p] args += [a 
for (k, v) in pv.items() for a in ('-' + k, str(v)) if a] this_cmd = cmd + args + [ '-dim', str(dim), '-docs', doc_vectors, #'-reindex', '-metric', metric, '-search', query_vectors, '-forceMerge', '-quiet', ] subprocess.run(this_cmd) tests = [ ('%s/util/en_ja.random.train' % constants.BASE_DIR, '%s/util/en_ja.test' % constants.BASE_DIR, 768, "angular"), ('%s/util/en_ja.ordered.train' % constants.BASE_DIR, '%s/util/en_ja.test' % constants.BASE_DIR, 768, "angular"), ('%s/util/en_ja.reversed.train' % constants.BASE_DIR, '%s/util/en_ja.test' % constants.BASE_DIR, 768, "angular"), ('%s/util/en_ja.random-transform.train' % constants.BASE_DIR, '%s/util/en_ja-transform.test' % constants.BASE_DIR, 769, "euclidean"), ('%s/util/en_ja.ordered-transform.train' % constants.BASE_DIR, '%s/util/en_ja-transform.test' % constants.BASE_DIR, 769, "euclidean"), ('%s/util/en_ja.reversed-transform.train' % constants.BASE_DIR, '%s/util/en_ja-transform.test' % constants.BASE_DIR, 769, "euclidean"), ] for (training_file, testing_file, dims, metric) in tests: run_knn_benchmark(LUCENE_CHECKOUT, VALUES, training_file, testing_file, dims, metric) ``` </details> -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org