https://github.com/boomanaiden154 updated https://github.com/llvm/llvm-project/pull/78880
>From 80c9507d7f49ddbc5f2554f597950f797355c255 Mon Sep 17 00:00:00 2001 From: Aiden Grossman <agrossman...@yahoo.com> Date: Sun, 21 Jan 2024 03:53:03 +0000 Subject: [PATCH 1/7] Add make_corpus script test --- .../tests/corpus/make_corpus_script.test | 22 +++++++++++++++++++ llvm/utils/mlgo-utils/tests/lit.cfg | 7 +++++- 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test diff --git a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test new file mode 100644 index 000000000000000..f4f97544bce47d3 --- /dev/null +++ b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test @@ -0,0 +1,22 @@ +## Testing that the make_corpus script works as expected when invoked. + +# RUN: rm -rf %t.dir && mkdir %t.dir +# RUN: touch %t.dir/test1.bc +# RUN: touch %t.dir/test2.bc +# RUN: rm -rf %t.out.dir && mkdir %t.out.dir + +# RUN: %python %scripts_dir/corpus/make_corpus.py --input_dir=%t.dir --output_dir=%t.out.dir --default_args="-test" + +# RUN: cat %t.out.dir/corpus_description.json | FileCheck %s + +## Check that we get the expected command in the global command override +# CHECK: "-test" +# CHECK: "has_thinlto": false +## Check that the modules are in the corpus description +# CHECK: "test1" +# CHECK: "test2" + +# RUN: ls %t.out.dir | FileCheck %s --check-prefix CHECK-DIR + +# CHECK-DIR: test1.bc +# CHECK-DIR: test2.bc diff --git a/llvm/utils/mlgo-utils/tests/lit.cfg b/llvm/utils/mlgo-utils/tests/lit.cfg index 055f0945942fc1c..9afced53f195c5f 100644 --- a/llvm/utils/mlgo-utils/tests/lit.cfg +++ b/llvm/utils/mlgo-utils/tests/lit.cfg @@ -1,3 +1,5 @@ +import os + import lit.formats from lit.llvm import llvm_config @@ -5,7 +7,7 @@ from lit.llvm import llvm_config config.name = "mlgo-utils" config.test_format = lit.formats.ShTest(execute_external=False) -config.suffixes = [".py"] +config.suffixes = [".py", ".test"] config.test_source_root = 
os.path.dirname(__file__) config.test_exec_root = config.obj_root @@ -13,3 +15,6 @@ config.test_exec_root = config.obj_root config.environment["PYTHONPATH"] = os.path.join(config.src_root, "utils", "mlgo-utils") llvm_config.use_default_substitutions() + +scripts_dir = os.path.join(config.src_root, "utils/mlgo-utils/mlgo") +config.substitutions.append(("%scripts_dir", scripts_dir)) >From d99f5d4cd2c7c6d9e70125e893dc2ae40c897d36 Mon Sep 17 00:00:00 2001 From: Aiden Grossman <agrossman...@yahoo.com> Date: Sun, 21 Jan 2024 04:58:42 +0000 Subject: [PATCH 2/7] Add another test --- llvm/utils/mlgo-utils/CMakeLists.txt | 2 +- .../combine_training_corpus_script.test | 29 +++++++++++++++++++ llvm/utils/mlgo-utils/tests/lit.cfg | 1 + 3 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test diff --git a/llvm/utils/mlgo-utils/CMakeLists.txt b/llvm/utils/mlgo-utils/CMakeLists.txt index 7b303c7639401ae..c263c92c632797e 100644 --- a/llvm/utils/mlgo-utils/CMakeLists.txt +++ b/llvm/utils/mlgo-utils/CMakeLists.txt @@ -5,7 +5,7 @@ configure_lit_site_cfg( add_lit_testsuite(check-mlgo-utils "Running mlgo-utils tests" ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS "FileCheck" "not" "count" + DEPENDS "FileCheck" "not" "count" "split-file" ) set_target_properties(check-mlgo-utils PROPERTIES FOLDER "Tests") diff --git a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test new file mode 100644 index 000000000000000..1aa182146a49ee4 --- /dev/null +++ b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test @@ -0,0 +1,29 @@ +## Testing that the combine_trainig_corpus script works as expected when +## invoked. 
+ +# RUN: rm -rf %t.dir && mkdir %t.dir +# RUN: split-file %s %t.dir +# RUN: %python %scripts_dir/corpus/combine_training_corpus.py --root_dir=%t.dir +# RUN: cat %t.dir/corpus_description.json | FileCheck %s + +## Check that we end up with the same properties as the original corpora +# CHECK: "has_thinlto": false + +## Check that the modules end up in the combined corpus. Order does not matter. +# CHECK-DAG: "subcorpus1/test1.o" +# CHECK-DAG: "subcorpus2/test2.o" + +#--- subcorpus1/corpus_description.json +{ + "has_thinlto": false, + "modules": [ + "test1.o" + ] +} +#--- subcorpus2/corpus_description.json +{ + "has_thinlto": false, + "modules": [ + "test2.o" + ] +} diff --git a/llvm/utils/mlgo-utils/tests/lit.cfg b/llvm/utils/mlgo-utils/tests/lit.cfg index 9afced53f195c5f..58c35e69c652c58 100644 --- a/llvm/utils/mlgo-utils/tests/lit.cfg +++ b/llvm/utils/mlgo-utils/tests/lit.cfg @@ -15,6 +15,7 @@ config.test_exec_root = config.obj_root config.environment["PYTHONPATH"] = os.path.join(config.src_root, "utils", "mlgo-utils") llvm_config.use_default_substitutions() +config.substitutions.append(("split-file", llvm_config.use_llvm_tool("split-file"))) scripts_dir = os.path.join(config.src_root, "utils/mlgo-utils/mlgo") config.substitutions.append(("%scripts_dir", scripts_dir)) >From 0f2d0cd83efb07fdaee048b49f2562f4372c944d Mon Sep 17 00:00:00 2001 From: Aiden Grossman <agrossman...@yahoo.com> Date: Sun, 21 Jan 2024 05:16:05 +0000 Subject: [PATCH 3/7] Add extract_ir test --- llvm/utils/mlgo-utils/CMakeLists.txt | 2 +- .../tests/corpus/extract_ir_script.test | 44 +++++++++++++++++++ llvm/utils/mlgo-utils/tests/lit.cfg | 2 + 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test diff --git a/llvm/utils/mlgo-utils/CMakeLists.txt b/llvm/utils/mlgo-utils/CMakeLists.txt index c263c92c632797e..3129331d58c75bb 100644 --- a/llvm/utils/mlgo-utils/CMakeLists.txt +++ b/llvm/utils/mlgo-utils/CMakeLists.txt @@ -5,7 
+5,7 @@ configure_lit_site_cfg( add_lit_testsuite(check-mlgo-utils "Running mlgo-utils tests" ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS "FileCheck" "not" "count" "split-file" + DEPENDS "FileCheck" "not" "count" "split-file" "yaml2obj" "llvm-objcopy" ) set_target_properties(check-mlgo-utils PROPERTIES FOLDER "Tests") diff --git a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test new file mode 100644 index 000000000000000..a7629eb629219d7 --- /dev/null +++ b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test @@ -0,0 +1,44 @@ +## Test that invoking the extract_ir script work as expected. + +# RUN: rm -rf %t.dir && mkdir %t.dir +# RUN: yaml2obj %s -o %t.dir/test1.o +# RUN: yaml2obj %s -o %t.dir/test2.o +# RUN: rm -rf %t.dir.out && mkdir %t.dir.out + +# RUN: %python %scripts_dir/corpus/extract_ir.py --input=%t.dir --input_type=directory --output_dir=%t.dir.out --llvm_objcopy_path=llvm-objcopy +# RUN: cat %t.dir.out/corpus_description.json | FileCheck %s + +## Check that this is not a thinLTO build +# CHECK: "has_thinlto": false +## Check that the expected modules end up in the corpus description +# CHECK-DAG: "test1.o" +# CHECK-DAG: "test2.o" + +# RUN: ls %t.dir.out | FileCheck %s --check-prefix CHECK-DIR + +# CHECK-DIR: test1.o.bc +# CHECK-DIR: test1.o.cmd +# CHECK-DIR: test2.o.bc +# CHECK-DIR: test2.o.cmd + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_X86_64 + SectionHeaderStringTable: .strtab +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + AddressAlign: 0x10 + Content: 55 + - Name: .llvmbc + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 55 + - Name: .llvmcmd + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: ff diff --git a/llvm/utils/mlgo-utils/tests/lit.cfg b/llvm/utils/mlgo-utils/tests/lit.cfg index 58c35e69c652c58..0f6137e5e91383e 100644 --- a/llvm/utils/mlgo-utils/tests/lit.cfg +++ 
b/llvm/utils/mlgo-utils/tests/lit.cfg @@ -16,6 +16,8 @@ config.environment["PYTHONPATH"] = os.path.join(config.src_root, "utils", "mlgo- llvm_config.use_default_substitutions() config.substitutions.append(("split-file", llvm_config.use_llvm_tool("split-file"))) +config.substitutions.append(("yaml2obj", llvm_config.use_llvm_tool("yaml2obj"))) +config.substitutions.append(("llvm-objcopy", llvm_config.use_llvm_tool("llvm-objcopy"))) scripts_dir = os.path.join(config.src_root, "utils/mlgo-utils/mlgo") config.substitutions.append(("%scripts_dir", scripts_dir)) >From 01dd2821526a435524fbe2d4cad0fff4b880a8fd Mon Sep 17 00:00:00 2001 From: Aiden Grossman <agrossman...@yahoo.com> Date: Sun, 21 Jan 2024 06:08:28 +0000 Subject: [PATCH 4/7] [MLGO] Remove absl dependency from scripts This patch removes the absl dependency from the mlgo-utils scripts. We were only using absl.logging and absl.flags, so this patch just consists of mechanically converting the absl flags parsing to Python's built-in argparse, as Python's logging is a drop-in replacement for absl.logging. --- .../mlgo/corpus/combine_training_corpus.py | 27 +-- .../mlgo-utils/mlgo/corpus/extract_ir.py | 204 +++++++++--------- .../mlgo-utils/mlgo/corpus/make_corpus.py | 45 ++-- 3 files changed, 137 insertions(+), 139 deletions(-) diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py index 9aabd87b4688e00..cc21061cbbef5ea 100644 --- a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py +++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py @@ -23,26 +23,21 @@ and corpus2 are combined into combinedcorpus. 
""" -from absl import app -from absl import flags +import argparse from mlgo.corpus import combine_training_corpus_lib -flags.DEFINE_string("root_dir", "", "root dir of module paths to combine.") -FLAGS = flags.FLAGS - - -def main(argv): - if len(argv) > 1: - raise app.UsageError("Too many command-line arguments.") - - combine_training_corpus_lib.combine_corpus(FLAGS.root_dir) - - -def entrypoint(): - app.run(main) +def main(args): + combine_training_corpus_lib.combine_corpus(args.root_dir) if __name__ == "__main__": - entrypoint() + parser = argparse.ArgumentParser( + description="A tool for combining multiple training corpora" + ) + parser.add_argument( + "--root_dir", type=str, help="The root dir of module paths to combine." + ) + args = parser.parse_args() + main(args) diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py index 9463e61dc534fed..4426463e22b0e74 100644 --- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py +++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py @@ -26,127 +26,59 @@ import json import multiprocessing - -from absl import app -from absl import flags -from absl import logging +import logging +import argparse from mlgo.corpus import extract_ir_lib -flags.DEFINE_string( - "input", - None, - "Input file or directory - either compile_commands.json, a linker parameter" - "list, or a path to a directory containing object files.", -) -flags.DEFINE_enum( - "input_type", - "json", - ["json", "params", "directory"], - "Input file type - json, params, or directory. params latter refers to lld" - "params.", -) -flags.DEFINE_string("output_dir", None, "Output directory") -flags.DEFINE_integer( - "num_workers", - None, - "Number of parallel workers for objcopy. `None` for maximum available.", -) -flags.DEFINE_string("llvm_objcopy_path", "llvm-objcopy", "Path to llvm-objcopy") -flags.DEFINE_string( - "obj_base_dir", - "", - "Base directory for object files. 
Defaults to current working dir.", -) -flags.DEFINE_string( - "cmd_filter", - None, - "Include only those modules with a command line matching this regexp. " - "Setting it to None for not filtering. Note that the regexp is applied " - "independently for each separate command line option. For example, ^-Oz$ " - "will match Oz - built binaries. Does not work with thinlto_build=lld.", -) -flags.DEFINE_enum( - "thinlto_build", - None, - ["distributed", "local"], - "Set if the build was performed with either 'distributed' or " - "'local' ThinLTO. This ensures the thinlto.bc files are also copied. " - "The build is assumed to have had " - "-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed " - "case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files " - "passed in the local case.", -) -flags.DEFINE_string( - "cmd_section_name", - ".llvmcmd", - "The section name passed to llvm-objcopy. For ELF object files, the " - "default .llvmcmd is correct. For Mach-O object files, one should use " - "something like __LLVM,__cmdline", -) -flags.DEFINE_string( - "bitcode_section_name", - ".llvmbc", - "The section name passed to llvm-objcopy. For ELF object files, the " - "default .llvmbc is correct. 
For Mach-O object files, one should use " - "__LLVM,__bitcode", -) - -flags.mark_flag_as_required("output_dir") - -FLAGS = flags.FLAGS - - -def main(argv): - if len(argv) > 1: - raise app.UsageError("Too many command-line arguments.") +def main(args): objs = [] - if FLAGS.input is not None and FLAGS.thinlto_build == "local": + if args.input is not None and args.thinlto_build == "local": raise ValueError("--thinlto_build=local cannot be run with --input") - if FLAGS.input is None: - if FLAGS.thinlto_build != "local": + if args.input is None: + if args.thinlto_build != "local": raise ValueError("--input or --thinlto_build=local must be provided") - objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir, FLAGS.output_dir) - elif FLAGS.input_type == "json": - with open(FLAGS.input, encoding="utf-8") as f: + objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir) + elif args.input_type == "json": + with open(args.input, encoding="utf-8") as f: objs = extract_ir_lib.load_from_compile_commands( - json.load(f), FLAGS.output_dir + json.load(f), args.output_dir ) - elif FLAGS.input_type == "params": - if not FLAGS.obj_base_dir: + elif args.input_type == "params": + if not args.obj_base_dir: logging.info( "-obj_base_dir is unspecified, assuming current directory." "If no objects are found, use this option to specify the root" "directory for the object file paths in the input file." ) - with open(FLAGS.input, encoding="utf-8") as f: + with open(args.input, encoding="utf-8") as f: objs = extract_ir_lib.load_from_lld_params( - [l.strip() for l in f.readlines()], FLAGS.obj_base_dir, FLAGS.output_dir + [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir ) - elif FLAGS.input_type == "directory": + elif args.input_type == "directory": logging.warning( "Using the directory input is only recommended if the build system" "your project uses does not support any structured output that" "ml-compiler-opt understands. 
If your build system provides a" "structured compilation database, use that instead" ) - objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir) + objs = extract_ir_lib.load_from_directory(args.input, args.output_dir) else: - logging.error("Unknown input type: %s", FLAGS.input_type) + logging.error("Unknown input type: %s", args.input_type) relative_output_paths = extract_ir_lib.run_extraction( objs, - FLAGS.num_workers, - FLAGS.llvm_objcopy_path, - FLAGS.cmd_filter, - FLAGS.thinlto_build, - FLAGS.cmd_section_name, - FLAGS.bitcode_section_name, + args.num_workers, + args.llvm_objcopy_path, + args.cmd_filter, + args.thinlto_build, + args.cmd_section_name, + args.bitcode_section_name, ) extract_ir_lib.write_corpus_manifest( - FLAGS.thinlto_build, relative_output_paths, FLAGS.output_dir + args.thinlto_build, relative_output_paths, args.output_dir ) logging.info( @@ -156,10 +88,86 @@ def main(argv): ) -def entrypoint(): - multiprocessing.set_start_method("fork") - app.run(main) - - if __name__ == "__main__": - entrypoint() + parser = argparse.ArgumentParser( + description="A tool for making a corpus from build artifacts" + ) + parser.add_argument( + "--input", + type=str, + help="Input file or directory - either compile_commands.json, a linker " + "parameter list, or a path to a directory containing object files.", + ) + parser.add_argument( + "--input_type", + type=str, + help="Input file type - JSON, LLD params, or directory.", + choices=["json", "params", "directory"], + default="json", + nargs="?", + ) + parser.add_argument("--output_dir", type=str, help="Output directory") + parser.add_argument( + "--num_workers", + type=int, + help="Number of parallel works for objcopy. 
`None` for maximum available.", + default=None, + nargs="?", + ) + parser.add_argument( + "--llvm_objcopy_path", + type=str, + help="Path to llvm-objcopy", + default="llvm-objcopy", + nargs="?", + ) + parser.add_argument( + "--obj_base_dir", + type=str, + help="Base directory for object files. Defaults to current working dir.", + default="", + nargs="?", + ) + parser.add_argument( + "--cmd_filter", + type=str, + help="Include only those modules with a command line matching this regular " + "expression. Set it to None to not perform any filtering. Note that the " + "regular expression is applied independently for each separate command line " + "option. For example, ^-Oz$ will match Oz built binaries. This does not work " + "with thinlto_build=lld.", + default=None, + nargs="?", + ) + parser.add_argument( + "--thinlto_build", + type=str, + help="Set if the build was performed with either 'distributed' or 'local' " + "ThinLTO. This ensures the thinlto.bc files are also copied. The build is " + "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in " + "the distributed case or -Wl,--save-temps=import and " + "-Wl,--thinlto-emit-index-files passed in the local case", + choices=["distributed", "local"], + default=None, + nargs="?", + ) + parser.add_argument( + "--cmd_section_name", + type=str, + help="The section name passed to llvm-objcopy. For ELF object files, the " + "default .llvmcmd is correct. For Mach-O object files, one should use " + "something like __LLVM,__cmdline", + default=".llvmcmd", + nargs="?", + ) + parser.add_argument( + "--bitcode_section_name", + type=str, + help="The section name passed to llvm-objcopy. For ELF object files, the " + "default .llvmbc is correct. 
For Mach-O object files, one should use " + "__LLVM,__bitcode", + default=".llvmbc", + nargs="?", + ) + args = parser.parse_args() + main(args) diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py index edb0ecd853de246..05ceb750de673ef 100644 --- a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py +++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py @@ -12,43 +12,38 @@ --default_args="<list of space separated flags>" """ -from absl import app -from absl import flags -from absl import logging +import logging +import argparse from mlgo.corpus import make_corpus_lib -flags.DEFINE_string("input_dir", None, "The input directory.") -flags.DEFINE_string("output_dir", None, "The output directory.") -flags.DEFINE_string( - "default_args", - "", - "The compiler flags to compile with when using downstream tooling.", -) -flags.mark_flag_as_required("input_dir") -flags.mark_flag_as_required("output_dir") - -FLAGS = flags.FLAGS - - -def main(_): +def main(args): logging.warning( "Using this tool does not guarantee that the bitcode is taken at " "the correct stage for consumption during model training. Make " "sure to validate assumptions about where the bitcode is coming " "from before using it in production." 
) - relative_paths = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir) - make_corpus_lib.copy_bitcode(relative_paths, FLAGS.input_dir, FLAGS.output_dir) + relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir) + make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir) make_corpus_lib.write_corpus_manifest( - relative_paths, FLAGS.output_dir, FLAGS.default_args.split() + relative_paths, args.output_dir, args.default_args.split() ) -def entrypoint(): - app.run(main) - - if __name__ == "__main__": - entrypoint() + parser = argparse.ArgumentParser( + description="A tool for making a corpus from arbitrary bitcode" + ) + parser.add_argument("--input_dir", type=str, help="The input directory.") + parser.add_argument("--output_dir", type=str, help="The output directory.") + parser.add_argument( + "--default_args", + type=str, + help="The compiler flags to compile with when using downstream tooling.", + default="", + nargs="?", + ) + args = parser.parse_args() + main(args) >From 69e230ad1bae8060c9055157eced593b15b436ee Mon Sep 17 00:00:00 2001 From: Aiden Grossman <agrossman...@yahoo.com> Date: Sun, 21 Jan 2024 06:12:29 +0000 Subject: [PATCH 5/7] Add requires lines to tests --- .../mlgo-utils/tests/corpus/combine_training_corpus_script.test | 2 ++ llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test | 2 ++ llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test | 2 ++ 3 files changed, 6 insertions(+) diff --git a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test index 1aa182146a49ee4..933a9c2b9f811e2 100644 --- a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test +++ b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test @@ -1,3 +1,5 @@ +# REQUIRES: python-38, absl, system-linux + ## Testing that the combine_trainig_corpus script works as expected when ## invoked. 
diff --git a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test index a7629eb629219d7..c20581dacdc6516 100644 --- a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test +++ b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test @@ -1,3 +1,5 @@ +# REQUIRES: python-38, absl, system-linux + ## Test that invoking the extract_ir script work as expected. # RUN: rm -rf %t.dir && mkdir %t.dir diff --git a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test index f4f97544bce47d3..3c1b96523718e44 100644 --- a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test +++ b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test @@ -1,3 +1,5 @@ +# REQUIRES: python-38, absl, system-linux + ## Testing that the make_corpus script works as expected when invoked. # RUN: rm -rf %t.dir && mkdir %t.dir >From b919c42c768a3343056998a0643c3a02896b1202 Mon Sep 17 00:00:00 2001 From: Aiden Grossman <agrossman...@yahoo.com> Date: Sun, 21 Jan 2024 06:14:14 +0000 Subject: [PATCH 6/7] Remove other references to absl dep --- llvm/utils/mlgo-utils/pyproject.toml | 3 --- .../tests/corpus/combine_training_corpus_script.test | 2 +- llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test | 2 +- .../utils/mlgo-utils/tests/corpus/make_corpus_script.test | 2 +- llvm/utils/mlgo-utils/tests/lit.local.cfg | 8 -------- 5 files changed, 3 insertions(+), 14 deletions(-) diff --git a/llvm/utils/mlgo-utils/pyproject.toml b/llvm/utils/mlgo-utils/pyproject.toml index be2af86cd05df30..dac18a785c17b93 100644 --- a/llvm/utils/mlgo-utils/pyproject.toml +++ b/llvm/utils/mlgo-utils/pyproject.toml @@ -7,9 +7,6 @@ name = "mlgo" description = "Tooling for ML in LLVM" readme = "README.md" requires-python = ">=3.8,<3.11" -dependencies = [ - "absl-py>=1.0.0" -] dynamic = ["version"] license = {text = "Apache-2.0 WITH LLVM-exception"} classifiers = [ diff --git 
a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test index 933a9c2b9f811e2..51dc637347caf09 100644 --- a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test +++ b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test @@ -1,4 +1,4 @@ -# REQUIRES: python-38, absl, system-linux +# REQUIRES: python-38, system-linux ## Testing that the combine_trainig_corpus script works as expected when ## invoked. diff --git a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test index c20581dacdc6516..107116618ce97bb 100644 --- a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test +++ b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test @@ -1,4 +1,4 @@ -# REQUIRES: python-38, absl, system-linux +# REQUIRES: python-38, system-linux ## Test that invoking the extract_ir script work as expected. diff --git a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test index 3c1b96523718e44..a08780055f31f1e 100644 --- a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test +++ b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test @@ -1,4 +1,4 @@ -# REQUIRES: python-38, absl, system-linux +# REQUIRES: python-38, system-linux ## Testing that the make_corpus script works as expected when invoked. diff --git a/llvm/utils/mlgo-utils/tests/lit.local.cfg b/llvm/utils/mlgo-utils/tests/lit.local.cfg index 90cdf8ba618ed8f..a9088750cb58b1e 100644 --- a/llvm/utils/mlgo-utils/tests/lit.local.cfg +++ b/llvm/utils/mlgo-utils/tests/lit.local.cfg @@ -4,11 +4,3 @@ import sys # the entire project has been bumped to 3.8. if sys.version_info > (3,8): config.available_features.add("python-38") - -# TODO(boomanaiden154): Remove this flag once the scripts are converted to -# not use absl anymore. 
-try: - import absl - config.available_features.add("absl") -except: - pass >From 336ccb0d643ed627343d2e38d5dbdd659e8688bc Mon Sep 17 00:00:00 2001 From: Aiden Grossman <agrossman...@yahoo.com> Date: Sun, 21 Jan 2024 23:01:09 +0000 Subject: [PATCH 7/7] Move argument parsing and sort imports --- .../mlgo/corpus/combine_training_corpus.py | 14 +- .../mlgo-utils/mlgo/corpus/extract_ir.py | 122 +++++++++--------- .../mlgo-utils/mlgo/corpus/make_corpus.py | 34 ++--- 3 files changed, 91 insertions(+), 79 deletions(-) diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py index cc21061cbbef5ea..3b2077b4c0e0e60 100644 --- a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py +++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py @@ -28,11 +28,7 @@ from mlgo.corpus import combine_training_corpus_lib -def main(args): - combine_training_corpus_lib.combine_corpus(args.root_dir) - - -if __name__ == "__main__": +def parse_args_and_run(): parser = argparse.ArgumentParser( description="A tool for combining multiple training corpora" ) @@ -41,3 +37,11 @@ def main(args): ) args = parser.parse_args() main(args) + + +def main(args): + combine_training_corpus_lib.combine_corpus(args.root_dir) + + +if __name__ == "__main__": + parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py index 4426463e22b0e74..94415431ab4a388 100644 --- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py +++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py @@ -24,71 +24,15 @@ any output. 
""" +import argparse import json -import multiprocessing import logging -import argparse +import multiprocessing from mlgo.corpus import extract_ir_lib -def main(args): - objs = [] - if args.input is not None and args.thinlto_build == "local": - raise ValueError("--thinlto_build=local cannot be run with --input") - if args.input is None: - if args.thinlto_build != "local": - raise ValueError("--input or --thinlto_build=local must be provided") - objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir) - elif args.input_type == "json": - with open(args.input, encoding="utf-8") as f: - objs = extract_ir_lib.load_from_compile_commands( - json.load(f), args.output_dir - ) - elif args.input_type == "params": - if not args.obj_base_dir: - logging.info( - "-obj_base_dir is unspecified, assuming current directory." - "If no objects are found, use this option to specify the root" - "directory for the object file paths in the input file." - ) - with open(args.input, encoding="utf-8") as f: - objs = extract_ir_lib.load_from_lld_params( - [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir - ) - elif args.input_type == "directory": - logging.warning( - "Using the directory input is only recommended if the build system" - "your project uses does not support any structured output that" - "ml-compiler-opt understands. 
If your build system provides a" - "structured compilation database, use that instead" - ) - objs = extract_ir_lib.load_from_directory(args.input, args.output_dir) - else: - logging.error("Unknown input type: %s", args.input_type) - - relative_output_paths = extract_ir_lib.run_extraction( - objs, - args.num_workers, - args.llvm_objcopy_path, - args.cmd_filter, - args.thinlto_build, - args.cmd_section_name, - args.bitcode_section_name, - ) - - extract_ir_lib.write_corpus_manifest( - args.thinlto_build, relative_output_paths, args.output_dir - ) - - logging.info( - "Converted %d files out of %d", - len(objs) - relative_output_paths.count(None), - len(objs), - ) - - -if __name__ == "__main__": +def parse_args_and_run(): parser = argparse.ArgumentParser( description="A tool for making a corpus from build artifacts" ) @@ -171,3 +115,63 @@ def main(args): ) args = parser.parse_args() main(args) + + +def main(args): + objs = [] + if args.input is not None and args.thinlto_build == "local": + raise ValueError("--thinlto_build=local cannot be run with --input") + if args.input is None: + if args.thinlto_build != "local": + raise ValueError("--input or --thinlto_build=local must be provided") + objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir) + elif args.input_type == "json": + with open(args.input, encoding="utf-8") as f: + objs = extract_ir_lib.load_from_compile_commands( + json.load(f), args.output_dir + ) + elif args.input_type == "params": + if not args.obj_base_dir: + logging.info( + "-obj_base_dir is unspecified, assuming current directory." + "If no objects are found, use this option to specify the root" + "directory for the object file paths in the input file." 
+ ) + with open(args.input, encoding="utf-8") as f: + objs = extract_ir_lib.load_from_lld_params( + [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir + ) + elif args.input_type == "directory": + logging.warning( + "Using the directory input is only recommended if the build system" + "your project uses does not support any structured output that" + "ml-compiler-opt understands. If your build system provides a" + "structured compilation database, use that instead" + ) + objs = extract_ir_lib.load_from_directory(args.input, args.output_dir) + else: + logging.error("Unknown input type: %s", args.input_type) + + relative_output_paths = extract_ir_lib.run_extraction( + objs, + args.num_workers, + args.llvm_objcopy_path, + args.cmd_filter, + args.thinlto_build, + args.cmd_section_name, + args.bitcode_section_name, + ) + + extract_ir_lib.write_corpus_manifest( + args.thinlto_build, relative_output_paths, args.output_dir + ) + + logging.info( + "Converted %d files out of %d", + len(objs) - relative_output_paths.count(None), + len(objs), + ) + + +if __name__ == "__main__": + parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py index 05ceb750de673ef..221486e16c6e008 100644 --- a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py +++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py @@ -12,12 +12,29 @@ --default_args="<list of space separated flags>" """ -import logging import argparse +import logging from mlgo.corpus import make_corpus_lib +def parse_args_and_run(): + parser = argparse.ArgumentParser( + description="A tool for making a corpus from arbitrary bitcode" + ) + parser.add_argument("--input_dir", type=str, help="The input directory.") + parser.add_argument("--output_dir", type=str, help="The output directory.") + parser.add_argument( + "--default_args", + type=str, + help="The compiler flags to compile with when using downstream tooling.", + default="", + nargs="?", 
+ ) + args = parser.parse_args() + main(args) + + def main(args): logging.warning( "Using this tool does not guarantee that the bitcode is taken at " @@ -33,17 +50,4 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="A tool for making a corpus from arbitrary bitcode" - ) - parser.add_argument("--input_dir", type=str, help="The input directory.") - parser.add_argument("--output_dir", type=str, help="The output directory.") - parser.add_argument( - "--default_args", - type=str, - help="The compiler flags to compile with when using downstream tooling.", - default="", - nargs="?", - ) - args = parser.parse_args() - main(args) + parse_args_and_run() _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits