https://github.com/boomanaiden154 updated https://github.com/llvm/llvm-project/pull/72319
>From c3f723c8a975cc5e075d56350645b0be486f3cda Mon Sep 17 00:00:00 2001 From: Aiden Grossman <agrossman...@yahoo.com> Date: Tue, 14 Nov 2023 14:20:24 -0800 Subject: [PATCH 1/2] [MLGO] Upstream the corpus extraction tooling --- llvm/py/Pyproject.toml | 1 + llvm/py/src/mlgo/combine_training_corpus.py | 55 +++ .../src/mlgo/combine_training_corpus_lib.py | 50 +++ .../src/mlgo/combine_training_corpus_test.py | 104 +++++ llvm/py/src/mlgo/extract_ir.py | 142 +++++++ llvm/py/src/mlgo/extract_ir_lib.py | 373 ++++++++++++++++++ llvm/py/src/mlgo/extract_ir_test.py | 231 +++++++++++ llvm/py/src/mlgo/make_corpus.py | 58 +++ llvm/py/src/mlgo/make_corpus_lib.py | 90 +++++ llvm/py/src/mlgo/make_corpus_test.py | 66 ++++ 10 files changed, 1170 insertions(+) create mode 100644 llvm/py/Pyproject.toml create mode 100644 llvm/py/src/mlgo/combine_training_corpus.py create mode 100644 llvm/py/src/mlgo/combine_training_corpus_lib.py create mode 100644 llvm/py/src/mlgo/combine_training_corpus_test.py create mode 100644 llvm/py/src/mlgo/extract_ir.py create mode 100644 llvm/py/src/mlgo/extract_ir_lib.py create mode 100644 llvm/py/src/mlgo/extract_ir_test.py create mode 100644 llvm/py/src/mlgo/make_corpus.py create mode 100644 llvm/py/src/mlgo/make_corpus_lib.py create mode 100644 llvm/py/src/mlgo/make_corpus_test.py diff --git a/llvm/py/Pyproject.toml b/llvm/py/Pyproject.toml new file mode 100644 index 00000000000000..dcf2c804da5e19 --- /dev/null +++ b/llvm/py/Pyproject.toml @@ -0,0 +1 @@ +# Placeholder diff --git a/llvm/py/src/mlgo/combine_training_corpus.py b/llvm/py/src/mlgo/combine_training_corpus.py new file mode 100644 index 00000000000000..94ee1cbac9cea4 --- /dev/null +++ b/llvm/py/src/mlgo/combine_training_corpus.py @@ -0,0 +1,55 @@ +# coding=utf-8 +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +r"""Combine multiple training corpus into a single training corpus. + +Currently only support the case that multiple corpus share the same +configurables except the "modules" field. + +Usage: we'd like to combine training corpus corpus1 and corpus2 into +combinedcorpus; we first structure the files as follows: + +combinedcorpus +combinedcorpus/corpus1 +combinedcorpus/corpus2 + +Running this script with + +python3 \ +compiler_opt/tools/combine_training_corpus.py \ + --root_dir=$PATH_TO_combinedcorpus + +generates combinedcorpus/corpus_description.json file. In this way corpus1 +and corpus2 are combined into combinedcorpus. 
+""" + +from absl import app +from absl import flags + +from compiler_opt.tools import combine_training_corpus_lib + +flags.DEFINE_string('root_dir', '', 'root dir of module paths to combine.') + +FLAGS = flags.FLAGS + + +def main(argv): + if len(argv) > 1: + raise app.UsageError('Too many command-line arguments.') + + combine_training_corpus_lib.combine_corpus(FLAGS.root_dir) + + +if __name__ == '__main__': + app.run(main) diff --git a/llvm/py/src/mlgo/combine_training_corpus_lib.py b/llvm/py/src/mlgo/combine_training_corpus_lib.py new file mode 100644 index 00000000000000..0359961266a240 --- /dev/null +++ b/llvm/py/src/mlgo/combine_training_corpus_lib.py @@ -0,0 +1,50 @@ +# coding=utf-8 +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Library for combining training corpora.""" + +import os +import json + +from absl import logging + +import tensorflow as tf + +_FILE_NAME = 'corpus_description.json' + + +def combine_corpus(root_dir: str) -> None: + module_names = [] + output_corpus_description = {} + + corpus_description_glob = os.path.join(root_dir, '*/' + _FILE_NAME) + for corpus_description_path in tf.io.gfile.glob(corpus_description_glob): + logging.info('processing %s', corpus_description_path) + + with tf.io.gfile.GFile(corpus_description_path, 'r') as f: + corpus_description = json.load(f) + sub_dir = os.path.basename(os.path.dirname(corpus_description_path)) + module_names.extend([ + os.path.join(sub_dir, name) for name in corpus_description['modules'] + ]) + del corpus_description['modules'] + if len(output_corpus_description) == 0: + output_corpus_description = corpus_description + elif corpus_description != output_corpus_description: + raise ValueError('Input corpora differ by more than modules.') + + output_corpus_description['modules'] = module_names + + with tf.io.gfile.GFile(os.path.join(root_dir, _FILE_NAME), 'w') as f: + json.dump(output_corpus_description, f, indent=2) diff --git a/llvm/py/src/mlgo/combine_training_corpus_test.py b/llvm/py/src/mlgo/combine_training_corpus_test.py new file mode 100644 index 00000000000000..47dd602967b68f --- /dev/null +++ b/llvm/py/src/mlgo/combine_training_corpus_test.py @@ -0,0 +1,104 @@ +# coding=utf-8 +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tests for combining training corpora.""" + +import json +import os + +from absl.testing import absltest + +from compiler_opt.tools import combine_training_corpus_lib + + +class CombineTrainingCorpusTest(absltest.TestCase): + + def test_combine_corpus(self): + corpus_dir = self.create_tempdir() + subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1') + subcorpus2_dir = corpus_dir.mkdir(dir_path='subcorpus2') + subcorpus1_description = { + 'has_thinlto': False, + 'modules': ['test1.o', 'test2.o'] + } + subcorpus2_description = { + 'has_thinlto': False, + 'modules': ['test3.o', 'test4.o'] + } + subcorpus1_description_file = subcorpus1_dir.create_file( + file_path='corpus_description.json') + subcorpus2_description_file = subcorpus2_dir.create_file( + file_path='corpus_description.json') + subcorpus1_description_file.write_text(json.dumps(subcorpus1_description)) + subcorpus2_description_file.write_text(json.dumps(subcorpus2_description)) + combine_training_corpus_lib.combine_corpus(corpus_dir.full_path) + with open( + os.path.join(corpus_dir, 'corpus_description.json'), + encoding='utf-8') as combined_corpus_description_file: + combined_corpus_description = json.load(combined_corpus_description_file) + self.assertEqual(combined_corpus_description['has_thinlto'], False) + self.assertLen(combined_corpus_description['modules'], 4) + self.assertIn('subcorpus1/test1.o', combined_corpus_description['modules']) + self.assertIn('subcorpus1/test2.o', combined_corpus_description['modules']) + self.assertIn('subcorpus2/test3.o', combined_corpus_description['modules']) + self.assertIn('subcorpus2/test4.o', combined_corpus_description['modules']) + + def test_empty_folder(self): + corpus_dir = self.create_tempdir() + subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1') + _ = corpus_dir.mkdir(dir_path='empty_dir') + subcorpus1_description = {'modules': ['test1.o', 'test2.o']} + subcorpus1_description_file = subcorpus1_dir.create_file( + file_path='corpus_description.json') + subcorpus1_description_file.write_text(json.dumps(subcorpus1_description)) + combine_training_corpus_lib.combine_corpus(corpus_dir.full_path) + with open( + os.path.join(corpus_dir, 'corpus_description.json'), + encoding='utf-8') as combined_corpus_description_file: + combined_corpus_description = json.load(combined_corpus_description_file) + self.assertLen(combined_corpus_description['modules'], 2) + + def test_ignore_extra_file(self): + corpus_dir = self.create_tempdir() + subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1') + _ = corpus_dir.create_file(file_path='empty.log') + subcorpus1_description = {'modules': ['test1.o', 'test2.o']} + subcorpus1_description_file = subcorpus1_dir.create_file( + file_path='corpus_description.json') + subcorpus1_description_file.write_text(json.dumps(subcorpus1_description)) + combine_training_corpus_lib.combine_corpus(corpus_dir.full_path) + with open( + os.path.join(corpus_dir, 'corpus_description.json'), + encoding='utf-8') as combined_corpus_description_file: + combined_corpus_description = json.load(combined_corpus_description_file) + self.assertLen(combined_corpus_description['modules'], 2) + + def test_different_corpora(self): + corpus_dir = self.create_tempdir() + subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1') + subcorpus2_dir = corpus_dir.mkdir(dir_path='subcorpus2') + subcorpus1_description = {'has_thinlto': False, 'modules': ['test1.o']} + subcorpus2_description = {'has_thinlto': True, 'modules': ['test2.o']} + subcorpus1_description_file = 
subcorpus1_dir.create_file( + file_path='corpus_description.json') + subcorpus2_description_file = subcorpus2_dir.create_file( + file_path='corpus_description.json') + subcorpus1_description_file.write_text(json.dumps(subcorpus1_description)) + subcorpus2_description_file.write_text(json.dumps(subcorpus2_description)) + self.assertRaises(ValueError, combine_training_corpus_lib.combine_corpus, + corpus_dir.full_path) + + +if __name__ == '__main__': + absltest.main() diff --git a/llvm/py/src/mlgo/extract_ir.py b/llvm/py/src/mlgo/extract_ir.py new file mode 100644 index 00000000000000..2a1ef3978888d6 --- /dev/null +++ b/llvm/py/src/mlgo/extract_ir.py @@ -0,0 +1,142 @@ +# coding=utf-8 +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Extract IR for training. + +Extract IR for training, either from a compile_commands.json file produced by +cmake, or a linker parameter list file. + +Only run with +'python compiler_opt/tools/extract_ir.py ...' + +The compilation is assumed to have been performed with clang, using +-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all) + +In a distributed ThinLTO case, the compilation is assumed to have been performed +specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt. + +In a local ThinLTO case, the compilation is assumedto have been performed +specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files + +To change the logging verbosity, pass an integer representing the desired +verbosity to the --verbosity flag. Use 0 for all logs, status information, +and detailed debug information, -1 for solely warnings, and -2 to not produce +any output. +""" + +import json +import multiprocessing + +from absl import app +from absl import flags +from absl import logging + +from compiler_opt.tools import extract_ir_lib + +flags.DEFINE_string( + 'input', None, + 'Input file or directory - either compile_commands.json, a linker parameter' + 'list, or a path to a directory containing object files.') +flags.DEFINE_enum( + 'input_type', 'json', ['json', 'params', 'directory'], + 'Input file type - json, params, or directory. params latter refers to lld' + 'params.') +flags.DEFINE_string('output_dir', None, 'Output directory') +flags.DEFINE_integer( + 'num_workers', None, + 'Number of parallel workers for objcopy. `None` for maximum available.') +flags.DEFINE_string('llvm_objcopy_path', 'llvm-objcopy', 'Path to llvm-objcopy') +flags.DEFINE_string( + 'obj_base_dir', '', + 'Base directory for object files. Defaults to current working dir.') +flags.DEFINE_string( + 'cmd_filter', None, + 'Include only those modules with a command line matching this regexp. ' + 'Setting it to None for not filtering. Note that the regexp is applied ' + 'independently for each separate command line option. For example, ^-Oz$ ' + 'will match Oz - built binaries. 
Does not work with thinlto_build=lld.') +flags.DEFINE_enum( + 'thinlto_build', None, ['distributed', 'local'], + 'Set if the build was performed with either \'distributed\' or ' + '\'local\' ThinLTO. This ensures the thinlto.bc files are also copied. ' + 'The build is assumed to have had ' + '-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed ' + 'case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files ' + 'passed in the local case.') +flags.DEFINE_string( + 'cmd_section_name', '.llvmcmd', + 'The section name passed to llvm-objcopy. For ELF object files, the ' + 'default .llvmcmd is correct. For Mach-O object files, one should use ' + 'something like __LLVM,__cmdline') +flags.DEFINE_string( + 'bitcode_section_name', '.llvmbc', + 'The section name passed to llvm-objcopy. For ELF object files, the ' + 'default .llvmbc is correct. For Mach-O object files, one should use ' + '__LLVM,__bitcode') + +flags.mark_flag_as_required('output_dir') + +FLAGS = flags.FLAGS + + +def main(argv): + if len(argv) > 1: + raise app.UsageError('Too many command-line arguments.') + + objs = [] + if FLAGS.input is not None and FLAGS.thinlto_build == 'local': + raise ValueError('--thinlto_build=local cannot be run with --input') + if FLAGS.input is None: + if FLAGS.thinlto_build != 'local': + raise ValueError('--input or --thinlto_build=local must be provided') + objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir, + FLAGS.output_dir) + elif FLAGS.input_type == 'json': + with open(FLAGS.input, encoding='utf-8') as f: + objs = extract_ir_lib.load_from_compile_commands( + json.load(f), FLAGS.output_dir) + elif FLAGS.input_type == 'params': + if not FLAGS.obj_base_dir: + logging.info( + '-obj_base_dir is unspecified, assuming current directory.' + 'If no objects are found, use this option to specify the root' + 'directory for the object file paths in the input file.') + with open(FLAGS.input, encoding='utf-8') as f: + objs = extract_ir_lib.load_from_lld_params( + [l.strip() for l in f.readlines()], FLAGS.obj_base_dir, + FLAGS.output_dir) + elif FLAGS.input_type == 'directory': + logging.warning( + 'Using the directory input is only recommended if the build system' + 'your project uses does not support any structured output that' + 'ml-compiler-opt understands. If your build system provides a' + 'structured compilation database, use that instead') + objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir) + else: + logging.error('Unknown input type: %s', FLAGS.input_type) + + relative_output_paths = extract_ir_lib.run_extraction( + objs, FLAGS.num_workers, FLAGS.llvm_objcopy_path, FLAGS.cmd_filter, + FLAGS.thinlto_build, FLAGS.cmd_section_name, FLAGS.bitcode_section_name) + + extract_ir_lib.write_corpus_manifest(FLAGS.thinlto_build, + relative_output_paths, FLAGS.output_dir) + + logging.info('Converted %d files out of %d', + len(objs) - relative_output_paths.count(None), len(objs)) + + +if __name__ == '__main__': + multiprocessing.set_start_method('fork') + app.run(main) diff --git a/llvm/py/src/mlgo/extract_ir_lib.py b/llvm/py/src/mlgo/extract_ir_lib.py new file mode 100644 index 00000000000000..c1d2a54b9a9e7c --- /dev/null +++ b/llvm/py/src/mlgo/extract_ir_lib.py @@ -0,0 +1,373 @@ +# coding=utf-8 +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Library functions for IR extraction.""" + +import os +import pathlib +import re +import shutil +import subprocess +import multiprocessing +import functools +import json + +from typing import Dict, List, Optional + +from absl import logging + +from compiler_opt.rl import constant + + +# TODO(ml-compiler-opt): maybe we can also convert here the cmdline file,from a +# \0 - separated list of strings, to a \n one. +def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool: + """Determine if the module should be included.""" + if match_regexp is None: + return True + lines = cmdline.split('\0') + return any(len(re.findall(match_regexp, l)) for l in lines) + + +def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]: + opts = cmdline.split('\0') + for option in opts: + if option.startswith('-fthinlto-index'): + return os.path.join(basedir, option.split('=')[1]) + return None + + +class TrainingIRExtractor: + """IR and command line extraction from an object file.""" + + def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None): + """Set up a TrainingIRExtractor. + + Args: + obj_relative_path: relative path to the input object file. It will be also + used to construct the absolute path of the output IR and cmd files, by + appending it to output_base_dir. + output_base_dir: the directory under which the output will be produced. + obj_base_dir: the base directory for all the input object files. + """ + self._obj_relative_path = obj_relative_path + self._output_base_dir = output_base_dir + self._obj_base_dir = obj_base_dir if obj_base_dir is not None else '' + + def obj_base_dir(self): + return self._obj_base_dir + + def output_base_dir(self): + return self._output_base_dir + + def relative_output_path(self): + return self._obj_relative_path + + def input_obj(self): + return os.path.join(self.obj_base_dir(), self._obj_relative_path) + + def lld_src_bc(self): + # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport') + # IR bitcode saved by lld. It is hardcoded into lld. + return os.path.join(self._obj_base_dir, + self._obj_relative_path + '.3.import.bc') + + def lld_src_thinlto(self): + return os.path.join(self._obj_base_dir, + self._obj_relative_path + '.thinlto.bc') + + def dest_dir(self): + return os.path.join(self.output_base_dir(), + os.path.dirname(self._obj_relative_path)) + + def module_name(self): + return os.path.basename(self._obj_relative_path) + + def cmd_file(self): + return os.path.join(self.dest_dir(), self.module_name() + '.cmd') + + def bc_file(self): + return os.path.join(self.dest_dir(), self.module_name() + '.bc') + + def thinlto_index_file(self): + return os.path.join(self.dest_dir(), self.module_name() + '.thinlto.bc') + + def _get_extraction_cmd_command(self, llvm_objcopy_path: str, + cmd_section_name: str): + """Get llvm-objcopy and process args to a produce a command string that, + when invoked, will extract the cmd section info ths self.cmd_file() file. 
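As a small usage sketch of the helper functions above (the command line and paths are invented): the embedded command line is a \0-separated option list, which the regexp filter and the ThinLTO index lookup consume as follows.

from compiler_opt.tools import extract_ir_lib

cmdline = '-cc1\0-Oz\0-fthinlto-index=lib/bar.o.thinlto.bc\0-o\0lib/bar.o'

extract_ir_lib.should_include_module(cmdline, '^-Oz$')  # True
extract_ir_lib.should_include_module(cmdline, '^-O3$')  # False
extract_ir_lib.get_thinlto_index(cmdline, '/build')
# -> '/build/lib/bar.o.thinlto.bc'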
+ """ + return [ + llvm_objcopy_path, + '--dump-section=' + cmd_section_name + '=' + self.cmd_file(), + self.input_obj(), '/dev/null' + ] + + def _get_extraction_bc_command(self, llvm_objcopy_path: str, + bitcode_section_name: str): + """Gets llvm-objcopy and process args to produce a command string that, + when invoked, will extract the bitcode section into the self.bc_file() + file. + """ + return [ + llvm_objcopy_path, + '--dump-section=' + bitcode_section_name + '=' + self.bc_file(), + self.input_obj(), '/dev/null' + ] + + def _extract_clang_artifacts(self, llvm_objcopy_path: str, cmd_filter: str, + is_thinlto: bool, cmd_section_name: str, + bitcode_section_name: str) -> Optional[str]: + """Run llvm-objcopy to extract the .bc and command line.""" + if not os.path.exists(self.input_obj()): + logging.info('%s does not exist.', self.input_obj()) + return None + os.makedirs(self.dest_dir(), exist_ok=True) + try: + subprocess.check_output( + self._get_extraction_cmd_command(llvm_objcopy_path, cmd_section_name), + stderr=subprocess.STDOUT, + encoding='utf-8') + if cmd_filter is not None or is_thinlto: + with open(self.cmd_file(), encoding='utf-8') as f: + lines = f.readlines() + assert len(lines) == 1 + cmdline = lines[0] + if not should_include_module(cmdline, cmd_filter): + logging.info( + 'Excluding module %s because it does not match the filter', + self.input_obj()) + os.remove(self.cmd_file()) + return None + if is_thinlto: + index_file = get_thinlto_index(cmdline, self.obj_base_dir()) + shutil.copy(index_file, self.thinlto_index_file()) + + subprocess.check_output( + self._get_extraction_bc_command(llvm_objcopy_path, + bitcode_section_name), + stderr=subprocess.STDOUT, + encoding='utf-8') + except subprocess.CalledProcessError as e: + # This may happen if .o file was build from asm (.S source). + logging.warning('%s was not processed: %s', self.input_obj(), e) + logging.info(e.output) + return None + assert (os.path.exists(self.cmd_file()) and + os.path.exists(self.bc_file()) and + (not is_thinlto or os.path.exists(self.thinlto_index_file()))) + return self.relative_output_path() + + def _extract_lld_artifacts(self) -> Optional[str]: + """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation. 
+ """ + if not os.path.exists(self.lld_src_bc()): + logging.info('%s does not exist.', self.lld_src_bc()) + return None + if not os.path.exists(self.lld_src_thinlto()): + logging.info('%s does not exist.', self.lld_src_thinlto()) + return None + os.makedirs(self.dest_dir(), exist_ok=True) + + # Copy over the files + shutil.copy(self.lld_src_bc(), self.bc_file()) + shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file()) + + assert os.path.exists(self.bc_file()) + assert os.path.exists(self.thinlto_index_file()) + return self._obj_relative_path + + def extract(self, + llvm_objcopy_path: Optional[str] = None, + cmd_filter: Optional[str] = None, + thinlto_build: Optional[str] = None, + cmd_section_name: Optional[str] = '.llvmcmd', + bitcode_section_name: Optional[str] = '.llvmbc') -> Optional[str]: + if thinlto_build == 'local': + return self._extract_lld_artifacts() + return self._extract_clang_artifacts( + llvm_objcopy_path=llvm_objcopy_path, + cmd_filter=cmd_filter, + is_thinlto=thinlto_build == 'distributed', + cmd_section_name=cmd_section_name, + bitcode_section_name=bitcode_section_name) + + +def convert_compile_command_to_objectfile( + command: Dict[str, str], output_dir: str) -> Optional[TrainingIRExtractor]: + obj_base_dir = command['directory'] + if 'arguments' in command: + cmd_parts = command['arguments'] + elif 'command' in command: + cmd_parts = command['command'].split() + else: + logging.info('compile_commands element has no command and arguments') + return None + + try: + obj_index = cmd_parts.index('-o') + 1 + except ValueError: + # This could happen if there are non-clang commands in compile_commands.json + logging.info('Command has no -o option: %s', ' '.join(cmd_parts)) + return None + obj_rel_path = cmd_parts[obj_index] + # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files? + return TrainingIRExtractor( + obj_relative_path=obj_rel_path, + output_base_dir=output_dir, + obj_base_dir=obj_base_dir) + + +def load_from_compile_commands(json_array: List[Dict[str, str]], + output_dir: str) -> List[TrainingIRExtractor]: + objs = [ + convert_compile_command_to_objectfile(cmd, output_dir) + for cmd in json_array + ] + # Filter out None, in case there were non-clang commands in the .json + return [obj for obj in objs if obj is not None] + + +def load_from_lld_params(params_array: List[str], obj_base_dir: str, + output_dir: str) -> List[TrainingIRExtractor]: + """Create an ObjectFile array based on lld's parameters.""" + # yank out -o and the output. After that, anything not starting with '-', and + # ending in a '.o', is an object file. + try: + minus_o_idx = params_array.index('-o') + del params_array[minus_o_idx:minus_o_idx + 2] + just_obj_paths = [ + o for o in params_array if not o.startswith('-') and o.endswith('.o') + ] + except ValueError: + logging.info('This params file does not have an explicit -o option.') + just_obj_paths = params_array + + def make_obj(obj_file: str) -> TrainingIRExtractor: + return TrainingIRExtractor( + obj_relative_path=obj_file, + output_base_dir=output_dir, + obj_base_dir=obj_base_dir) + + return [make_obj(obj_file) for obj_file in just_obj_paths] + + +def load_from_directory(obj_base_dir: str, + output_dir: str) -> List[TrainingIRExtractor]: + """Create an object file array by globbing an entire drectory. + + Args: + obj_base_dir: The base build directory that all object files will be + written out as being relative to. + output_dir: The output directory where extracted .bc and .cmd files should + be placed. 
+ """ + paths = [str(p) for p in pathlib.Path(obj_base_dir).glob('**/*.o')] + + def make_spec(obj_file: str): + return TrainingIRExtractor( + obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir), + output_base_dir=output_dir, + obj_base_dir=obj_base_dir) + + return [make_spec(path) for path in paths] + + +def load_for_lld_thinlto(obj_base_dir: str, + output_dir: str) -> List[TrainingIRExtractor]: + # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport') + # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files + # are also emitted next to the postimport bitcode, with the suffix + # .thinlto.bc instead + paths = [str(p) for p in pathlib.Path(obj_base_dir).glob('**/*.3.import.bc')] + + def make_spec(obj_file: str): + return TrainingIRExtractor( + # Cut away .3.import.bc + obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir)[:-12], + output_base_dir=output_dir, + obj_base_dir=obj_base_dir) + + return [make_spec(path) for path in paths] + + +def run_extraction(objs: List[TrainingIRExtractor], num_workers: int, + llvm_objcopy_path: str, cmd_filter: str, thinlto_build: str, + cmd_section_name: str, bitcode_section_name: str): + """Extracts all specified object files into the corpus directory. + + Args: + objs: A list of TrainingIRExtractor Objects that represent the object files + to extract bitcode/commands from. + num_workers: The number of parallel processes to spawn to run the + extraction. + llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections. + cmd_filter: A regular expression that is used to select for compilations + performed with specific flags. If you want to include all compilations, + set this to None. + thinlto_build: Whether or not this is a ThinLTO build, and if so, the type. + Set this to None if the build was not done with ThinLTO. + cmd_section_name: The name of the command line section created by the + bitcode embedding. + bitcode_section_name: The name of the bitcode section created by the + bitcode embedding. + """ + extract_artifacts = functools.partial( + TrainingIRExtractor.extract, + llvm_objcopy_path=llvm_objcopy_path, + cmd_filter=cmd_filter, + thinlto_build=thinlto_build, + cmd_section_name=cmd_section_name, + bitcode_section_name=bitcode_section_name) + + with multiprocessing.Pool(num_workers) as pool: + relative_output_paths = pool.map(extract_artifacts, objs) + pool.close() + pool.join() + return relative_output_paths + + +def write_corpus_manifest(thinlto_build: str, relative_output_paths: List[str], + output_dir: str): + """Writes a corpus_manifest.json containing all necessary information about + the corpus. + + Args: + thinlto_build: Whether or not the build was done with ThinLTO and if so, + what kind of ThinLTO. Set this to none if the build was not performed with + ThinLTO. + relative_output_paths: The relative (to the corpus directory) output paths + of all the bitcode files that should be placed in the corpus manifest + output_dir: The corpus directory where the corpus manifest should be + placed. 
+ """ + # This comes first rather than later so global_command_override is at the top + # of the .json after being written + if thinlto_build == 'local': + corpus_description = { + 'global_command_override': constant.UNSPECIFIED_OVERRIDE + } + else: + corpus_description = {} + + corpus_description.update({ + 'has_thinlto': thinlto_build is not None, + 'modules': [path for path in relative_output_paths if path is not None] + }) + + with open( + os.path.join(output_dir, 'corpus_description.json'), + 'w', + encoding='utf-8') as f: + json.dump(corpus_description, f, indent=2) diff --git a/llvm/py/src/mlgo/extract_ir_test.py b/llvm/py/src/mlgo/extract_ir_test.py new file mode 100644 index 00000000000000..8811134aab4fce --- /dev/null +++ b/llvm/py/src/mlgo/extract_ir_test.py @@ -0,0 +1,231 @@ +# coding=utf-8 +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for compiler_opt.tools.extract_ir.""" + +# pylint: disable=protected-access +import os.path + +from absl.testing import absltest + +from compiler_opt.tools import extract_ir_lib + + +class ExtractIrTest(absltest.TestCase): + + def test_one_conversion(self): + obj = extract_ir_lib.convert_compile_command_to_objectfile( + { + 'directory': '/output/directory', + 'command': '-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o', + 'file': '/some/path/lib/foo/bar.cc' + }, '/corpus/destination/path') + self.assertIsNotNone(obj) + # pytype: disable=attribute-error + # Pytype complains about obj being None + self.assertEqual(obj.input_obj(), '/output/directory/lib/bar.o') + self.assertEqual(obj.relative_output_path(), 'lib/bar.o') + self.assertEqual(obj.cmd_file(), '/corpus/destination/path/lib/bar.o.cmd') + self.assertEqual(obj.bc_file(), '/corpus/destination/path/lib/bar.o.bc') + self.assertEqual(obj.thinlto_index_file(), + '/corpus/destination/path/lib/bar.o.thinlto.bc') + # pytype: enable=attribute-error + + def test_one_conversion_arguments_style(self): + obj = extract_ir_lib.convert_compile_command_to_objectfile( + { + 'directory': '/output/directory', + 'arguments': + ['-cc1', '-c', '/some/path/lib/foo/bar.cc', '-o', 'lib/bar.o'], + 'file': '/some/path/lib/foo/bar.cc' + }, '/corpus/destination/path') + self.assertIsNotNone(obj) + # pytype: disable=attribute-error + # Pytype complains about obj being None + self.assertEqual(obj.input_obj(), '/output/directory/lib/bar.o') + self.assertEqual(obj.relative_output_path(), 'lib/bar.o') + self.assertEqual(obj.cmd_file(), '/corpus/destination/path/lib/bar.o.cmd') + self.assertEqual(obj.bc_file(), '/corpus/destination/path/lib/bar.o.bc') + self.assertEqual(obj.thinlto_index_file(), + '/corpus/destination/path/lib/bar.o.thinlto.bc') + # pytype: enable=attribute-error + + def test_arr_conversion(self): + res = extract_ir_lib.load_from_compile_commands([{ + 'directory': '/output/directory', + 'command': '-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o', + 'file': '/some/path/lib/foo/bar.cc' + }, { + 'directory': '/output/directory', + 'command': '-cc1 -c /some/path/lib/foo/baz.cc 
-o lib/other/baz.o', + 'file': '/some/path/lib/foo/baz.cc' + }], '/corpus/destination/path') + res = list(res) + self.assertLen(res, 2) + self.assertEqual(res[0].input_obj(), '/output/directory/lib/bar.o') + self.assertEqual(res[0].relative_output_path(), 'lib/bar.o') + self.assertEqual(res[0].cmd_file(), + '/corpus/destination/path/lib/bar.o.cmd') + self.assertEqual(res[0].bc_file(), '/corpus/destination/path/lib/bar.o.bc') + self.assertEqual(res[0].thinlto_index_file(), + '/corpus/destination/path/lib/bar.o.thinlto.bc') + + self.assertEqual(res[1].input_obj(), '/output/directory/lib/other/baz.o') + self.assertEqual(res[1].relative_output_path(), 'lib/other/baz.o') + self.assertEqual(res[1].cmd_file(), + '/corpus/destination/path/lib/other/baz.o.cmd') + self.assertEqual(res[1].bc_file(), + '/corpus/destination/path/lib/other/baz.o.bc') + self.assertEqual(res[1].thinlto_index_file(), + '/corpus/destination/path/lib/other/baz.o.thinlto.bc') + + def test_command_extraction(self): + obj = extract_ir_lib.TrainingIRExtractor( + obj_relative_path='lib/obj_file.o', + output_base_dir='/where/corpus/goes', + obj_base_dir='/foo/bar') + self.assertEqual( + obj._get_extraction_cmd_command('/bin/llvm_objcopy_path', '.llvmcmd'), [ + '/bin/llvm_objcopy_path', + '--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd', + '/foo/bar/lib/obj_file.o', '/dev/null' + ]) + self.assertEqual( + obj._get_extraction_bc_command('/bin/llvm_objcopy_path', '.llvmbc'), [ + '/bin/llvm_objcopy_path', + '--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc', + '/foo/bar/lib/obj_file.o', '/dev/null' + ]) + + def test_command_extraction_no_basedir(self): + obj = extract_ir_lib.TrainingIRExtractor('lib/obj_file.o', + '/where/corpus/goes') + self.assertEqual( + obj._get_extraction_cmd_command('/bin/llvm_objcopy_path', '.llvmcmd'), [ + '/bin/llvm_objcopy_path', + '--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd', + 'lib/obj_file.o', '/dev/null' + ]) + self.assertEqual( + obj._get_extraction_bc_command('/bin/llvm_objcopy_path', '.llvmbc'), [ + '/bin/llvm_objcopy_path', + '--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc', + 'lib/obj_file.o', '/dev/null' + ]) + + def test_lld_params(self): + lld_opts = [ + '-o', 'output/dir/exe', 'lib/obj1.o', 'somelib.a', '-W,blah', + 'lib/dir/obj2.o' + ] + obj = extract_ir_lib.load_from_lld_params(lld_opts, '/some/path', + '/tmp/out') + self.assertLen(obj, 2) + self.assertEqual(obj[0].input_obj(), '/some/path/lib/obj1.o') + self.assertEqual(obj[0].relative_output_path(), 'lib/obj1.o') + self.assertEqual(obj[0].cmd_file(), '/tmp/out/lib/obj1.o.cmd') + self.assertEqual(obj[0].thinlto_index_file(), + '/tmp/out/lib/obj1.o.thinlto.bc') + self.assertEqual(obj[1].input_obj(), '/some/path/lib/dir/obj2.o') + + def test_load_from_directory(self): + tempdir = self.create_tempdir() + subdir = tempdir.mkdir(dir_path='subdir') + subdir.create_file(file_path='test1.o') + subdir.create_file(file_path='test2.o') + outdir = self.create_tempdir() + objs = extract_ir_lib.load_from_directory(tempdir.full_path, + outdir.full_path) + self.assertLen(objs, 2) + for index, obj in enumerate( + sorted(objs, key=lambda x: x._obj_relative_path)): + self.assertEqual(obj._obj_relative_path, f'subdir/test{index + 1:d}.o') + self.assertEqual(obj._obj_base_dir, tempdir.full_path) + self.assertEqual(obj._output_base_dir, outdir.full_path) + + def test_lld_thinlto_discovery(self): + tempdir = self.create_tempdir() + tempdir.create_file(file_path='1.3.import.bc') + 
tempdir.create_file(file_path='2.3.import.bc') + tempdir.create_file(file_path='3.3.import.bc') + tempdir.create_file(file_path='1.thinlto.bc') + tempdir.create_file(file_path='2.thinlto.bc') + tempdir.create_file(file_path='3.thinlto.bc') + outdir = self.create_tempdir() + obj = extract_ir_lib.load_for_lld_thinlto(tempdir.full_path, + outdir.full_path) + self.assertLen(obj, 3) + for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)): + self.assertEqual(o._obj_relative_path, f'{i + 1:d}') + self.assertEqual(o._obj_base_dir, tempdir.full_path) + self.assertEqual(o._output_base_dir, outdir.full_path) + + def test_lld_thinlto_discovery_nested(self): + outer = self.create_tempdir() + tempdir = outer.mkdir(dir_path='nest') + tempdir.create_file(file_path='1.3.import.bc') + tempdir.create_file(file_path='2.3.import.bc') + tempdir.create_file(file_path='3.3.import.bc') + tempdir.create_file(file_path='1.thinlto.bc') + tempdir.create_file(file_path='2.thinlto.bc') + tempdir.create_file(file_path='3.thinlto.bc') + outdir = self.create_tempdir() + obj = extract_ir_lib.load_for_lld_thinlto(outer.full_path, outdir.full_path) + self.assertLen(obj, 3) + for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)): + self.assertEqual(o._obj_relative_path, f'nest/{i + 1:d}') + self.assertEqual(o._obj_base_dir, outer.full_path) + self.assertEqual(o._output_base_dir, outdir.full_path) + + def test_lld_thinlto_extraction(self): + outer = self.create_tempdir() + tempdir = outer.mkdir(dir_path='nest') + tempdir.create_file(file_path='1.3.import.bc') + tempdir.create_file(file_path='2.3.import.bc') + tempdir.create_file(file_path='3.3.import.bc') + tempdir.create_file(file_path='1.thinlto.bc') + tempdir.create_file(file_path='2.thinlto.bc') + tempdir.create_file(file_path='3.thinlto.bc') + outdir = self.create_tempdir() + obj = extract_ir_lib.load_for_lld_thinlto(outer.full_path, outdir.full_path) + for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)): + mod_path = o.extract(thinlto_build='local') + self.assertEqual(mod_path, f'nest/{i + 1:d}') + self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/1.bc'))) + self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/2.bc'))) + self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/3.bc'))) + self.assertTrue( + os.path.exists(os.path.join(outdir.full_path, 'nest/1.thinlto.bc'))) + self.assertTrue( + os.path.exists(os.path.join(outdir.full_path, 'nest/2.thinlto.bc'))) + self.assertTrue( + os.path.exists(os.path.join(outdir.full_path, 'nest/3.thinlto.bc'))) + + def test_filtering(self): + cmdline = '-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/out.o' + self.assertTrue(extract_ir_lib.should_include_module(cmdline, None)) + self.assertTrue(extract_ir_lib.should_include_module(cmdline, '.*')) + self.assertTrue(extract_ir_lib.should_include_module(cmdline, '^-Oz$')) + self.assertFalse(extract_ir_lib.should_include_module(cmdline, '^-O3$')) + + def test_thinlto_index_extractor(self): + cmdline = ('-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/' + 'out.o\0-fthinlto-index=foo/bar.thinlto.bc') + self.assertEqual( + extract_ir_lib.get_thinlto_index(cmdline, '/the/base/dir'), + '/the/base/dir/foo/bar.thinlto.bc') + + +if __name__ == '__main__': + absltest.main() diff --git a/llvm/py/src/mlgo/make_corpus.py b/llvm/py/src/mlgo/make_corpus.py new file mode 100644 index 00000000000000..24493d894be723 --- /dev/null +++ b/llvm/py/src/mlgo/make_corpus.py @@ -0,0 +1,58 @@ +# coding=utf-8 +# Copyright 2020 
Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tool for making a corpus from arbitrary bitcode. + +To create a corpus from a set of bitcode files in an input directory, run +the following command: + +PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \ + --input_dir=<path to input directory> \ + --output_dir=<path to output directory> \ + --default_args="<list of space separated flags>" +""" + +from absl import app +from absl import flags +from absl import logging + +from compiler_opt.tools import make_corpus_lib + +flags.DEFINE_string('input_dir', None, 'The input directory.') +flags.DEFINE_string('output_dir', None, 'The output directory.') +flags.DEFINE_string( + 'default_args', '', + 'The compiler flags to compile with when using downstream tooling.') + +flags.mark_flag_as_required('input_dir') +flags.mark_flag_as_required('output_dir') + +FLAGS = flags.FLAGS + + +def main(_): + logging.warning( + 'Using this tool does not guarantee that the bitcode is taken at ' + 'the correct stage for consumption during model training. Make ' + 'sure to validate assumptions about where the bitcode is coming ' + 'from before using it in production.') + relative_paths = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir) + make_corpus_lib.copy_bitcode(relative_paths, FLAGS.input_dir, + FLAGS.output_dir) + make_corpus_lib.write_corpus_manifest(relative_paths, FLAGS.output_dir, + FLAGS.default_args.split()) + + +if __name__ == '__main__': + app.run(main) diff --git a/llvm/py/src/mlgo/make_corpus_lib.py b/llvm/py/src/mlgo/make_corpus_lib.py new file mode 100644 index 00000000000000..3598fc12a04d14 --- /dev/null +++ b/llvm/py/src/mlgo/make_corpus_lib.py @@ -0,0 +1,90 @@ +# coding=utf-8 +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Library functions for making a corpus from arbitrary bitcode.""" + +import pathlib +import os +import shutil +import json + +from typing import List, Optional + +BITCODE_EXTENSION = '.bc' + + +def load_bitcode_from_directory(bitcode_base_dir: str) -> List[str]: + """Finds bitcode files to extract from a given directory. + + Args: + bitcode_base_dir: The base directory where the bitcode to be copied + is from. + output_dir: The directory to place the bitcode in. + + Returns an array of paths representing the relative path to the bitcode + file from the base direcotry. 
+ """ + paths = [ + str(p)[:-len(BITCODE_EXTENSION)] + for p in pathlib.Path(bitcode_base_dir).glob('**/*' + BITCODE_EXTENSION) + ] + + return [ + os.path.relpath(full_path, start=bitcode_base_dir) for full_path in paths + ] + + +def copy_bitcode(relative_paths: List[str], bitcode_base_dir: str, + output_dir: str) -> None: + """Copies bitcode files from the base directory to the output directory. + + Args: + relative_paths: An array of relative paths to bitcode files that are copied + over to the output directory, preserving relative location. + bitcode_base_dir: The base directory where the bitcode is located. + output_dir: The output directory to place the bitcode in. + """ + for relative_path in relative_paths: + base_path = os.path.join(bitcode_base_dir, + relative_path + BITCODE_EXTENSION) + destination_path = os.path.join(output_dir, + relative_path + BITCODE_EXTENSION) + os.makedirs(os.path.dirname(destination_path), exist_ok=True) + shutil.copy(base_path, destination_path) + + +def write_corpus_manifest(relative_output_paths: List[str], + output_dir: str, + default_args: Optional[List[str]] = None) -> None: + """Creates a corpus manifest describing the bitcode that has been found. + + Args: + relative_output_paths: A list of paths to each bitcode file relative to the + output directory. + outout_dir: The output directory where the corpus is being created. + default_args: An array of compiler flags that should be used to compile + the bitcode when using further downstream tooling.""" + if default_args is None: + default_args = [] + corpus_description = { + 'global_command_override': default_args, + 'has_thinlto': False, + 'modules': [path for path in relative_output_paths if path is not None] + } + + with open( + os.path.join(output_dir, 'corpus_description.json'), + 'w', + encoding='utf-8') as description_file: + json.dump(corpus_description, description_file, indent=2) diff --git a/llvm/py/src/mlgo/make_corpus_test.py b/llvm/py/src/mlgo/make_corpus_test.py new file mode 100644 index 00000000000000..8ed598695d06ee --- /dev/null +++ b/llvm/py/src/mlgo/make_corpus_test.py @@ -0,0 +1,66 @@ +# coding=utf-8 +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Test for compiler_opt.tools.make_corpus_lib""" + +import json +import os + +from absl.testing import absltest + +from compiler_opt.tools import make_corpus_lib + + +class MakeCorpusTest(absltest.TestCase): + + def test_load_bitcode_from_directory(self): + outer = self.create_tempdir() + tempdir = outer.mkdir(dir_path='nested') + tempdir.create_file('test1.bc') + tempdir.create_file('test2.bc') + relative_paths = make_corpus_lib.load_bitcode_from_directory(outer) + relative_paths = sorted(relative_paths) + self.assertEqual(relative_paths[0], 'nested/test1') + self.assertEqual(relative_paths[1], 'nested/test2') + + def test_copy_bitcode(self): + build_dir = self.create_tempdir() + nested_dir = build_dir.mkdir(dir_path='nested') + nested_dir.create_file('test1.bc') + nested_dir.create_file('test2.bc') + relative_paths = ['nested/test1', 'nested/test2'] + corpus_dir = self.create_tempdir() + make_corpus_lib.copy_bitcode(relative_paths, build_dir, corpus_dir) + output_files = sorted(os.listdir(os.path.join(corpus_dir, './nested'))) + self.assertEqual(output_files[0], 'test1.bc') + self.assertEqual(output_files[1], 'test2.bc') + + def test_write_corpus_manifest(self): + relative_output_paths = ['test/test1', 'test/test2'] + output_dir = self.create_tempdir() + default_args = ['-O3', '-c'] + make_corpus_lib.write_corpus_manifest(relative_output_paths, output_dir, + default_args) + with open( + os.path.join(output_dir, 'corpus_description.json'), + encoding='utf-8') as corpus_description_file: + corpus_description = json.load(corpus_description_file) + self.assertEqual(corpus_description['global_command_override'], + default_args) + self.assertEqual(corpus_description['has_thinlto'], False) + self.assertEqual(corpus_description['modules'], relative_output_paths) + + +if __name__ == '__main__': + absltest.main() >From 3f8d1e7052734979806d94cccfde5a8a05f6dece Mon Sep 17 00:00:00 2001 From: Aiden Grossman <agrossman...@yahoo.com> Date: Sun, 14 Jan 2024 21:14:47 -0800 Subject: [PATCH 2/2] Add proper copyright headers --- llvm/py/src/mlgo/combine_training_corpus.py | 17 +++-------------- llvm/py/src/mlgo/combine_training_corpus_lib.py | 17 +++-------------- .../py/src/mlgo/combine_training_corpus_test.py | 17 +++-------------- llvm/py/src/mlgo/extract_ir.py | 17 +++-------------- llvm/py/src/mlgo/extract_ir_lib.py | 17 +++-------------- llvm/py/src/mlgo/extract_ir_test.py | 17 +++-------------- llvm/py/src/mlgo/make_corpus.py | 17 +++-------------- llvm/py/src/mlgo/make_corpus_lib.py | 17 +++-------------- llvm/py/src/mlgo/make_corpus_test.py | 17 +++-------------- 9 files changed, 27 insertions(+), 126 deletions(-) diff --git a/llvm/py/src/mlgo/combine_training_corpus.py b/llvm/py/src/mlgo/combine_training_corpus.py index 94ee1cbac9cea4..e62bcb61e9d9e1 100644 --- a/llvm/py/src/mlgo/combine_training_corpus.py +++ b/llvm/py/src/mlgo/combine_training_corpus.py @@ -1,17 +1,6 @@ -# coding=utf-8 -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception r"""Combine multiple training corpus into a single training corpus. Currently only support the case that multiple corpus share the same diff --git a/llvm/py/src/mlgo/combine_training_corpus_lib.py b/llvm/py/src/mlgo/combine_training_corpus_lib.py index 0359961266a240..1050e5099ae21c 100644 --- a/llvm/py/src/mlgo/combine_training_corpus_lib.py +++ b/llvm/py/src/mlgo/combine_training_corpus_lib.py @@ -1,17 +1,6 @@ -# coding=utf-8 -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception """Library for combining training corpora.""" import os diff --git a/llvm/py/src/mlgo/combine_training_corpus_test.py b/llvm/py/src/mlgo/combine_training_corpus_test.py index 47dd602967b68f..3c793947db139e 100644 --- a/llvm/py/src/mlgo/combine_training_corpus_test.py +++ b/llvm/py/src/mlgo/combine_training_corpus_test.py @@ -1,17 +1,6 @@ -# coding=utf-8 -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception """Tests for combining training corpora.""" import json diff --git a/llvm/py/src/mlgo/extract_ir.py b/llvm/py/src/mlgo/extract_ir.py index 2a1ef3978888d6..58e31a0475e124 100644 --- a/llvm/py/src/mlgo/extract_ir.py +++ b/llvm/py/src/mlgo/extract_ir.py @@ -1,17 +1,6 @@ -# coding=utf-8 -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception """Extract IR for training. Extract IR for training, either from a compile_commands.json file produced by diff --git a/llvm/py/src/mlgo/extract_ir_lib.py b/llvm/py/src/mlgo/extract_ir_lib.py index c1d2a54b9a9e7c..83d2b26d1f71ce 100644 --- a/llvm/py/src/mlgo/extract_ir_lib.py +++ b/llvm/py/src/mlgo/extract_ir_lib.py @@ -1,17 +1,6 @@ -# coding=utf-8 -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception """Library functions for IR extraction.""" import os diff --git a/llvm/py/src/mlgo/extract_ir_test.py b/llvm/py/src/mlgo/extract_ir_test.py index 8811134aab4fce..d7de50530032cc 100644 --- a/llvm/py/src/mlgo/extract_ir_test.py +++ b/llvm/py/src/mlgo/extract_ir_test.py @@ -1,17 +1,6 @@ -# coding=utf-8 -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception """Tests for compiler_opt.tools.extract_ir.""" # pylint: disable=protected-access diff --git a/llvm/py/src/mlgo/make_corpus.py b/llvm/py/src/mlgo/make_corpus.py index 24493d894be723..989d9790b5bcd9 100644 --- a/llvm/py/src/mlgo/make_corpus.py +++ b/llvm/py/src/mlgo/make_corpus.py @@ -1,17 +1,6 @@ -# coding=utf-8 -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception """Tool for making a corpus from arbitrary bitcode. 
To create a corpus from a set of bitcode files in an input directory, run diff --git a/llvm/py/src/mlgo/make_corpus_lib.py b/llvm/py/src/mlgo/make_corpus_lib.py index 3598fc12a04d14..97db20a9859e17 100644 --- a/llvm/py/src/mlgo/make_corpus_lib.py +++ b/llvm/py/src/mlgo/make_corpus_lib.py @@ -1,17 +1,6 @@ -# coding=utf-8 -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception """Library functions for making a corpus from arbitrary bitcode.""" import pathlib diff --git a/llvm/py/src/mlgo/make_corpus_test.py b/llvm/py/src/mlgo/make_corpus_test.py index 8ed598695d06ee..fcb861ebb91f32 100644 --- a/llvm/py/src/mlgo/make_corpus_test.py +++ b/llvm/py/src/mlgo/make_corpus_test.py @@ -1,17 +1,6 @@ -# coding=utf-8 -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception """Test for compiler_opt.tools.make_corpus_lib""" import json _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits