Add options to support Profile Guided Optimization

solotzg · Jun 30, 2022 · b35b586 · b35b586
1 parent 5b61ae7
commit b35b586
Show file tree

Hide file tree

Showing 6 changed files with 285 additions and 3 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -134,6 +134,48 @@ if (COMPILER_CLANG)
     endif ()
 endif ()
 
+# Clang profile-guided optimization (PGO) switches.
+# `ENABLE_LLVM_PGO` consumes the profile named by env var `TIFLASH_LLVM_PROFDATA` (read at configure time):
+#   - by default it is passed as an instrumentation profile (`-fprofile-instr-use`) and is mandatory;
+#   - with `ENABLE_LLVM_PGO_USE_SAMPLE` it is passed as a sampling profile (`-fprofile-sample-use`) and is optional.
+option (ENABLE_LLVM_PROFILE_INSTR "Generate instrumented code to collect execution counts" OFF)
+option (ENABLE_LLVM_PGO "Enables flags for Profile Guided Optimization (PGO)" OFF)
+option (ENABLE_LLVM_PGO_USE_SAMPLE "Enables flags for Profile Guided Optimization (PGO) and use sampling profilers" OFF)
+# Derived value, not a user switch: reset on every configure and turned ON below only when a sample profile is applied.
+set (USE_LLVM_FDO OFF CACHE BOOL "" FORCE)
+
+# `ENABLE_LLVM_PGO_USE_SAMPLE` is only consulted inside the `ENABLE_LLVM_PGO` branch below;
+# tell the user instead of ignoring the option silently.
+if (ENABLE_LLVM_PGO_USE_SAMPLE AND NOT ENABLE_LLVM_PGO)
+    message (WARNING "`ENABLE_LLVM_PGO_USE_SAMPLE` has no effect unless `ENABLE_LLVM_PGO` is enabled")
+endif ()
+
+if (ENABLE_LLVM_PGO)
+    # Generating a profile and consuming one are mutually exclusive.
+    if (ENABLE_LLVM_PROFILE_INSTR)
+        message (FATAL_ERROR "`ENABLE_LLVM_PROFILE_INSTR` can not be used with `ENABLE_LLVM_PGO`")
+    endif ()
+    if (ENABLE_LLVM_PGO_USE_SAMPLE)
+
+        # Follow https://clang.llvm.org/docs/UsersManual.html#using-sampling-profilers
+        # Use https://github.com/google/autofdo
+
+        # These flags are added whether or not a profile is supplied.
+        set (_LLVM_PGO_USE_SAMPLE_FLAGS "-gline-tables-only -fdebug-info-for-profiling -funique-internal-linkage-names")
+
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${_LLVM_PGO_USE_SAMPLE_FLAGS}")
+        set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${_LLVM_PGO_USE_SAMPLE_FLAGS}")
+        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--no-rosegment")
+        message (STATUS "Add flags `${_LLVM_PGO_USE_SAMPLE_FLAGS}` for profiling")
+
+        # The sample profile is optional: without it the build is only prepared for profiling.
+        if (NOT "$ENV{TIFLASH_LLVM_PROFDATA}" STREQUAL "")
+            set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-sample-use=$ENV{TIFLASH_LLVM_PROFDATA}")
+            set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fprofile-sample-use=$ENV{TIFLASH_LLVM_PROFDATA}")
+            message (STATUS "Use sample profile data `$ENV{TIFLASH_LLVM_PROFDATA}` for profile-guided optimization")
+            set (USE_LLVM_FDO ON CACHE BOOL "" FORCE)
+        else ()
+            message (STATUS "NOT use sample profile data")
+        endif ()
+
+        unset (_LLVM_PGO_USE_SAMPLE_FLAGS)
+    else ()
+        # Instrumentation-based PGO cannot work without a profile, so a missing path is a hard error.
+        if ("$ENV{TIFLASH_LLVM_PROFDATA}" STREQUAL "")
+            message (FATAL_ERROR "Please set env var `TIFLASH_LLVM_PROFDATA`")
+        endif ()
+
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-use=$ENV{TIFLASH_LLVM_PROFDATA} -Wno-profile-instr-unprofiled")
+        set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fprofile-instr-use=$ENV{TIFLASH_LLVM_PROFDATA} -Wno-profile-instr-unprofiled")
+        message (STATUS "Use instrumentation data `$ENV{TIFLASH_LLVM_PROFDATA}` for profile-guided optimization")
+    endif ()
+endif ()
+
 if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
     # clang: warning: argument unused during compilation: '-stdlib=libc++'
     # clang: warning: argument unused during compilation: '-specs=/usr/share/dpkg/no-pie-compile.specs' [-Wunused-command-line-argument]
@@ -448,6 +490,16 @@ if (TEST_LLVM_COVERAGE AND CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG")
     set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fprofile-instr-generate -fcoverage-mapping -DTIFLASH_LLVM_COVERAGE=1")
 endif ()
 
+# `ENABLE_LLVM_PROFILE_INSTR` makes the built executable generate profile data automatically when it runs.
+# It is meant to take effect only for the dbms and libs modules.
+if (ENABLE_LLVM_PROFILE_INSTR AND ENABLE_LLVM_PGO)
+    message (FATAL_ERROR "`ENABLE_LLVM_PROFILE_INSTR` can not be used with `ENABLE_LLVM_PGO`")
+endif ()
+if (ENABLE_LLVM_PROFILE_INSTR)
+    message (STATUS "Using flag `-fprofile-instr-generate`. Generate instrumented code to collect execution counts into default.profraw file(overridden by '=' form of option or `LLVM_PROFILE_FILE` env var). Follow https://clang.llvm.org/docs/UsersManual.html#profile-guided-optimization.")
+    # Append the instrumentation flag to both the C and the C++ compile flags.
+    foreach (_profile_instr_lang CXX C)
+        set (CMAKE_${_profile_instr_lang}_FLAGS "${CMAKE_${_profile_instr_lang}_FLAGS} -fprofile-instr-generate")
+    endforeach ()
+endif ()
+
 if (ARCH_AMD64)
     include(CheckCXXCompilerFlag)
     check_cxx_compiler_flag("-mvpclmulqdq -Werror -Wall -Wextra" TIFLASH_COMPILER_VPCLMULQDQ_SUPPORT)

diff --git a/dbms/src/Common/TiFlashBuildInfo.cpp b/dbms/src/Common/TiFlashBuildInfo.cpp
@@ -101,6 +101,23 @@ std::string getEnabledFeatures()
 #if ENABLE_THINLTO
             "thinlto",
 #endif
+
+// Profile instrumentation
+#if ENABLE_LLVM_PROFILE_INSTR
+            "profile-instr",
+#endif
+
+// PGO
+#if ENABLE_LLVM_PGO_USE_SAMPLE
+            "pgo-sample",
+#elif ENABLE_LLVM_PGO
+            "pgo-instr",
+#endif
+
+// FDO
+#if USE_LLVM_FDO
+            "fdo",
+#endif
     };
     return fmt::format("{}", fmt::join(features.begin(), features.end(), " "));
 }

diff --git a/format-diff.py b/format-diff.py
@@ -96,9 +96,6 @@ def main():
             else:
                 print("Format check passed")
         else:
-            cmd = 'clang-format -i {}'.format(' '.join(files_to_format))
-            if subprocess.Popen(cmd, shell=True, cwd=tics_repo_path).wait():
-                exit(-1)
             print("Finish code format")
     else:
         print('No file to format')

diff --git a/libs/libcommon/include/common/config_common.h.in b/libs/libcommon/include/common/config_common.h.in
@@ -14,3 +14,7 @@
 #cmakedefine01 USE_UNWIND
 #cmakedefine01 USE_LLVM_LIBUNWIND
 #cmakedefine01 ENABLE_THINLTO
+#cmakedefine01 ENABLE_LLVM_PGO
+#cmakedefine01 ENABLE_LLVM_PROFILE_INSTR
+#cmakedefine01 ENABLE_LLVM_PGO_USE_SAMPLE
+#cmakedefine01 USE_LLVM_FDO
diff --git a/release-centos7-llvm/env/prepare-sysroot.sh b/release-centos7-llvm/env/prepare-sysroot.sh
@@ -37,6 +37,7 @@ function install_llvm() {
     mkdir -p llvm-project/build
     cd llvm-project/build
 
+    # TODO: enable `bolt` for >= 14.0.0. https://github.com/llvm/llvm-project/tree/main/bolt
     cmake -DCMAKE_BUILD_TYPE=Release \
         -GNinja \
         -DLLVM_ENABLE_PROJECTS="clang;lld;polly;clang-tools-extra" \

diff --git a/release-centos7-llvm/scripts/perf-tpch.py b/release-centos7-llvm/scripts/perf-tpch.py
@@ -0,0 +1,211 @@
+#!/usr/bin/python3
+# Copyright 2022 PingCAP, Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import signal
+import sys
+import time
+import logging
+import types
+import subprocess
+
+logger = None
+
+
+def get_tz_offset():
+    """Return the machine's current local UTC offset as a string such as `+08:00` or `-05:30`."""
+    import datetime
+    now_stamp = time.time()
+    local_time = datetime.datetime.fromtimestamp(now_stamp)
+    # `datetime.utcfromtimestamp` is deprecated since Python 3.12; build the same naive UTC value
+    # from a timezone-aware one instead.
+    utc_time = datetime.datetime.fromtimestamp(now_stamp, datetime.timezone.utc).replace(tzinfo=None)
+    # Both values derive from the same timestamp, so their difference is a whole number of seconds.
+    total_seconds = int((local_time - utc_time).total_seconds())
+    flag = '+'
+    if total_seconds < 0:
+        flag = '-'
+        total_seconds = -total_seconds
+    hh, mm = divmod(total_seconds // 60, 60)
+    return "%s%02d:%02d" % (flag, hh, mm)
+
+
+def init_logger():
+    """Configure the root logger and publish it as the module-level `logger`.
+
+    Every record's message is wrapped in an ANSI color chosen by its level, and records of level
+    DEBUG and above are written to stdout as `[<time>.<msecs> <tz offset>][<level>][<message>]`.
+    """
+    global logger
+
+    tz_offset = get_tz_offset()
+
+    orig_record_factory = logging.getLogRecordFactory()
+    log_colors = {
+        logging.DEBUG: "\033[1;34m",  # blue
+        logging.INFO: "\033[1;32m",  # green
+        logging.WARNING: "\033[1;35m",  # magenta
+        logging.ERROR: "\033[1;31m",  # red
+        logging.CRITICAL: "\033[1;41m",  # red background
+    }
+    color_reset = "\033[0m"
+
+    def get_message(ori):
+        # Bound onto each record in place of `getMessage`: apply %-formatting, then add color codes.
+        msg = str(ori.msg)
+        if ori.args:
+            msg = msg % ori.args
+        # A record logged at a non-standard level has no color entry; emit it uncolored
+        # instead of raising KeyError while the record is being formatted.
+        return "{}{}{}".format(log_colors.get(ori.levelno, ""), msg, color_reset)
+
+    def record_factory(*args, **kwargs):
+        record = orig_record_factory(*args, **kwargs)
+        record.getMessage = types.MethodType(get_message, record)
+        return record
+
+    logging.setLogRecordFactory(record_factory)
+
+    root = logging.getLogger()
+    root.setLevel(logging.DEBUG)
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setLevel(logging.DEBUG)
+    handler.setFormatter(
+        fmt=logging.Formatter('[%(asctime)s.%(msecs)03d {}][%(levelname)s][%(message)s]'.format(tz_offset),
+                              datefmt='%Y/%m/%d %H:%M:%S'))
+    root.addHandler(handler)
+    logger = root
+
+
+init_logger()
+
+
+def wrap_run_time(func):
+    """Decorator that logs, at debug level, the elapsed time of each call to `func` that returns normally."""
+    import functools
+
+    @functools.wraps(func)  # keep the wrapped function's name and docstring
+    def wrap_func(*args, **kwargs):
+        bg = time.time()
+        r = func(*args, **kwargs)
+        logger.debug('Time cost {:.3f}s'.format(time.time() - bg))
+        return r
+
+    return wrap_func
+
+
+@wrap_run_time
+def run_cmd(cmd):
+    """Run `cmd` (a list of arguments) to completion while capturing its output.
+
+    Returns the tuple `(stdout, stderr, returncode)`; stdout and stderr are bytes.
+    """
+    printable_cmd = ' '.join(cmd)
+    logger.debug("RUN CMD:\n{}\n".format(printable_cmd))
+    child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
+    captured_out, captured_err = child.communicate()
+    return captured_out, captured_err, child.returncode
+
+
+class Runner:
+    """Command-line driver that records `perf` data while a workload runs and/or converts perf data to LLVM profile data."""
+
+    def __init__(self):
+        """Parse the command line into `self.args`; the `usage` text describes the intended end-to-end flow."""
+        usage = """
+1. compile TiFlash with cmake option `-DENABLE_LLVM_PGO=ON -DENABLE_LLVM_PGO_USE_SAMPLE=ON`
+2. compile https://github.com/google/autofdo and get binary `create_llvm_prof` for converting perf data to llvm profile data
+3. start TiFlash process and get `<pid>`
+4. prepare workload scripts file
+5. run `python3 perf-tpch.py --perf --pid <pid> --workload <workload-scripts-path> --convert-llvm --convert-tool <create_llvm_prof-path> --binary <tiflash-binary-path>`
+6. get llvm perf file(`tiflash.llvm.code.prof` by default)
+7. compile TiFlash with env `TIFLASH_LLVM_PROFDATA=<output-llvm-prof>` and cmake option `-DENABLE_LLVM_PGO=ON -DENABLE_LLVM_PGO_USE_SAMPLE=ON`
+8. re-run workload and compare result
+"""
+        parser = argparse.ArgumentParser(
+            description="Auto FDO tools", formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+            usage=usage)
+        parser.add_argument(
+            '--perf', help='run perf with workload', action='store_true')
+        parser.add_argument(
+            '--convert-llvm', help='convert linux perf data to llvm profile data', action='store_true')
+
+        parser.add_argument(
+            '--workload', help='absolute path of workload script', required=False)
+        parser.add_argument(
+            '--pid', help='pid of TiFlash process', required=False)
+        parser.add_argument(
+            '--output', help='output file of perf data', required=False)
+        parser.add_argument(
+            '--convert-tool', help='tool to convert linux perf data to llvm profile data')
+        parser.add_argument(
+            '--input-perf-file', help='input linux perf data file path')
+        parser.add_argument(
+            '--binary', help='binary to run workload')
+        parser.add_argument(
+            '--output-llvm-prof', help='output llvm profile data path', default='tiflash.llvm.code.prof')
+        self.args = parser.parse_args()
+        # Path of the perf data recorded by `run_perf` during this invocation, if any.
+        self.linux_perf_data = None
+
+    def run(self):
+        """Run the requested stages: `--perf` first, then `--convert-llvm`."""
+        if self.args.perf:
+            self.run_perf()
+        if self.args.convert_llvm:
+            self.convert_llvm_perf()
+
+    def convert_llvm_perf(self):
+        """Convert linux perf data to LLVM profile data with the tool given by `--convert-tool`.
+
+        The input is the file recorded by `run_perf` in this invocation when there is one,
+        otherwise `--input-perf-file`. The result is written to `--output-llvm-prof`.
+        Raises AssertionError when a required argument is missing or the tool exits non-zero.
+        """
+        assert self.args.convert_tool
+        if self.linux_perf_data is None:
+            assert self.args.input_perf_file
+        else:
+            self.args.input_perf_file = self.linux_perf_data
+
+        # `--output-llvm-prof` already defaults to `tiflash.llvm.code.prof`; it must not be
+        # overwritten here, otherwise a path given by the user would be silently ignored.
+        assert self.args.binary
+        logger.info('start to convert linux perf data `{}` to llvm profile data `{}`'.format(
+            self.args.input_perf_file, self.args.output_llvm_prof))
+        stdout, stderr, e = run_cmd([self.args.convert_tool, '--profile', '{}'.format(self.args.input_perf_file),
+                                     '--binary', "{}".format(self.args.binary),
+                                     '--out', '{}'.format(self.args.output_llvm_prof)])
+        logger.info(
+            'finish convert. stdout `{}`, stderr `{}`'.format(stdout.decode('utf-8'), stderr.decode('utf-8')))
+        assert e == 0
+
+    def run_perf(self):
+        """Record `perf` data for process `--pid` while the `--workload` script runs.
+
+        Data is written to `--output` (`tiflash.perf.data` when not given) and that path is stored
+        in `self.linux_perf_data`. Raises AssertionError when `--pid` or `--workload` is missing or
+        the workload exits non-zero; raises OSError when the output file cannot be opened afterwards.
+        """
+        assert self.args.pid
+        assert self.args.workload
+
+        pid = self.args.pid
+        output = 'tiflash.perf.data' if self.args.output is None else self.args.output
+        logger.info('using output file `{}`'.format(output))
+
+        def workload():
+            # Example of a workload script body:
+            # git clone git@github.com:pingcap/go-tpc.git
+            # cd go-tpc
+            # make build
+            # bin/go-tpc tpch run --queries q1 --host {} -P {} --db {} --count 1
+            logger.info('start to run workload `{}`'.format(
+                self.args.workload))
+            stdout, stderr, err = run_cmd([self.args.workload])
+            logger.info('finish workload `{}`. stdout `{}`, stderr `{}`'.format(
+                self.args.workload, stdout.decode('utf-8'), stderr.decode('utf-8')))
+            assert err == 0
+        perf_cmd = ["perf", "record", "-p", "{}".format(
+            pid), "-e", "cycles:up", "-j", "any,u", "-a", "-o", "{}".format(output)]
+        logger.info("start perf with cmd `{}`".format(' '.join(perf_cmd)))
+        perf_proc = subprocess.Popen(
+            perf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        try:
+            workload()
+        finally:
+            # Always stop perf, even when the workload fails, so the recorder is not left running.
+            perf_proc.send_signal(signal.SIGTERM)
+            # `communicate` drains both pipes and waits for perf to exit.
+            stdout, stderr = perf_proc.communicate()
+            logger.info(
+                "stop perf. stdout `{}`, stderr `{}`".format(stdout.decode('utf-8'), stderr.decode('utf-8')))
+        # check file exists: opening raises if perf did not produce the output file
+        with open(output, 'r'):
+            pass
+        self.linux_perf_data = output
+
+
+def main():
+    """Script entry point: build a `Runner` from the command line and run the requested stages."""
+    runner = Runner()
+    runner.run()
+
+
+if __name__ == '__main__':
+    main()