Commit
Merge pull request #98 from bytedance/gyj/add_version
Add Version and Execution Date info to reports
YJessicaGao authored Aug 26, 2024
2 parents b5ac619 + babd3aa commit 7c49815
Showing 6 changed files with 57 additions and 22 deletions.
3 changes: 3 additions & 0 deletions VERSION
@@ -0,0 +1,3 @@
major=1
minor=0
patch=0
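
A minimal sketch (not part of the commit) of how these three key=value lines become a dotted version string; it mirrors the get_version helpers added in the diffs below:

import pathlib

# Sketch only: same parsing as the new get_version helpers.
lines = pathlib.Path("VERSION").read_text().splitlines()  # ["major=1", "minor=0", "patch=0"]
version = ".".join(line.split("=")[1] for line in lines)
print(version)  # -> 1.0.0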
17 changes: 17 additions & 0 deletions byte_infer_perf/general_perf/core/perf_engine.py
@@ -18,6 +18,7 @@
import json
import subprocess
import time
import traceback

from typing import Any, Dict, Tuple
import virtualenv
@@ -70,6 +71,19 @@ def __init__(self) -> None:
self.prev_sys_path = list(sys.path)
self.real_prefix = sys.prefix
self.compile_only_mode = False
self.version = self.get_version()

def get_version(self):
version = ""
try:
version_file = os.path.join(str(BYTE_MLPERF_ROOT), "../VERSION")
with open(version_file) as f:
_version = f.read().splitlines()
version = '.'.join(v.split('=')[1] for v in _version)
except Exception as e:
traceback.print_exc()
log.warning(f"get bytemlperf version failed, error msg: {e}")
return version

def start_engine(self) -> None:
'''
@@ -168,6 +182,9 @@ def single_workload_perf(
base_report.pop("Backend")
return compile_info["compile_status"], base_report

base_report["Version"] = self.version
base_report["Execution Date"] = time.strftime("%Y-%m-%d %H:%M:%S")

# load runtime backend
"""
Start Here
17 changes: 17 additions & 0 deletions byte_infer_perf/llm_perf/launch.py
@@ -21,6 +21,7 @@
import multiprocessing as mp
import signal
from typing import Any, Dict, Iterable, List
import traceback

# ${prj_root}/
BYTE_MLPERF_ROOT = pathlib.Path(__file__).parents[1]
@@ -45,12 +46,26 @@ def __init__(self, hardware, task, host, port) -> None:
self.result_queue = mp.Queue()
self.jobs: List[mp.Process] = []
self.server_process = None
self.version = self.get_version()


def __del__(self):
self.stop_server()


def get_version(self):
version = ""
try:
version_file = os.path.join(str(BYTE_MLPERF_ROOT), "../VERSION")
with open(version_file) as f:
_version = f.read().splitlines()
version = '.'.join(v.split('=')[1] for v in _version)
except Exception as e:
traceback.print_exc()
logger.warning(f"get bytemlperf version failed, error msg: {e}")
return version


def start_engine(self) -> None:
# load workload
workload = load_workload(self.task)
@@ -85,6 +100,8 @@ def start_engine(self) -> None:

test_perf=test_perf,
test_accuracy=test_accuracy,

version=self.version,
)
self.reporter.start()

4 changes: 4 additions & 0 deletions byte_infer_perf/llm_perf/utils/reporter.py
@@ -68,6 +68,7 @@ def __init__(
max_new_tokens: int,
test_perf: bool,
test_accuracy: bool,
version: str="",
) -> None:
self._running: bool = False
self.cond: threading.Condition = threading.Condition()
@@ -87,12 +88,15 @@ def __init__(
self.tp_size = tp_size
self.batch_size = batch_size
self.input_tokens = input_tokens
self.version = version

# result template
self.result: Dict[str, Any] = {
"Model": self.task,
"Backend": self.backend,
"Host Info": get_cpu_name(),
"Version": self.version,
"Execution Date": time.strftime("%Y-%m-%d %H:%M:%S"),
"Min New Tokens": min_new_tokens,
"Max New Tokens": max_new_tokens,
"Accuracy": {"PPL": [], "Token Diff": {}, "Logits Diff": {}},
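
Note that version is declared with a default (version: str = ""), so Reporter call sites that predate this commit keep working unchanged; only launch.py passes the new keyword. A stripped-down stand-in (not the real Reporter, task name illustrative) showing the pattern:

class MiniReporter:
    # Hypothetical stand-in for the keyword-with-default pattern.
    def __init__(self, task: str, version: str = "") -> None:
        self.task = task
        self.version = version   # "" when the caller does not opt in

MiniReporter("chatglm2-torch-fp16-6b")                   # old call site: version == ""
MiniReporter("chatglm2-torch-fp16-6b", version="1.0.0")  # new call site, as in launch.py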
1 change: 0 additions & 1 deletion byte_micro_perf/backends/GPU/backend_gpu.py
@@ -35,7 +35,6 @@


class BackendGPU(Backend):

def get_device_count(self):
return torch.cuda.device_count()

37 changes: 16 additions & 21 deletions byte_micro_perf/core/perf_engine.py
@@ -212,16 +212,26 @@ def __init__(self) -> None:
self.old_os_path = os.environ["PATH"]
self.prev_sys_path = list(sys.path)
self.real_prefix = sys.prefix
self.version = self.get_version()

def get_version(self):
version = ""
try:
version_file = os.path.join(str(BYTE_MLPERF_ROOT), "../VERSION")
with open(version_file) as f:
_version = f.read().splitlines()
version = '.'.join(v.split('=')[1] for v in _version)
except Exception as e:
traceback.print_exc()
log.warning(f"get bytemlperf version failed, error msg: {e}")
return version

def get_cpu_name(self):
command = "lscpu | grep 'Model name' | awk -F: '{print $2}'"
cpu_name = subprocess.check_output(command, shell=True)
return cpu_name.decode().strip()
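
get_cpu_name above shells out to an lscpu | grep | awk pipeline; a rough shell-free equivalent is sketched here (an assumption-laden alternative, not code from the commit, requiring a Linux host with lscpu on PATH):

import subprocess

def get_cpu_name_no_shell() -> str:
    # Run lscpu directly and parse the "Model name" line in Python.
    out = subprocess.check_output(["lscpu"]).decode()
    for line in out.splitlines():
        if line.startswith("Model name"):
            return line.split(":", 1)[1].strip()
    return ""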



def start_engine(self) -> None:

if self.args.activate_venv:
self.activate_venv(self.backend_type)

@@ -270,7 +280,6 @@ def start_engine(self) -> None:
for shape in shape_list:
test_list.append(ConfigInstance(dtype, shape, case_index))
case_index = case_index + 1


try:
mp.set_start_method("spawn", force=True)
@@ -287,10 +296,6 @@
if self.workload["operator"] in ["device2host", "host2device"]:
instance_num = 1





input_queues = mp.Queue()
output_queues = mp.Queue(maxsize=1)

@@ -308,16 +313,13 @@
assert "ready" == output_queues.get()
log.info("all ranks are ready and listening, init done")



if group == 1:
for test_instance in test_list:
input_queues.put(test_instance, True)

for _ in range(instance_num):
input_queues.put("end", True)


for process in _subprocesses.processes:
process.join()

@@ -330,9 +332,6 @@
if self.args.activate_venv:
self.deactivate_venv()




def perf_func(self, rank: int, *args):
backend_instance = self.backend_class(self.workload, self.args.vendor_path)
op_name = self.workload["operator"]
@@ -342,7 +341,6 @@ def perf_func(self, rank: int, *args):
# set device according to local_rank
set_device_func = getattr(backend_instance, "set_device")
set_device_func(rank)


if world_size > 1:
init_ccl_func = getattr(backend_instance, "initialize_ccl")
@@ -354,7 +352,6 @@
else:
raise ValueError(f"Unknown operation: {op_name.lower()}")


output_queues.put("ready")

result_list = []
@@ -396,7 +393,6 @@

result_list = sorted(output_result_list, key=lambda x: x.config.index)


elif group_size > 1:
for i, test_instance in enumerate(test_list):
if rank == 0:
@@ -421,7 +417,6 @@

result_list.append(ResultItem(test_instance, reports))


if rank == 0:
print(f"{len(result_list)} tasks finished.")

@@ -439,9 +434,11 @@
"Backend": self.backend_type,
"Host Info": self.get_cpu_name(),
"Device Info": getattr(self.backend, "get_device_name")(),
"Version": self.version,
"Execution Date": time.strftime("%Y-%m-%d %H:%M:%S"),
"Performance": [result.report for result in dtype_results_mapping[dtype]]
}

filename = (
f"result-{str(dtype)}"
+ (
@@ -460,8 +457,6 @@
destroy_group_func()

return True



def activate_venv(self, hardware_type: str) -> bool:
if os.path.exists("backends/" + hardware_type + "/requirements.txt"):
