Skip to content

Commit

Permalink
[v1.0] Add docker metric support (#113)
Browse files Browse the repository at this point in the history
* stresscli/metrics: Add support to collect service metrics for Docker deployed
workload

Signed-off-by: Cathy Zhang <[email protected]>

* Utilize symbol @ as a delimiter for metric file name

Utilize symbol '@' as a delimiter between the service name and pod name
within the generated metric filename to ensure that a given service name
does not erroneously correspond to an excessive number of unrelated
files.

Signed-off-by: Cathy Zhang <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Cathy Zhang <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
bjzhjing and pre-commit-ci[bot] authored Sep 6, 2024
1 parent 102fcdd commit cff0a36
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 20 deletions.
5 changes: 3 additions & 2 deletions evals/benchmark/benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,9 @@ test_cases:
e2e:
run_test: true
service_name: "chatqna-backend-server-svc" # Replace with your service name
service_list: # Replace with your k8s service names for metrics collection,
# activate if deployment_type is k8s and collect_service_metric is true
service_list: # Replace with your k8s service names if deploying with k8s,
# or with your container names if deploying with Docker, for metrics collection;
# activate if collect_service_metric is true
- "chatqna-tei"
- "chatqna-teirerank"

Expand Down
57 changes: 42 additions & 15 deletions evals/benchmark/stresscli/commands/load_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,29 @@ def locust_runtests(kubeconfig, profile):
click.echo(f"Load test results saved to {base_folder}")


def collect_metrics(collector, namespace, services, output_dir):
collector.start_collecting_data(
namespace=namespace,
services=services,
output_dir=output_dir,
restart_pods_flag=False,
)
def collect_metrics(collector, services, output_dir, namespace=None):
    """Collect metrics from the specified services into an output directory.

    Args:
        collector: The metrics collector object; must expose a
            ``start_collecting_data`` method accepting keyword arguments.
        services (list): A list of services to collect metrics from.
        output_dir (str): The directory where metrics will be saved.
        namespace (str, optional): The namespace for collecting metrics
            (used by k8s collectors). When provided it is forwarded along
            with ``restart_pods_flag=False``; when omitted (e.g. Docker
            collectors) the collector is called without it. Defaults to None.
    """
    # Build the call once instead of duplicating it per branch.
    kwargs = {"services": services, "output_dir": output_dir}
    if namespace:
        # k8s collectors additionally need the namespace; never restart
        # pods as a side effect of metrics collection.
        kwargs["namespace"] = namespace
        kwargs["restart_pods_flag"] = False
    collector.start_collecting_data(**kwargs)


def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, index):
Expand Down Expand Up @@ -154,20 +170,31 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in
print(f"Running test: {' '.join(cmd)}")
namespace = runspec["namespace"]

if service_metric and runspec["deployment-type"] == "k8s":
from .metrics import MetricsCollector
from .metrics_util import export_metric

collector = MetricsCollector()
if service_metric:
services = global_settings.get("service-list") or []
collect_metrics(collector, namespace, services, start_output_folder)
if runspec["deployment-type"] == "k8s":
from .metrics import MetricsCollector

collector = MetricsCollector()
collect_metrics(collector, services, start_output_folder, namespace)
elif runspec["deployment-type"] == "docker":
from .metrics_docker import DockerMetricsCollector

collector = DockerMetricsCollector()
collect_metrics(collector, services, start_output_folder)

runspec["starttest_time"] = datetime.now().isoformat()
result = subprocess.run(cmd, capture_output=True, text=True)
runspec["endtest_time"] = datetime.now().isoformat()

if service_metric and runspec["deployment-type"] == "k8s":
collect_metrics(collector, namespace, services, end_output_folder)
if service_metric:
from .metrics_util import export_metric

services = global_settings.get("service-list") or []
if runspec["deployment-type"] == "k8s":
collect_metrics(collector, services, end_output_folder, namespace)
elif runspec["deployment-type"] == "docker":
collect_metrics(collector, services, end_output_folder)
export_metric(start_output_folder, end_output_folder, metrics_output_folder, metrics_output, services)

with open(json_output, "w") as json_file:
Expand Down
2 changes: 1 addition & 1 deletion evals/benchmark/stresscli/commands/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def start_collecting_data(self, namespace, services, output_dir="/data", restart
pod_port = self.get_pod_port(pod_info)
metrics = self.collect_metrics(pod_ip, pod_port, metrics_path)
if metrics:
pod_output_path = os.path.join(output_dir, f"{service_name}_{pod_name}_{timestamp}.txt")
pod_output_path = os.path.join(output_dir, f"{service_name}@{pod_name}_{timestamp}.txt")
logging.debug(f"Writing metrics to {pod_output_path}")
with open(pod_output_path, "w") as f:
f.write(metrics)
Expand Down
111 changes: 111 additions & 0 deletions evals/benchmark/stresscli/commands/metrics_docker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import json
import logging
import os
import time

import requests

import docker

# Setup logs
log_level = os.getenv("LOG_LEVEL", "ERROR").upper()
logging.basicConfig(level=getattr(logging, log_level))


class DockerMetricsCollector:
    """Collect Prometheus-style ``/metrics`` text from locally running Docker containers."""

    def __init__(self):
        # Docker client configured from the environment (DOCKER_HOST etc.).
        self.docker_client = docker.from_env()

    def get_docker_container(self, container_name):
        """Retrieve Docker container information.

        Args:
            container_name (str): Name (or ID) of the container to look up.

        Returns:
            The container object, or None if no such container exists.
        """
        try:
            container = self.docker_client.containers.get(container_name)
            logging.info(f"Found Docker container {container_name}")
            return container
        except docker.errors.NotFound:
            logging.error(f"Container {container_name} not found.")
            return None

    def get_exposed_port(self, container):
        """Get the port exposed to the external environment by the Docker container.

        Inspects ``NetworkSettings.Ports`` and returns the host address/port of
        the first published binding found; 0.0.0.0 (or an empty HostIp) is
        normalized to "localhost".

        NOTE(review): only the first binding encountered is considered — if a
        container publishes several ports, the one returned is not guaranteed
        to be the metrics port.

        Args:
            container: A container object exposing an ``attrs`` dict.

        Returns:
            tuple: ``(host, port)`` as strings, or ``(None, None)`` if no
            published port could be determined.
        """
        try:
            # Retrieve the port-binding map, e.g. {"80/tcp": [{"HostIp": ..., "HostPort": ...}]}
            ports_json = container.attrs["NetworkSettings"]["Ports"]
            logging.debug(f"Container ports: {ports_json}")

            # Return on the first binding found.
            for container_port, host_infos in ports_json.items():
                for host_info in host_infos:
                    host_ip = host_info["HostIp"]
                    host_port = host_info["HostPort"]

                    # Use localhost if the port is mapped to 0.0.0.0 or empty
                    if host_ip in ["0.0.0.0", ""]:
                        logging.debug(
                            f"Found host port {host_port} for container port {container_port} (mapped to all interfaces)"
                        )
                        return ("localhost", host_port)
                    else:
                        logging.debug(
                            f"Found host port {host_port} for container port {container_port} (mapped to {host_ip})"
                        )
                        return (host_ip, host_port)

            # Reached only when the map is empty or every binding list is empty.
            logging.error("No valid host port found.")
            return (None, None)
        except KeyError as e:
            logging.error(f"Error retrieving ports: {e}")
            return (None, None)

    def collect_metrics(self, container_name, metrics_path="/metrics"):
        """Collect metrics from the Docker container.

        Args:
            container_name (str): Name of the container to scrape.
            metrics_path (str): HTTP path of the metrics endpoint.

        Returns:
            str | None: The raw metrics text, or None on any failure.
        """
        container = self.get_docker_container(container_name)
        if container:
            try:
                host_ip, port = self.get_exposed_port(container)  # Get the exposed port
                if not port:
                    logging.error(f"Cannot determine port for container {container_name}")
                    return None

                # Construct the URL
                service_url = f"http://{host_ip}:{port}{metrics_path}"
                logging.debug(f"Collecting metrics from {service_url}")
                # Bound the request so an unresponsive container cannot hang
                # the whole collection run indefinitely.
                response = requests.get(service_url, timeout=10)
                response.raise_for_status()
                return response.text
            except requests.RequestException as e:
                logging.error(f"Error collecting metrics from {container_name}: {e}")
                return None

    def start_collecting_data(self, services, output_dir="/data"):
        """Start collecting metrics from services.

        Args:
            services (list): Container names to scrape.
            output_dir (str): Directory where one ``<name>@<timestamp>.txt``
                file per container is written ('@' keeps the filename pattern
                consistent with the k8s collector / metrics_util matching).

        Returns:
            dict: ``{"status": "success"}`` (failures are logged, not raised).
        """
        timestamp = int(time.time())
        for container_name in services:
            metrics = self.collect_metrics(container_name)
            if metrics:
                output_path = os.path.join(output_dir, f"{container_name}@{timestamp}.txt")
                logging.debug(f"Writing Docker metrics to {output_path}")
                with open(output_path, "w") as f:
                    f.write(metrics)
            else:
                logging.error(f"No metrics collected for container {container_name}")
        return {"status": "success"}


if __name__ == "__main__":
    # Ad-hoc manual run: take one snapshot of metrics from a default set of
    # ChatQnA service containers and print the collector's status dict.
    target_containers = [
        "llm-tgi-server",
        "retriever-redis-server",
        "embedding-tei-server",
        "tei-embedding-server",
        "tgi-service",
        "tei-reranking-server",
    ]
    docker_collector = DockerMetricsCollector()
    result = docker_collector.start_collecting_data(services=target_containers, output_dir="/path/to/data")
    print(result)
4 changes: 2 additions & 2 deletions evals/benchmark/stresscli/commands/metrics_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ def calculate_diff(start_dir, end_dir, output_dir, services=None):
services = [services]

for service_name in services:
# Create a regex pattern to match files starting with the service_name followed by a non-alphanumeric character
pattern = rf"^{re.escape(service_name)}[^a-zA-Z].*\.txt$"
# Create a regex pattern to match files starting with the service_name followed by symbol @
pattern = rf"^{re.escape(service_name)}@.*\.txt$"

start_service_files = [f for f in start_files if re.match(pattern, f)]
end_service_files = [f for f in end_files if re.match(pattern, f)]
Expand Down
1 change: 1 addition & 0 deletions evals/benchmark/stresscli/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
click
deepdiff
docker
flask
kubernetes
locust
Expand Down

0 comments on commit cff0a36

Please sign in to comment.