fix: Improve log warnings in REST API /health endpoint (#5381)
* Improve warning in the REST API's get_health_status method

* Convert log message

* A better solution and documentation

* Add another nested try/except block

* Simplify
vblagoje authored Jul 25, 2023
1 parent 5bb0a1f commit 22897c1
Showing 1 changed file with 55 additions and 36 deletions.
91 changes: 55 additions & 36 deletions rest_api/rest_api/controller/health.py
@@ -64,47 +64,66 @@ class HealthResponse(BaseModel):
     gpus: List[GPUInfo] = Field(default_factory=list, description="GPU usage details")
 
 
-@router.get("/health", response_model=HealthResponse, status_code=200)
-def get_health_status():
-    """
-    This endpoint allows external systems to monitor the health of the Haystack REST API.
-    """
+def get_cpu_usage() -> CPUUsage:
+    cpu_count = os.cpu_count() or 1
+    p = psutil.Process()
+    p_cpu_usage = p.cpu_percent() / cpu_count
+    return CPUUsage(used=p_cpu_usage)
 
-    gpus: List[GPUInfo] = []
 
+def get_memory_usage() -> MemoryUsage:
+    p = psutil.Process()
+    p_memory_usage = p.memory_percent()
+    return MemoryUsage(used=p_memory_usage)
+
+
+def get_gpu_usage() -> List[GPUInfo]:
+    gpus: List[GPUInfo] = []
     try:
         pynvml.nvmlInit()
-        gpu_count = pynvml.nvmlDeviceGetCount()
-        for i in range(gpu_count):
-            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
-            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
-            gpu_mem_total = float(info.total) / 1024 / 1024
-            gpu_mem_used = None
-            for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
-                if proc.pid == os.getpid():
-                    gpu_mem_used = float(proc.usedGpuMemory) / 1024 / 1024
-                    break
-            gpu_info = GPUInfo(
-                index=i,
-                usage=GPUUsage(
-                    memory_total=round(gpu_mem_total),
-                    kernel_usage=pynvml.nvmlDeviceGetUtilizationRates(handle).gpu,
-                    memory_used=round(gpu_mem_used) if gpu_mem_used is not None else None,
-                ),
-            )
-
-            gpus.append(gpu_info)
+        try:
+            gpu_count = pynvml.nvmlDeviceGetCount()
+            for i in range(gpu_count):
+                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+                info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+                gpu_mem_total = float(info.total) / 1024 / 1024
+                gpu_mem_used = None
+                try:
+                    for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
+                        if proc.pid == os.getpid():
+                            gpu_mem_used = float(proc.usedGpuMemory) / 1024 / 1024
+                            break
+                except pynvml.NVMLError:
+                    # ignore if nvmlDeviceGetComputeRunningProcesses is not supported
+                    # this can happen for outdated drivers
+                    pass
+                gpu_info = GPUInfo(
+                    index=i,
+                    usage=GPUUsage(
+                        memory_total=round(gpu_mem_total),
+                        kernel_usage=pynvml.nvmlDeviceGetUtilizationRates(handle).gpu,
+                        memory_used=round(gpu_mem_used) if gpu_mem_used is not None else None,
+                    ),
+                )
+                gpus.append(gpu_info)
+        except pynvml.NVMLError as e:
+            logger.warning("Couldn't collect GPU stats: %s", str(e))
+        finally:
+            pynvml.nvmlShutdown()
     except pynvml.NVMLError:
-        logger.warning("No NVIDIA GPU found.")
+        # Here we intentionally ignore errors that occur when NVML (NVIDIA Management Library) is not available
+        # or found. See the original code's comment for more details.
+        pass
 
-    p_cpu_usage = 0
-    p_memory_usage = 0
-    cpu_count = os.cpu_count() or 1
-    p = psutil.Process()
-    p_cpu_usage = p.cpu_percent() / cpu_count
-    p_memory_usage = p.memory_percent()
+    return gpus
 
-    cpu_usage = CPUUsage(used=p_cpu_usage)
-    memory_usage = MemoryUsage(used=p_memory_usage)
 
-    return HealthResponse(version=haystack.__version__, cpu=cpu_usage, memory=memory_usage, gpus=gpus)
+@router.get("/health", response_model=HealthResponse, status_code=200)
+def get_health_status():
+    """
+    This endpoint allows external systems to monitor the health of the Haystack REST API.
+    """
+
+    return HealthResponse(
+        version=haystack.__version__, cpu=get_cpu_usage(), memory=get_memory_usage(), gpus=get_gpu_usage()
+    )
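
For quick local checks, the refactored helpers can also be exercised directly outside of FastAPI. A minimal sketch, assuming the module is importable as rest_api.controller.health (the import path is an assumption based on the file's location) and that psutil and, optionally, pynvml are installed:

    # Sketch only: calls the refactored helpers directly.
    # The import path below is an assumption for illustration.
    from rest_api.controller.health import get_cpu_usage, get_memory_usage, get_gpu_usage

    if __name__ == "__main__":
        print("CPU used (%):", get_cpu_usage().used)
        print("Memory used (%):", get_memory_usage().used)
        # get_gpu_usage() returns an empty list instead of raising when NVML or
        # an NVIDIA GPU is unavailable, which is the behaviour this commit hardens.
        for gpu in get_gpu_usage():
            print(f"GPU {gpu.index}: kernel {gpu.usage.kernel_usage}%, "
                  f"memory {gpu.usage.memory_used}/{gpu.usage.memory_total} MiB")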

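And a minimal sketch of how an external system might monitor the endpoint over HTTP, assuming the REST API is reachable at http://localhost:8000 (adjust the base URL to your deployment):

    # Sketch only: polls GET /health and prints the reported stats.
    # The base URL is an assumption; point it at your running Haystack REST API.
    import requests

    resp = requests.get("http://localhost:8000/health", timeout=5)
    resp.raise_for_status()
    health = resp.json()

    print("Haystack version:", health["version"])
    print("CPU used (%):", health["cpu"]["used"])
    print("Memory used (%):", health["memory"]["used"])
    for gpu in health["gpus"]:
        print("GPU", gpu["index"], "usage:", gpu["usage"])
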