fix: Improve log warnings in REST API /health endpoint (#5381)
* Improve warning in the REST API's get_health_status method

* Convert log message

* A better solution and documentation

* Add another nested try/except block

* Simplify
vblagoje authored Jul 25, 2023
1 parent 5bb0a1f commit 22897c1
Showing 1 changed file with 55 additions and 36 deletions.
91 changes: 55 additions & 36 deletions rest_api/rest_api/controller/health.py
@@ -64,47 +64,66 @@ class HealthResponse(BaseModel):
     gpus: List[GPUInfo] = Field(default_factory=list, description="GPU usage details")
 
 
-@router.get("/health", response_model=HealthResponse, status_code=200)
-def get_health_status():
-    """
-    This endpoint allows external systems to monitor the health of the Haystack REST API.
-    """
+def get_cpu_usage() -> CPUUsage:
+    cpu_count = os.cpu_count() or 1
+    p = psutil.Process()
+    p_cpu_usage = p.cpu_percent() / cpu_count
+    return CPUUsage(used=p_cpu_usage)
 
-    gpus: List[GPUInfo] = []
 
+def get_memory_usage() -> MemoryUsage:
+    p = psutil.Process()
+    p_memory_usage = p.memory_percent()
+    return MemoryUsage(used=p_memory_usage)
+
+
+def get_gpu_usage() -> List[GPUInfo]:
+    gpus: List[GPUInfo] = []
     try:
         pynvml.nvmlInit()
-        gpu_count = pynvml.nvmlDeviceGetCount()
-        for i in range(gpu_count):
-            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
-            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
-            gpu_mem_total = float(info.total) / 1024 / 1024
-            gpu_mem_used = None
-            for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
-                if proc.pid == os.getpid():
-                    gpu_mem_used = float(proc.usedGpuMemory) / 1024 / 1024
-                    break
-            gpu_info = GPUInfo(
-                index=i,
-                usage=GPUUsage(
-                    memory_total=round(gpu_mem_total),
-                    kernel_usage=pynvml.nvmlDeviceGetUtilizationRates(handle).gpu,
-                    memory_used=round(gpu_mem_used) if gpu_mem_used is not None else None,
-                ),
-            )
-
-            gpus.append(gpu_info)
+        try:
+            gpu_count = pynvml.nvmlDeviceGetCount()
+            for i in range(gpu_count):
+                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+                info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+                gpu_mem_total = float(info.total) / 1024 / 1024
+                gpu_mem_used = None
+                try:
+                    for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
+                        if proc.pid == os.getpid():
+                            gpu_mem_used = float(proc.usedGpuMemory) / 1024 / 1024
+                            break
+                except pynvml.NVMLError:
+                    # ignore if nvmlDeviceGetComputeRunningProcesses is not supported
+                    # this can happen for outdated drivers
+                    pass
+                gpu_info = GPUInfo(
+                    index=i,
+                    usage=GPUUsage(
+                        memory_total=round(gpu_mem_total),
+                        kernel_usage=pynvml.nvmlDeviceGetUtilizationRates(handle).gpu,
+                        memory_used=round(gpu_mem_used) if gpu_mem_used is not None else None,
+                    ),
+                )
+                gpus.append(gpu_info)
+        except pynvml.NVMLError as e:
+            logger.warning("Couldn't collect GPU stats: %s", str(e))
+        finally:
+            pynvml.nvmlShutdown()
     except pynvml.NVMLError:
-        logger.warning("No NVIDIA GPU found.")
+        # Here we intentionally ignore errors that occur when NVML (NVIDIA Management Library) is not available
+        # or found. See the original code's comment for more details.
+        pass
 
-    p_cpu_usage = 0
-    p_memory_usage = 0
-    cpu_count = os.cpu_count() or 1
-    p = psutil.Process()
-    p_cpu_usage = p.cpu_percent() / cpu_count
-    p_memory_usage = p.memory_percent()
+    return gpus
 
-    cpu_usage = CPUUsage(used=p_cpu_usage)
-    memory_usage = MemoryUsage(used=p_memory_usage)
 
-    return HealthResponse(version=haystack.__version__, cpu=cpu_usage, memory=memory_usage, gpus=gpus)
+@router.get("/health", response_model=HealthResponse, status_code=200)
+def get_health_status():
+    """
+    This endpoint allows external systems to monitor the health of the Haystack REST API.
+    """
+
+    return HealthResponse(
+        version=haystack.__version__, cpu=get_cpu_usage(), memory=get_memory_usage(), gpus=get_gpu_usage()
+    )
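
For quick local checks, the refactored helpers can also be exercised directly outside of FastAPI. A minimal sketch, assuming the module is importable as rest_api.controller.health (the import path is an assumption based on the file's location) and that psutil and, optionally, pynvml are installed:

    # Sketch only: calls the refactored helpers directly.
    # The import path below is an assumption for illustration.
    from rest_api.controller.health import get_cpu_usage, get_memory_usage, get_gpu_usage

    if __name__ == "__main__":
        print("CPU used (%):", get_cpu_usage().used)
        print("Memory used (%):", get_memory_usage().used)
        # get_gpu_usage() returns an empty list instead of raising when NVML or
        # an NVIDIA GPU is unavailable, which is the behaviour this commit hardens.
        for gpu in get_gpu_usage():
            print(f"GPU {gpu.index}: kernel {gpu.usage.kernel_usage}%, "
                  f"memory {gpu.usage.memory_used}/{gpu.usage.memory_total} MiB")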

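And a minimal sketch of how an external system might monitor the endpoint over HTTP, assuming the REST API is reachable at http://localhost:8000 (adjust the base URL to your deployment):

    # Sketch only: polls GET /health and prints the reported stats.
    # The base URL is an assumption; point it at your running Haystack REST API.
    import requests

    resp = requests.get("http://localhost:8000/health", timeout=5)
    resp.raise_for_status()
    health = resp.json()

    print("Haystack version:", health["version"])
    print("CPU used (%):", health["cpu"]["used"])
    print("Memory used (%):", health["memory"]["used"])
    for gpu in health["gpus"]:
        print("GPU", gpu["index"], "usage:", gpu["usage"])
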