Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add GPU stats to the /stats API and debug screen #3931

Merged
merged 71 commits into from
Nov 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
71 commits
Select commit Hold shift + click to select a range
aadc8a8
Add ffprobe endpoint
NickM-27 Nov 9, 2022
18b0840
Get ffprobe for multiple inputs
NickM-27 Nov 9, 2022
97a56b8
Copy ffprobe in output
NickM-27 Nov 9, 2022
ff45b6d
Fix bad if statement
NickM-27 Nov 10, 2022
f294ec1
Return full output of ffprobe process
NickM-27 Nov 10, 2022
1337c78
Return full output of ffprobe process
NickM-27 Nov 10, 2022
2bf706c
Make ffprobe button show dialog with output and option to copy
NickM-27 Nov 10, 2022
1dd80f4
Add driver names to consts
NickM-27 Sep 24, 2022
fa765da
Add driver env var name
NickM-27 Sep 24, 2022
96b9704
Setup general tracking for GPU stats
NickM-27 Sep 24, 2022
3180dfd
Catch RPi args as well
NickM-27 Sep 24, 2022
497ebf6
Add util to get radeontop results
NickM-27 Sep 24, 2022
38317b9
Add real amd GPU stats
NickM-27 Sep 24, 2022
17c2522
Fix missed arg
NickM-27 Sep 25, 2022
41ac558
pass config
NickM-27 Sep 25, 2022
615135d
Use only the values
NickM-27 Sep 25, 2022
c97975e
Fix vram
NickM-27 Sep 25, 2022
4ffdfe6
Add nvidia gpu stats
NickM-27 Sep 25, 2022
cc15014
Use nvidia stats
NickM-27 Sep 25, 2022
1b35af4
Add chart for gpu stats
NickM-27 Sep 25, 2022
1d7265f
Format AMD with space between percent
NickM-27 Sep 25, 2022
987bc05
Get correct nvidia %
NickM-27 Sep 25, 2022
2e1958a
Start to add support for intel GPU stats
NickM-27 Sep 25, 2022
beb8229
Block out RPi as util is not currently available
NickM-27 Sep 25, 2022
447df28
Formatting
NickM-27 Sep 25, 2022
250440c
Fix mypy
NickM-27 Sep 25, 2022
c7d461c
Strip for float conversion
NickM-27 Sep 25, 2022
f1a8a79
Strip for float conversion
NickM-27 Sep 25, 2022
fa69851
Fix percent formatting
NickM-27 Sep 25, 2022
b0a2822
Remove name from gpu map
NickM-27 Sep 25, 2022
9b5516c
Add tests and fix AMD formatting
NickM-27 Sep 25, 2022
eb25935
Add nvidia gpu stats test
NickM-27 Sep 25, 2022
61cac3c
Formatting
NickM-27 Sep 25, 2022
86074e6
Add intel_gpu_top for testing
NickM-27 Sep 25, 2022
baeb3dd
Formatting
NickM-27 Sep 25, 2022
3be336a
Handle case where hwaccel is not setup
NickM-27 Sep 25, 2022
bf06f4d
Formatting
NickM-27 Sep 25, 2022
e200718
Check to remove none
NickM-27 Sep 25, 2022
412144b
Don't use set
NickM-27 Sep 26, 2022
be21a21
Cleanup and fix types
NickM-27 Sep 26, 2022
2c38913
Handle case where args is list
NickM-27 Sep 27, 2022
1062e26
Fix mypy
NickM-27 Sep 27, 2022
c28471a
Cast to str
NickM-27 Sep 27, 2022
d6e37a3
Fix type checking
NickM-27 Sep 27, 2022
ca2733a
Return none instead of empty
NickM-27 Oct 2, 2022
bc205ad
Fix organization
NickM-27 Nov 9, 2022
cc7bae6
Make keys consistent
NickM-27 Nov 10, 2022
472888c
Make gpu match style
NickM-27 Nov 13, 2022
9f1d2b9
Get support for vainfo
NickM-27 Nov 13, 2022
82e9b34
Add vainfo endpoint
NickM-27 Nov 13, 2022
51fe3dd
Set vainfo output in error correctly
NickM-27 Nov 13, 2022
67fd039
Remove duplicate function
NickM-27 Nov 13, 2022
3b3382d
Fix errors
NickM-27 Nov 13, 2022
b93eb68
Do cpu & gpu work asynchronously
NickM-27 Nov 13, 2022
39a5e25
Fix async
NickM-27 Nov 13, 2022
65e8b32
Fix event loop
NickM-27 Nov 13, 2022
1cedf05
Fix crash
NickM-27 Nov 13, 2022
35288ec
Fix naming
NickM-27 Nov 13, 2022
f172e3d
Send empty data for gpu if error occurs
NickM-27 Nov 15, 2022
35947e2
Show error if gpu stats could not be retrieved
NickM-27 Nov 15, 2022
ea63fcd
Fix mypy
NickM-27 Nov 15, 2022
8e132eb
Fix test
NickM-27 Nov 15, 2022
3cc4ce7
Don't use json for vainfo
NickM-27 Nov 19, 2022
5158e5e
Fix cross references
NickM-27 Nov 19, 2022
e4c588c
Strip unicode still
NickM-27 Nov 19, 2022
b285499
await vainfo response
NickM-27 Nov 19, 2022
46bd278
Add gpu deps
NickM-27 Nov 22, 2022
e86c23f
Formatting
NickM-27 Nov 22, 2022
196b558
remove comments
NickM-27 Nov 22, 2022
c6c9e13
Use empty string
NickM-27 Nov 22, 2022
98adac4
Add vainfo back in
NickM-27 Nov 22, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docker/install_deps.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ apt-get -qq install --no-install-recommends -y \
apt-transport-https \
gnupg \
wget \
procps \
procps vainfo \
unzip locales tzdata libxml2 xz-utils \
python3-pip

Expand Down Expand Up @@ -53,7 +53,7 @@ if [[ "${TARGETARCH}" == "amd64" ]]; then
echo 'deb http://deb.debian.org/debian testing main non-free' >/etc/apt/sources.list.d/debian-testing.list
apt-get -qq update
apt-get -qq install --no-install-recommends --no-install-suggests -y \
mesa-va-drivers libva-drm2 intel-media-va-driver-non-free i965-va-driver libmfx1
mesa-va-drivers libva-drm2 intel-media-va-driver-non-free i965-va-driver libmfx1 radeontop intel-gpu-tools
rm -f /etc/apt/sources.list.d/debian-testing.list
fi

Expand Down
7 changes: 7 additions & 0 deletions frigate/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,10 @@
# Regular-expression patterns. They are written as raw strings so the
# backslashes in ``\/`` and ``\S`` reach the regex engine literally instead of
# relying on Python passing unknown escape sequences through (which triggers an
# invalid-escape warning). The resulting string values are unchanged.
REGEX_CAMERA_NAME = r"^[a-zA-Z0-9_-]+$"
REGEX_RTSP_CAMERA_USER_PASS = r":\/\/[a-zA-Z0-9_-]+:[\S]+@"
REGEX_HTTP_CAMERA_USER_PASS = r"user=[a-zA-Z0-9_-]+&password=[\S]+"

# Known Driver Names

# Environment variable read to find out which VAAPI driver is in use.
DRIVER_ENV_VAR = "LIBVA_DRIVER_NAME"
DRIVER_AMD = "radeonsi"
DRIVER_INTEL_i965 = "i965"
DRIVER_INTEL_iHD = "iHD"
21 changes: 19 additions & 2 deletions frigate/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@
from peewee import SqliteDatabase, operator, fn, DoesNotExist
from playhouse.shortcuts import model_to_dict

from frigate.config import CameraConfig
from frigate.const import CLIPS_DIR
from frigate.models import Event, Recordings
from frigate.object_processing import TrackedObject
from frigate.stats import stats_snapshot
from frigate.util import clean_camera_user_pass, ffprobe_stream
from frigate.util import clean_camera_user_pass, ffprobe_stream, vainfo_hwaccel
from frigate.version import VERSION

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -608,7 +609,7 @@ def version():

@bp.route("/stats")
def stats():
stats = stats_snapshot(current_app.stats_tracking)
stats = stats_snapshot(current_app.frigate_config, current_app.stats_tracking)
return jsonify(stats)


Expand Down Expand Up @@ -996,3 +997,19 @@ def ffprobe():
)

return jsonify(output)


@bp.route("/vainfo", methods=["GET"])
def vainfo():
    """Run ``vainfo`` and return its exit code and captured output as JSON.

    The response carries ``return_code``, ``stderr`` and ``stdout``; a stream
    that decodes to an empty string is reported as ``""``.
    """
    result = vainfo_hwaccel()

    # Only escape-decode and strip a stream that actually has content.
    stderr_text = ""
    if result.stderr.decode():
        stderr_text = result.stderr.decode("unicode_escape").strip()

    stdout_text = ""
    if result.stdout.decode():
        stdout_text = result.stdout.decode("unicode_escape").strip()

    return jsonify(
        {
            "return_code": result.returncode,
            "stderr": stderr_text,
            "stdout": stdout_text,
        }
    )
99 changes: 95 additions & 4 deletions frigate/stats.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import asyncio
import json
import logging
import threading
Expand All @@ -11,8 +12,9 @@

from frigate.comms.dispatcher import Dispatcher
from frigate.config import FrigateConfig
from frigate.const import RECORD_DIR, CLIPS_DIR, CACHE_DIR
from frigate.const import DRIVER_AMD, DRIVER_ENV_VAR, RECORD_DIR, CLIPS_DIR, CACHE_DIR
from frigate.types import StatsTrackingTypes, CameraMetricsTypes
from frigate.util import get_amd_gpu_stats, get_intel_gpu_stats, get_nvidia_gpu_stats
from frigate.version import VERSION
from frigate.util import get_cpu_stats
from frigate.object_detection import ObjectDetectProcess
Expand Down Expand Up @@ -82,7 +84,96 @@ def get_temperatures() -> dict[str, float]:
return temps


def stats_snapshot(stats_tracking: StatsTrackingTypes) -> dict[str, Any]:
def get_processing_stats(config: FrigateConfig, stats: dict[str, str]) -> None:
"""Get stats for cpu / gpu."""

async def run_tasks() -> None:
await asyncio.wait(
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the only reason to introduce asyncio stuff here so we can wait for these to run in parallel?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, with the cpu and 2 gpus my production server takes ~ 7-10 seconds and with this it's more like 3-5 seconds. There definitely may be better ways to parallelize it or run the actual checks in the background and cache / average the returned results so when /stats is called it can return without waiting.

[
asyncio.create_task(set_gpu_stats(config, stats)),
asyncio.create_task(set_cpu_stats(stats)),
]
)

loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(run_tasks())
loop.close()


async def set_cpu_stats(all_stats: dict[str, Any]) -> None:
    """Store CPU usages in ``all_stats`` when ``get_cpu_stats`` returns any.

    ``all_stats`` is updated in place; the ``cpu_usages`` key is left untouched
    when the helper returns nothing.
    """
    usages = get_cpu_stats()

    if not usages:
        return

    all_stats["cpu_usages"] = usages


async def set_gpu_stats(config: FrigateConfig, all_stats: dict[str, Any]) -> None:
    """Collect usage stats for every GPU backend referenced by camera hwaccel args.

    Each camera's ``ffmpeg.hwaccel_args`` (a string or a list of strings) is
    searched for a known marker (``cuvid``, ``qsv``, ``vaapi``, ``v4l2m2m``)
    and the matching stats helper is queried. A backend is queried at most
    once per call, even when several cameras reach it through differently
    worded argument strings, because every query spawns an external process.

    Args:
        config: Frigate configuration; only ``config.cameras`` is read.
        all_stats: Stats dict updated in place. ``gpu_usages`` is set only
            when at least one GPU backend was detected.
    """
    # Distinct hwaccel argument strings, in camera order.
    hwaccel_args: list[str] = []

    for camera in config.cameras.values():
        args = camera.ffmpeg.hwaccel_args

        if isinstance(args, list):
            args = " ".join(args)

        if args and args not in hwaccel_args:
            hwaccel_args.append(args)

    # Placeholder reported when a backend cannot provide usage numbers.
    placeholder = {"gpu": -1, "mem": -1}

    stats: dict[str, dict] = {}
    queried: set[str] = set()

    for args in hwaccel_args:
        if "cuvid" in args:
            backend = "nvidia-gpu"
        elif "qsv" in args:
            backend = "intel-qsv"
        elif "vaapi" in args:
            # VAAPI is shared by AMD and Intel; the driver env var tells them apart.
            if os.environ.get(DRIVER_ENV_VAR) == DRIVER_AMD:
                backend = "amd-vaapi"
            else:
                backend = "intel-vaapi"
        elif "v4l2m2m" in args:
            backend = "rpi-v4l2m2m"
        else:
            continue

        # Skip backends already handled for an earlier argument string.
        if backend in queried:
            continue

        queried.add(backend)

        if backend == "nvidia-gpu":
            nvidia_usage = get_nvidia_gpu_stats()

            if nvidia_usage:
                # Key the entry by the reported GPU name; work on a copy so
                # the helper's dict is not modified.
                usage = dict(nvidia_usage)
                name = usage.pop("name")
                stats[name] = usage
            else:
                stats[backend] = dict(placeholder)
        elif backend == "amd-vaapi":
            stats[backend] = get_amd_gpu_stats() or dict(placeholder)
        elif backend == "rpi-v4l2m2m":
            # RPi v4l2m2m is currently not able to get usage stats
            stats[backend] = dict(placeholder)
        else:
            # intel-qsv and intel-vaapi both read from intel_gpu_top
            stats[backend] = get_intel_gpu_stats() or dict(placeholder)

    if stats:
        all_stats["gpu_usages"] = stats


def stats_snapshot(
config: FrigateConfig, stats_tracking: StatsTrackingTypes
) -> dict[str, Any]:
"""Get a snapshot of the current stats that are being tracked."""
camera_metrics = stats_tracking["camera_metrics"]
stats: dict[str, Any] = {}

Expand Down Expand Up @@ -119,7 +210,7 @@ def stats_snapshot(stats_tracking: StatsTrackingTypes) -> dict[str, Any]:
}
stats["detection_fps"] = round(total_detection_fps, 2)

stats["cpu_usages"] = get_cpu_stats()
get_processing_stats(config, stats)

stats["service"] = {
"uptime": (int(time.time()) - stats_tracking["started"]),
Expand Down Expand Up @@ -159,6 +250,6 @@ def __init__(
def run(self) -> None:
time.sleep(10)
while not self.stop_event.wait(self.config.mqtt.stats_interval):
stats = stats_snapshot(self.stats_tracking)
stats = stats_snapshot(self.config, self.stats_tracking)
self.dispatcher.publish("stats", json.dumps(stats), retain=False)
logger.info(f"Exiting watchdog...")
45 changes: 45 additions & 0 deletions frigate/test/test_gpu_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import unittest
from unittest.mock import MagicMock, patch

from frigate.util import get_amd_gpu_stats, get_intel_gpu_stats, get_nvidia_gpu_stats


class TestGpuStats(unittest.TestCase):
    """Check the GPU stats helpers against sample tool output."""

    def setUp(self):
        # Sample output handed to the helpers through the patched subprocess.run.
        self.amd_results = "Unknown Radeon card. <= R500 won't work, new cards might.\nDumping to -, line limit 1.\n1664070990.607556: bus 10, gpu 4.17%, ee 0.00%, vgt 0.00%, ta 0.00%, tc 0.00%, sx 0.00%, sh 0.00%, spi 0.83%, smx 0.00%, cr 0.00%, sc 0.00%, pa 0.00%, db 0.00%, cb 0.00%, vram 60.37% 294.04mb, gtt 0.33% 52.21mb, mclk 100.00% 1.800ghz, sclk 26.65% 0.533ghz\n"
        self.intel_results = """{"period":{"duration":1.194033,"unit":"ms"},"frequency":{"requested":0.000000,"actual":0.000000,"unit":"MHz"},"interrupts":{"count":3349.991164,"unit":"irq/s"},"rc6":{"value":47.844741,"unit":"%"},"engines":{"Render/3D/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"Blitter/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"Video/0":{"busy":4.533124,"sema":0.000000,"wait":0.000000,"unit":"%"},"Video/1":{"busy":6.194385,"sema":0.000000,"wait":0.000000,"unit":"%"},"VideoEnhance/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"}}},{"period":{"duration":1.189291,"unit":"ms"},"frequency":{"requested":0.000000,"actual":0.000000,"unit":"MHz"},"interrupts":{"count":0.000000,"unit":"irq/s"},"rc6":{"value":100.000000,"unit":"%"},"engines":{"Render/3D/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"Blitter/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"Video/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"Video/1":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"VideoEnhance/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"}}}"""
        self.nvidia_results = "name, utilization.gpu [%], memory.used [MiB], memory.total [MiB]\nNVIDIA GeForce RTX 3050, 42 %, 5036 MiB, 8192 MiB\n"

    @staticmethod
    def _completed(stdout):
        """Build a stand-in for a successful subprocess result."""
        return MagicMock(returncode=0, stdout=stdout)

    @patch("subprocess.run")
    def test_amd_gpu_stats(self, sp):
        sp.return_value = self._completed(self.amd_results)
        expected = {"gpu": "4.17 %", "mem": "60.37 %"}
        self.assertEqual(get_amd_gpu_stats(), expected)

    @patch("subprocess.run")
    def test_nvidia_gpu_stats(self, sp):
        sp.return_value = self._completed(self.nvidia_results)
        expected = {
            "name": "NVIDIA GeForce RTX 3050",
            "gpu": "42 %",
            "mem": "61.5 %",
        }
        self.assertEqual(get_nvidia_gpu_stats(), expected)

    @patch("subprocess.run")
    def test_intel_gpu_stats(self, sp):
        sp.return_value = self._completed(self.intel_results)
        expected = {
            "gpu": "10.73 %",
            "mem": "- %",
        }
        self.assertEqual(get_intel_gpu_stats(), expected)
105 changes: 105 additions & 0 deletions frigate/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -766,6 +766,105 @@ def get_cpu_stats() -> dict[str, dict]:
return usages


def get_amd_gpu_stats() -> dict[str, str]:
    """Get AMD GPU and VRAM usage using radeontop.

    Runs ``radeontop -d - -l 1`` and extracts the ``gpu`` and ``vram``
    percentages from its comma-separated output.

    Returns:
        A dict with ``gpu`` and/or ``mem`` formatted as ``"<value> %"``
        (whichever fields were present in the output), or ``None`` when
        radeontop cannot be started or exits with a non-zero status. Callers
        must handle ``None`` even though the annotation does not mention it.
    """
    radeontop_command = ["radeontop", "-d", "-", "-l", "1"]

    try:
        p = sp.run(
            radeontop_command,
            encoding="ascii",
            capture_output=True,
        )
    except OSError as e:
        # radeontop may not be installed; report "no stats" instead of raising.
        logger.error(f"Unable to run radeontop: {e}")
        return None

    if p.returncode != 0:
        logger.error(p.stderr)
        return None

    # radeontop field name -> key used in the stats payload
    wanted = {"gpu": "gpu", "vram": "mem"}
    results: dict[str, str] = {}

    for field in p.stdout.split(","):
        parts = field.split()

        # Match the field name exactly so banner text or another field that
        # merely contains "gpu"/"vram" is never mistaken for a reading.
        if len(parts) >= 2 and parts[0] in wanted:
            results[wanted[parts[0]]] = f"{parts[1].replace('%', '')} %"

    return results


def get_intel_gpu_stats() -> dict[str, str]:
    """Get Intel GPU video-engine usage using intel_gpu_top.

    intel_gpu_top is run under ``timeout`` and whatever JSON samples it has
    printed by the time it is stopped are parsed.

    Returns:
        A dict with ``mem`` set to ``"- %"`` and, when some sample shows
        activity on ``Video/0`` or ``Video/1``, ``gpu`` set to the summed busy
        percentage of those two engines formatted as ``"<value> %"``.
        ``None`` is returned when the command cannot be started, fails, or
        prints output that is not valid JSON. Callers must handle ``None``
        even though the annotation does not mention it.
    """
    intel_gpu_top_command = [
        "timeout",
        "0.1s",
        "intel_gpu_top",
        "-J",
        "-o",
        "-",
        "-s",
        "1",
    ]

    try:
        p = sp.run(
            intel_gpu_top_command,
            encoding="ascii",
            capture_output=True,
        )
    except OSError as e:
        logger.error(f"Unable to run intel_gpu_top: {e}")
        return None

    # timeout exits with 124 when it had to stop the command, which is the
    # expected way for this invocation to end; 0 covers the command finishing
    # on its own. Anything else is a real failure.
    if p.returncode not in (0, 124):
        logger.error(p.stderr)
        return None

    try:
        readings = json.loads(f"[{p.stdout}]")
    except json.JSONDecodeError as e:
        # The process can be stopped mid-write, leaving a truncated document.
        logger.error(f"Unable to parse intel_gpu_top output: {e}")
        return None

    results: dict[str, str] = {}

    for reading in readings:
        engines = reading.get("engines", {})
        video_busy = [
            float(engines.get(name, {}).get("busy", 0))
            for name in ("Video/0", "Video/1")
        ]

        # Use the first sample that shows video-engine activity.
        # NOTE(review): when every sample is idle no "gpu" key is reported.
        if any(video_busy):
            results["gpu"] = f"{round(sum(video_busy), 2)} %"
            break

    results["mem"] = "- %"
    return results


def get_nvidia_gpu_stats() -> dict[str, str]:
    """Get Nvidia GPU name, utilization and memory usage using nvidia-smi.

    Only the first GPU row of the CSV output is read.

    Returns:
        A dict with ``name``, ``gpu`` (utilization as printed by nvidia-smi)
        and ``mem`` (used / total memory as ``"<value> %"``), or ``None`` when
        nvidia-smi cannot be started, exits with a non-zero status, or prints
        output that cannot be parsed. Callers must handle ``None`` even though
        the annotation does not mention it.
    """
    nvidia_smi_command = [
        "nvidia-smi",
        "--query-gpu=gpu_name,utilization.gpu,memory.used,memory.total",
        "--format=csv",
    ]

    try:
        p = sp.run(
            nvidia_smi_command,
            encoding="ascii",
            capture_output=True,
        )
    except OSError as e:
        # nvidia-smi may not be installed; report "no stats" instead of raising.
        logger.error(f"Unable to run nvidia-smi: {e}")
        return None

    if p.returncode != 0:
        logger.error(p.stderr)
        return None

    try:
        # Line 0 is the CSV header, line 1 the first GPU.
        usages = p.stdout.split("\n")[1].strip().split(",")
        used_mib = float(usages[2].replace(" MiB", "").strip())
        total_mib = float(usages[3].replace(" MiB", "").strip())
        memory_percent = f"{round(used_mib / total_mib * 100, 1)} %"
    except (IndexError, ValueError, ZeroDivisionError):
        # No GPU row, a non-numeric memory field, or a zero memory total.
        logger.error(f"Unable to parse nvidia-smi output: {p.stdout}")
        return None

    results: dict[str, str] = {
        "name": usages[0],
        "gpu": usages[1].strip(),
        "mem": memory_percent,
    }

    return results


def ffprobe_stream(path: str) -> sp.CompletedProcess:
"""Run ffprobe on stream."""
ffprobe_cmd = [
Expand All @@ -781,6 +880,12 @@ def ffprobe_stream(path: str) -> sp.CompletedProcess:
return sp.run(ffprobe_cmd, capture_output=True)


def vainfo_hwaccel() -> sp.CompletedProcess:
    """Run ``vainfo`` and return the completed process with captured output."""
    return sp.run(["vainfo"], capture_output=True)


class FrameManager(ABC):
@abstractmethod
def create(self, name, size) -> AnyStr:
Expand Down
Loading