Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support more info #668

Merged
merged 2 commits into from
Aug 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 23 additions & 4 deletions swanlab/data/run/system/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __replace_second_colon(input_string, replacement):
if first_colon_index != -1:
second_colon_index = input_string.find(":", first_colon_index + 1)
if second_colon_index != -1:
return input_string[:second_colon_index] + replacement + input_string[second_colon_index + 1:]
return input_string[:second_colon_index] + replacement + input_string[second_colon_index + 1 :]
return input_string


Expand Down Expand Up @@ -90,13 +90,15 @@ def __get_git_branch_and_commit():

def __get_nvidia_gpu_info():
"""获取 GPU 信息"""
info = {"cores": None, "type": [], "memory": []}
info = {"driver": None, "cores": None, "type": [], "memory": []}
try:
pynvml.nvmlInit()
except:
return None

try:
# 获取 NVIDIA 驱动版本信息
info["driver"] = pynvml.nvmlSystemGetDriverVersion()
# 获取 NVIDIA GPU 数量
info["cores"] = pynvml.nvmlDeviceGetCount()
# 遍历每个 GPU,获取 GPU 信息
Expand All @@ -108,7 +110,7 @@ def __get_nvidia_gpu_info():
gpu_name = gpu_name.decode("utf-8")
info["type"].append(gpu_name)
# 获取 GPU 的总显存, 单位为GB
info["memory"].append(round(pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024 ** 3)))
info["memory"].append(round(pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**3)))

except pynvml.NVMLError as e:
swanlog.debug(f"An error occurred when getting GPU info: {e}")
Expand Down Expand Up @@ -190,7 +192,7 @@ def __get_memory_size():
try:
# 获取系统总内存大小
mem = psutil.virtual_memory()
total_memory = round(mem.total / (1024 ** 3)) # 单位为GB
total_memory = round(mem.total / (1024**3)) # 单位为GB
return total_memory
except Exception as e:
swanlog.debug(f"An error occurred when getting memory size: {e}")
Expand Down Expand Up @@ -226,6 +228,23 @@ def get_requirements() -> str:
return None


def get_conda_env() -> str:
    """Return the output of ``conda list`` for the current environment.

    This is a best-effort probe: nothing is raised to the caller.

    :return: the captured stdout of ``conda list`` when the command exits
        with status 0. ``None`` is returned (despite the ``str`` annotation,
        which is kept for interface stability) when the command exits with a
        non-zero status or cannot be run at all; the reason is logged at
        debug level via ``swanlog``.
    """
    try:
        # Pipe stderr as well as stdout: without it ``result.stderr`` is
        # always None and the failure log below carries no diagnostic.
        result = subprocess.run(
            ["conda", "list"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except Exception as e:
        # Typically raised when the ``conda`` executable is not on PATH.
        swanlog.debug(f"An error occurred when getting conda env: {e}")
        return None

    # Check whether the command ran successfully.
    if result.returncode == 0:
        return result.stdout

    swanlog.debug(f"An error occurred when getting conda env:{result.stderr}")
    return None


def get_system_info(version: str, logdir: str):
"""获取系统信息
:param version: swanlab版本号
Expand Down
39 changes: 39 additions & 0 deletions swanlab/data/run/system/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ class SwanSystemMonitor:
memory: 内存数据,包含内存剩余量,使用量和使用率
system_cpu_usage: 系统CPU使用率
process_cpu_usage: 当前进程CPU使用率
system_temperature: 当前系统所有传感器的温度
system_cpu_freq: 当前系统CPU的频率
timestamp: 当前时间
"""

Expand Down Expand Up @@ -182,6 +184,41 @@ def get_process_cpu_usage(self):

return process_cpu_usage


def get_system_cpu_freq(self):
    """
    Return the system-wide CPU frequency at the time of the call.
    ---
    Info:
        Unit: MHz
        Precision: unspecified
        Availability: Linux, macOS, Windows, FreeBSD, OpenBSD

    WARNING: on Linux the ``current`` field is a real-time reading; on other
    platforms it is a fixed value.

    Return:
        whatever ``psutil.cpu_freq(percpu=False)`` returns, passed through
        unchanged.
    """
    # percpu=False asks psutil for a single aggregate reading rather than
    # one entry per logical CPU.
    return psutil.cpu_freq(percpu=False)

def get_system_temperature(self):
    """
    Return the current readings of the system's temperature sensors.
    ---
    Info:
        Unit: degrees Celsius
        Precision: the original note states two decimal places; nothing in
            this method rounds or enforces it
        Availability: Linux, FreeBSD

    Return:
        temperatures: {"device1": [], "device2": [], ...} as produced by
        ``psutil.sensors_temperatures()``.
        When psutil does not expose ``sensors_temperatures`` the string
        "platform not supported" is returned instead; when the call yields
        an empty result the string "can't read any temperature" is returned.

    TODO: possible sensor names are listed at
        https://www.kernel.org/doc/html/latest/subsystem-apis.html
        Known so far: k10temp (AMD CPU 10th~16th Opteron~zen3)
    """
    if hasattr(psutil, "sensors_temperatures"):
        readings = psutil.sensors_temperatures()
        # An empty mapping means no sensor could be read.
        return readings if readings else "can't read any temperature"
    # psutil only defines sensors_temperatures on supported platforms.
    return "platform not supported"

def get_all(self):
"""获取全部硬件数据"""
timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
Expand All @@ -194,6 +231,8 @@ def get_all(self):
"memory": self.get_memory_usage(),
"system_cpu_usage": self.get_system_cpu_usage(),
"process_cpu_usage": self.get_process_cpu_usage(),
"system_temperature":self.get_system_temperature(),
'system_cpu_freq':self.get_system_cpu_freq(),
"timestamp": timestamp,
}

Expand Down