Skip to content

Commit

Permalink
[SWDEV-398070] Adding logging to ROCm SMI (by default off)
Browse files Browse the repository at this point in the history
Updates:
    * [rocm-smi] Provide a thread-safe logging feature
    * [rocm-smi] Adding logrotation into install/upgrade/remove
      scripts
    * [rocm-smi] Updated cmake lists to include rocm_smi_logger
    * [rocm-smi] Updated DEB/RPM install/remove logging file &
      folder with all users having r/w privileges for
      /var/log/rocm_smi_lib/ROCm-SMI-lib.log
    * [rocm-smi] Added ability to do a glob search for multiple files
      (globFileExists), assists doing file searches with * strings
    * [rocm-smi] Added ability to log system details when RSMI_LOGGING
      is turned on (getSystemDetails())
    * [rocm-smi] Added logging to provide which ROCm API is being called
      when RSMI_LOGGING is on
    * [rocm-smi] Added logging to provide SYSFS path and read value,
      when RSMI_LOGGING is on. Provides error response on failure.
    * [rocm-smi] Added logging to provide SYSFS path and read value,
      when RSMI_LOGGING is on. Provides error response on failure.
    * [rocm-smi] Added environment variable RSMI_LOGGING to control
      when logging is enabled or disabled. By default, by not
      setting this env. variable, logging is turned off. When
      setting RSMI_LOGGING=<any value>, logging is enabled
      which is placed in /var/log/rocm_smi_lib/ROCm-SMI-lib.log file.
      Setting RSMI_LOGGING is allowed in both debug and release builds.
    * [rocm-smi] Removed an initialize procedure which keeps
      debug_inf_loop. Seems this feature is not being used.

Change-Id: I79b48387609c6233c6f05b04fb8bba66b68c2399
Signed-off-by: Charis Poag <[email protected]>
  • Loading branch information
charis-poag-amd committed May 18, 2023
1 parent ed74bc6 commit c3a095a
Show file tree
Hide file tree
Showing 16 changed files with 1,591 additions and 13 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_kfd.cc")
# Remaining common library sources, appended to the list started above.
# rocm_smi_logger.cc provides the logging feature added by this change.
list(APPEND CMN_SRC_LIST
  "${COMMON_SRC_DIR}/rocm_smi_io_link.cc"
  "${COMMON_SRC_DIR}/rocm_smi_gpu_metrics.cc"
  "${COMMON_SRC_DIR}/rocm_smi.cc"
  "${COMMON_SRC_DIR}/rocm_smi_logger.cc"
  "${SHR_MUTEX_DIR}/shared_mutex.cc"
)

set(CMN_INC_LIST "${COMMON_INC_DIR}/rocm_smi_device.h")
Expand All @@ -141,6 +142,7 @@ set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_counters.h")
# Remaining common library headers, appended to the list started above.
# rocm_smi_logger.h declares the logging feature added by this change.
list(APPEND CMN_INC_LIST
  "${COMMON_INC_DIR}/rocm_smi_kfd.h"
  "${COMMON_INC_DIR}/rocm_smi_io_link.h"
  "${COMMON_INC_DIR}/rocm_smi.h"
  "${COMMON_INC_DIR}/rocm_smi_logger.h"
  "${SHR_MUTEX_DIR}/shared_mutex.h"
)

## set components
Expand Down
97 changes: 96 additions & 1 deletion DEBIAN/postinst.in
Original file line number Diff line number Diff line change
@@ -1,6 +1,99 @@
#!/bin/bash

set -e
#set -x

# Create the ROCm SMI log directory and log file and make both readable and
# writable by every user.
# Maintainer scripts are run as root by dpkg, so the commands are invoked
# directly: sudo is not guaranteed to be installed (e.g. minimal containers),
# and with `set -e` a missing sudo would abort the whole install.
do_addLogFolder() {
    mkdir -p /var/log/rocm_smi_lib
    touch /var/log/rocm_smi_lib/ROCm-SMI-lib.log
    # Recursive, so this also covers the log file created just above.
    chmod -R a+rw /var/log/rocm_smi_lib
}

# Install the logrotate rule for the ROCm SMI log and make logrotate run
# hourly, either through the systemd logrotate.timer or through cron.
#
# Every command whose failure is an expected outcome is tested directly in an
# `if` condition. This script runs with `set -e`, so the previous
# "run command, then inspect $?" pattern aborted the script before the
# warning / fallback branch could ever be reached.
#
# Maintainer scripts are run as root by dpkg, so no sudo is used (sudo is not
# guaranteed to be installed).
#
# NOTE(review): this overwrites /lib/systemd/system/logrotate.timer and moves
# /etc/cron.daily/logrotate, both owned by other packages; the prerm script
# undoes these changes and relies on the same paths.
do_configureLogrotate() {
    local rotate_conf=/etc/logrotate.d/rocm_smi.conf
    local timer_unit=/lib/systemd/system/logrotate.timer

    if ! logrotate --version &>/dev/null; then
        echo "[WARNING] Detected logrotate is not installed." \
             "ROCm-smi logs (when turned on) will not rotate properly."
        return
    fi

    if [ ! -f "$rotate_conf" ]; then
        touch "$rotate_conf"
        chmod 644 "$rotate_conf" # root r/w, all others read
        # ROCm SMI logging rotation, rotates files using root user/group
        # Hourly logrotation check
        # Only rotates if size grew larger than 1MB
        # Max of 4 rotation files, oldest will be removed
        # Rotated files use date extension of ex. ROCm-SMI-lib.log.2023-05-09_16:51:42
        cat <<'EOF' > "$rotate_conf"
/var/log/rocm_smi_lib/ROCm-SMI-lib.log {
su root root
hourly
missingok
notifempty
rotate 4
size 1M
copytruncate
dateext
dateformat .%Y-%m-%d_%H:%M:%S
}
EOF
        # workaround: remove extra 'OURCE' text
        # from rocm_smi.conf. Unsure if CMAKE,
        # bash, or here document
        # issue (only seen on RHEL 8.7)
        sed -i s/OURCE//g "$rotate_conf"
    fi

    # check if logrotate uses system timers, Ubuntu/modern OS's do
    # Several older OS's like RHEL 8.7, do not. Instead defaults
    # to use daily cron jobs - see https://stackoverflow.com/a/69465677
    if ! systemctl list-timers | grep -iq logrotate; then
        if [ -f /etc/cron.daily/logrotate ]; then
            # Move logrotate from daily to hourly. The check is for the
            # hourly *directory*: the previous test required a logrotate
            # file to already exist at the destination, so the move never
            # happened on a stock system.
            if [ -d /etc/cron.hourly ]; then
                mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
            else
                echo "[WARNING] Could not find and configure hourly cron for ROCm-smi's" \
                     "logrotate. ROCm-smi logs (when turned on) will not rotate properly."
                return
            fi
        # no daily job: confirm that it's already been moved to hourly
        elif ! find /etc/cron.* -iname logrotate -print -quit | grep -iq hourly; then
            echo "[WARNING] Could not configure an hourly cron for ROCm-smi's logrotate." \
                 "ROCm-smi logs (when turned on) may not rotate properly."
        fi
    else
        # Configure systemd timers - the typical setup for modern Linux logrotation setups
        if [ -f "$timer_unit" ]; then
            # Keep a one-time backup so removal can restore the original unit.
            if [ ! -f "$timer_unit.backup" ]; then
                cp "$timer_unit" "$timer_unit.backup"
            fi
            cat <<'EOF' > "$timer_unit"
[Unit]
Description=Hourly rotation of log files
Documentation=man:logrotate(8) man:logrotate.conf(5)
[Timer]
OnCalendar=
OnCalendar=hourly
AccuracySec=1m
Persistent=true
[Install]
WantedBy=timers.target
EOF
            systemctl reenable --now logrotate.timer
        else
            echo "[WARNING] Could not configure systemd timer for ROCm's logrotate." \
                 "ROCm-smi logs (when turned on) will not rotate properly."
        fi
    fi
}

do_ldconfig() {
# left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build
Expand All @@ -13,6 +106,8 @@ do_ldconfig() {
case "$1" in
( configure )
do_ldconfig
do_addLogFolder
do_configureLogrotate
;;
( abort-upgrade | abort-remove | abort-deconfigure )
echo "$1"
Expand Down
20 changes: 20 additions & 0 deletions DEBIAN/prerm.in
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,24 @@

set -e

# Remove the ROCm SMI log directory and everything in it.
# Maintainer scripts are run as root by dpkg, so no sudo is needed; with
# `set -e` a missing sudo binary would abort package removal.
rm_logFolder() {
    rm -rf /var/log/rocm_smi_lib
}

# Undo the logrotate changes made at install time:
#   * delete the ROCm SMI logrotate rule,
#   * move an hourly logrotate cron job back to cron.daily,
#   * restore the backed-up systemd logrotate.timer and re-enable it.
# Maintainer scripts are run as root by dpkg, so no sudo is used (sudo is not
# guaranteed to be installed).
return_logrotateToOrigConfig() {
    local timer_unit=/lib/systemd/system/logrotate.timer

    if [ -f /etc/logrotate.d/rocm_smi.conf ]; then
        rm -f /etc/logrotate.d/rocm_smi.conf
    fi
    if [ -f /etc/cron.hourly/logrotate ]; then
        mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate
    fi
    if [ -f "$timer_unit.backup" ]; then
        cp "$timer_unit.backup" "$timer_unit"
        rm -f "$timer_unit.backup"
        # The unit file is already restored at this point. This script runs
        # with `set -e`, so a systemctl failure (e.g. systemd not running)
        # must not abort package removal; warn instead.
        systemctl reenable --now logrotate.timer ||
            echo "[WARNING] Could not re-enable logrotate.timer after restoring it."
    fi
}

rm_ldconfig() {
# left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build
if [ "@ENABLE_LDCONFIG@" == "ON" ]; then
Expand All @@ -19,6 +37,8 @@ case "$1" in
( remove | upgrade)
rm_ldconfig
rm_pyc
rm_logFolder
return_logrotateToOrigConfig
;;
( purge )
;;
Expand Down
102 changes: 102 additions & 0 deletions RPM/post.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,107 @@
#!/bin/bash
#set -x

# Create the ROCm SMI log directory and log file and make both readable and
# writable by every user.
# rpm runs scriptlets as root, and the ldconfig step in this script already
# writes to /etc without sudo, so the commands are invoked directly: sudo is
# not guaranteed to be installed (e.g. minimal containers).
do_addLogFolder() {
    mkdir -p /var/log/rocm_smi_lib
    touch /var/log/rocm_smi_lib/ROCm-SMI-lib.log
    # Recursive, so this also covers the log file created just above.
    chmod -R a+rw /var/log/rocm_smi_lib
}

# Install the logrotate rule for the ROCm SMI log and make logrotate run
# hourly, either through the systemd logrotate.timer or through cron.
#
# Commands whose failure is an expected outcome are tested directly in an
# `if` condition instead of inspecting $? afterwards, so the function keeps
# working if the script is ever run with `set -e`.
#
# rpm runs scriptlets as root, so no sudo is used (sudo is not guaranteed to
# be installed).
#
# NOTE(review): this overwrites /lib/systemd/system/logrotate.timer and moves
# /etc/cron.daily/logrotate, both owned by other packages; the preun script
# undoes these changes and relies on the same paths.
do_configureLogrotate() {
    local rotate_conf=/etc/logrotate.d/rocm_smi.conf
    local timer_unit=/lib/systemd/system/logrotate.timer

    if ! logrotate --version &>/dev/null; then
        echo "[WARNING] Detected logrotate is not installed." \
             "ROCm-smi logs (when turned on) will not rotate properly."
        return
    fi

    if [ ! -f "$rotate_conf" ]; then
        touch "$rotate_conf"
        chmod 644 "$rotate_conf" # root r/w, all others read
        # ROCm SMI logging rotation, rotates files using root user/group
        # Hourly logrotation check
        # Only rotates if size grew larger than 1MB
        # Max of 4 rotation files, oldest will be removed
        # Rotated files use date extension of ex. ROCm-SMI-lib.log.2023-05-09_16:51:42
        cat <<'EOF' > "$rotate_conf"
/var/log/rocm_smi_lib/ROCm-SMI-lib.log {
su root root
hourly
missingok
notifempty
rotate 4
size 1M
copytruncate
dateext
dateformat .%Y-%m-%d_%H:%M:%S
}
EOF
        # workaround: remove extra 'OURCE' text
        # from rocm_smi.conf. Unsure if CMAKE,
        # bash, or here document
        # issue (only seen on RHEL 8.7)
        sed -i s/OURCE//g "$rotate_conf"
    fi

    # check if logrotate uses system timers, Ubuntu/modern OS's do
    # Several older OS's like RHEL 8.7, do not. Instead defaults
    # to use daily cron jobs - see https://stackoverflow.com/a/69465677
    if ! systemctl list-timers | grep -iq logrotate; then
        if [ -f /etc/cron.daily/logrotate ]; then
            # Move logrotate from daily to hourly. The check is for the
            # hourly *directory*: the previous test required a logrotate
            # file to already exist at the destination, so the move never
            # happened on a stock system.
            if [ -d /etc/cron.hourly ]; then
                mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
            else
                echo "[WARNING] Could not find and configure hourly cron for ROCm-smi's" \
                     "logrotate. ROCm-smi logs (when turned on) will not rotate properly."
                return
            fi
        # no daily job: confirm that it's already been moved to hourly
        elif ! find /etc/cron.* -iname logrotate -print -quit | grep -iq hourly; then
            echo "[WARNING] Could not configure an hourly cron for ROCm-smi's logrotate." \
                 "ROCm-smi logs (when turned on) may not rotate properly."
        fi
    else
        # Configure systemd timers - the typical setup for modern Linux logrotation setups
        if [ -f "$timer_unit" ]; then
            # Keep a one-time backup so removal can restore the original unit.
            if [ ! -f "$timer_unit.backup" ]; then
                cp "$timer_unit" "$timer_unit.backup"
            fi
            cat <<'EOF' > "$timer_unit"
[Unit]
Description=Hourly rotation of log files
Documentation=man:logrotate(8) man:logrotate.conf(5)
[Timer]
OnCalendar=
OnCalendar=hourly
AccuracySec=1m
Persistent=true
[Install]
WantedBy=timers.target
EOF
            systemctl reenable --now logrotate.timer
        else
            echo "[WARNING] Could not configure systemd timer for ROCm's logrotate." \
                 "ROCm-smi logs (when turned on) will not rotate properly."
        fi
    fi
}

# left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build
if [ "@ENABLE_LDCONFIG@" == "ON" ]; then
    # Register the library install directory with the dynamic linker.
    echo -e "@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@" > /etc/ld.so.conf.d/x86_64-librocm_smi_lib.conf
    ldconfig
fi

# post install or upgrade, $1 is 1 or 2 -> do these actions
# $1 is quoted and defaults to 0 so a missing argument skips the setup
# cleanly instead of producing a test syntax error.
if [ "${1:-0}" -ge 1 ]; then
    do_addLogFolder
    do_configureLogrotate
fi
2 changes: 2 additions & 0 deletions RPM/postun.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

# second term originates from ENABLE_LDCONFIG = ON/OFF at package build
if [ $1 -le 1 ] && [ "@ENABLE_LDCONFIG@" == "ON" ]; then
# perform the below actions for rpm remove($1=0) or upgrade($1=1) operations
Expand Down
23 changes: 23 additions & 0 deletions RPM/preun.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,28 @@
#!/bin/bash
#set -x

# Remove the ROCm SMI log directory and everything in it.
# rpm runs scriptlets as root, so no sudo is needed (sudo is not guaranteed
# to be installed).
rm_logFolder() {
    rm -rf /var/log/rocm_smi_lib
}

# Undo the logrotate changes made at install time:
#   * delete the ROCm SMI logrotate rule,
#   * move an hourly logrotate cron job back to cron.daily,
#   * restore the backed-up systemd logrotate.timer and re-enable it.
# rpm runs scriptlets as root, so no sudo is used (sudo is not guaranteed to
# be installed).
return_logrotateToOrigConfig() {
    local timer_unit=/lib/systemd/system/logrotate.timer

    if [ -f /etc/logrotate.d/rocm_smi.conf ]; then
        rm -f /etc/logrotate.d/rocm_smi.conf
    fi
    if [ -f /etc/cron.hourly/logrotate ]; then
        mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate
    fi
    if [ -f "$timer_unit.backup" ]; then
        cp "$timer_unit.backup" "$timer_unit"
        rm -f "$timer_unit.backup"
        systemctl reenable --now logrotate.timer
    fi
}

# rpm passes in $1 the number of instances of this package that will remain
# installed: 0 on remove, 1 on upgrade.
if [ "$1" -le 1 ]; then
    # perform the below actions for rpm remove($1=0) or upgrade($1=1) operations
    # remove pyc file generated by python
    rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rocm_smi/__pycache__
fi

# Log folder and logrotate cleanup happens on full removal only. During an
# upgrade rpm runs the new package's %post before the old package's %preun,
# so cleaning up here on upgrade would delete the log folder and logrotate
# configuration that %post has just set up.
if [ "$1" -eq 0 ]; then
    rm_logFolder
    return_logrotateToOrigConfig
fi
4 changes: 4 additions & 0 deletions include/rocm_smi/rocm_smi_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,10 @@ struct RocmSMI_env_vars {
// comma delimited values.
std::unordered_set<uint32_t> enum_overrides;

// If RSMI_LOGGING is set (to any value), logging is enabled.
// If it is unset, logging is turned off.
uint32_t logging_on;

// Sysfs path overrides

// Env. var. RSMI_DEBUG_DRM_ROOT_OVERRIDE
Expand Down
Loading

0 comments on commit c3a095a

Please sign in to comment.