From 0c31fcab0fb35c292d5ce4b72e5e1b50a1c5d5be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franti=C5=A1ek=20=C5=98ezn=C3=AD=C4=8Dek?= <246254@mail.muni.cz> Date: Tue, 7 Jun 2022 17:01:43 +0200 Subject: [PATCH] fix: avoid crashing on read(line) when no GPU found --- CHANGELOG.md | 4 ++++ src/metric-generators/gpumon.sh | 41 +++++++++++++++++---------------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 90093cf..8c0fecb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [1.2.2] - 2022-06-07 +### Added +- gpumon: avoid crashing on no GPU (readlink) + ## [1.2.1] - 2022-06-07 ### Added - gpumon: avoid crashing on no GPU diff --git a/src/metric-generators/gpumon.sh b/src/metric-generators/gpumon.sh index a262c23..c2c8549 100755 --- a/src/metric-generators/gpumon.sh +++ b/src/metric-generators/gpumon.sh @@ -56,26 +56,27 @@ fi # browse the GPUs and export metrics METRIC_NAME="gpumon_device_state_code" -STAGE_NAME="GPU cards ${METRIC_NAME} metrics generated" +STAGE_NAME="GPU devices ${METRIC_NAME} metrics generated" if [ -n "${GPU_DEVICES}" ]; then get_metric_help "${METRIC_NAME}" "gauge" "GPU device state code (0/1 ~ available free/unavailable used)." + echo "${GPU_DEVICES}" | \ + while read i_gpu_device ; do + STAGE_NAME="GPU device ${METRIC_NAME} metrics generated (${i_gpu_device})" + [ -z "${i_gpu_device}" ] && \ + continue + i_gpu_device_location="$(echo "${i_gpu_device}" | awk '{printf $1}')" + i_gpu_device_vendor="$(echo "${i_gpu_device}" | gawk 'BEGIN{FPAT = "(\"[^\"]+\")"}{printf $2}' | tr -d '"')" + i_gpu_device_model="$(echo "${i_gpu_device}" | gawk 'BEGIN{FPAT = "(\"[^\"]+\")"}{printf $3}' | tr -d '"')" + i_gpu_device_revision="$(echo "${i_gpu_device}" | grep -Eo -- "-r[^ \t]+")" + i_gpu_attached=0 + i_vm_domain_name="" + if lspci -v -s "${i_gpu_device_location}" | grep -qE 'Kernel driver in use: .+'; then + i_gpu_attached=1 + i_vm_domain_file="$(find_gpu_passthough_vm_manifest "${i_gpu_device_location}")" + i_vm_domain_name="$(basename $(echo "${i_vm_domain_file}" | head -1) | sed 's/.xml//g')" + fi + printf '%s{device="%s",vendor="%s",location="%s",revision="%s",domain="%s"} %d\n' "${METRIC_NAME}" \ + "${i_gpu_device_model}" "${i_gpu_device_vendor}" "${i_gpu_device_location}" \ + "${i_gpu_device_revision:2}" "${i_vm_domain_name}" "${i_gpu_attached}" + done fi -echo "${GPU_DEVICES}" | \ - while read i_gpu_device ; do - [ -z "${i_gpu_device}" ] && \ - continue - i_gpu_device_location="$(echo "${i_gpu_device}" | awk '{printf $1}')" - i_gpu_device_vendor="$(echo "${i_gpu_device}" | gawk 'BEGIN{FPAT = "(\"[^\"]+\")"}{printf $2}' | tr -d '"')" - i_gpu_device_model="$(echo "${i_gpu_device}" | gawk 'BEGIN{FPAT = "(\"[^\"]+\")"}{printf $3}' | tr -d '"')" - i_gpu_device_revision="$(echo "${i_gpu_device}" | grep -Eo -- "-r[^ \t]+")" - i_gpu_attached=0 - i_vm_domain_name="" - if lspci -v -s "${i_gpu_device_location}" | grep -qE 'Kernel driver in use: .+'; then - i_gpu_attached=1 - i_vm_domain_file="$(find_gpu_passthough_vm_manifest "${i_gpu_device_location}")" - i_vm_domain_name="$(basename $(echo "${i_vm_domain_file}" | head -1) | sed 's/.xml//g')" - fi - printf '%s{device="%s",vendor="%s",location="%s",revision="%s",domain="%s"} %d\n' "${METRIC_NAME}" \ - "${i_gpu_device_model}" "${i_gpu_device_vendor}" "${i_gpu_device_location}" \ - "${i_gpu_device_revision:2}" "${i_vm_domain_name}" "${i_gpu_attached}" - done -- GitLab