diff --git a/CHANGELOG.md b/CHANGELOG.md
index 90093cfd4386cc6101b49de4ff5ce9751742872d..8c0fecb14c5cf93c6f1f4704f3d5a8bafbe20165 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [1.2.2] - 2022-06-07
+### Added
+- gpumon: avoid crashing on no GPU (readlink)
+
 ## [1.2.1] - 2022-06-07
 ### Added
 - gpumon: avoid crashing on no GPU
diff --git a/src/metric-generators/gpumon.sh b/src/metric-generators/gpumon.sh
index a262c23e32d5a1c9aad651fe011ae57723b75a6a..c2c8549dde15d5ab4a8771f4cc01c863a39aa2e4 100755
--- a/src/metric-generators/gpumon.sh
+++ b/src/metric-generators/gpumon.sh
@@ -56,26 +56,34 @@ fi
 
 # browse the GPUs and export metrics
 METRIC_NAME="gpumon_device_state_code"
-STAGE_NAME="GPU cards ${METRIC_NAME} metrics generated"
+STAGE_NAME="GPU devices ${METRIC_NAME} metrics generated"
 if [ -n "${GPU_DEVICES}" ]; then
   get_metric_help "${METRIC_NAME}" "gauge" "GPU device state code (0/1 ~ available free/unavailable used)."
+  # Emit one metric sample per non-empty line of GPU_DEVICES.
+  echo "${GPU_DEVICES}" | \
+    while read -r i_gpu_device ; do
+      STAGE_NAME="GPU device ${METRIC_NAME} metrics generated (${i_gpu_device})"
+      [ -z "${i_gpu_device}" ] && \
+        continue
+      # Field 1 is used as the PCI location; the 2nd and 3rd double-quoted fields as vendor and model.
+      i_gpu_device_location="$(echo "${i_gpu_device}" | awk '{printf "%s", $1}')"
+      i_gpu_device_vendor="$(echo "${i_gpu_device}" | gawk 'BEGIN{FPAT = "(\"[^\"]+\")"}{printf "%s", $2}' | tr -d '"')"
+      i_gpu_device_model="$(echo "${i_gpu_device}" | gawk 'BEGIN{FPAT = "(\"[^\"]+\")"}{printf "%s", $3}' | tr -d '"')"
+      # Optional "-r<rev>" token; its "-r" prefix is dropped by the ${...:2} expansion when printing.
+      i_gpu_device_revision="$(echo "${i_gpu_device}" | grep -Eo -- "-r[^[:space:]]+")"
+      i_gpu_attached=0
+      i_vm_domain_name=""
+      # A kernel driver reported by lspci marks the device as attached (state 1).
+      if lspci -v -s "${i_gpu_device_location}" | grep -qE 'Kernel driver in use: .+'; then
+        i_gpu_attached=1
+        i_vm_domain_file="$(find_gpu_passthough_vm_manifest "${i_gpu_device_location}")"
+        # With no manifest path, basename would be run without an operand; keep the domain label empty instead.
+        if [ -n "${i_vm_domain_file}" ]; then
+          i_vm_domain_name="$(basename "$(echo "${i_vm_domain_file}" | head -1)" .xml)"
+        fi
+      fi
+      printf '%s{device="%s",vendor="%s",location="%s",revision="%s",domain="%s"} %d\n' "${METRIC_NAME}" \
+        "${i_gpu_device_model}" "${i_gpu_device_vendor}" "${i_gpu_device_location}" \
+        "${i_gpu_device_revision:2}" "${i_vm_domain_name}" "${i_gpu_attached}"
+    done
 fi
-echo "${GPU_DEVICES}" | \
-  while read i_gpu_device ; do
-    [ -z "${i_gpu_device}" ] && \
-      continue
-    i_gpu_device_location="$(echo "${i_gpu_device}" | awk '{printf $1}')"
-    i_gpu_device_vendor="$(echo "${i_gpu_device}" | gawk 'BEGIN{FPAT = "(\"[^\"]+\")"}{printf $2}' | tr -d '"')"
-    i_gpu_device_model="$(echo "${i_gpu_device}" | gawk 'BEGIN{FPAT = "(\"[^\"]+\")"}{printf $3}' | tr -d '"')"
-    i_gpu_device_revision="$(echo "${i_gpu_device}" | grep -Eo -- "-r[^ \t]+")"
-    i_gpu_attached=0
-    i_vm_domain_name=""
-    if lspci -v -s "${i_gpu_device_location}" | grep -qE 'Kernel driver in use: .+'; then
-      i_gpu_attached=1
-      i_vm_domain_file="$(find_gpu_passthough_vm_manifest "${i_gpu_device_location}")"
-      i_vm_domain_name="$(basename $(echo "${i_vm_domain_file}" | head -1) | sed 's/.xml//g')"
-    fi
-    printf '%s{device="%s",vendor="%s",location="%s",revision="%s",domain="%s"} %d\n' "${METRIC_NAME}" \
-      "${i_gpu_device_model}" "${i_gpu_device_vendor}" "${i_gpu_device_location}" \
-      "${i_gpu_device_revision:2}" "${i_vm_domain_name}" "${i_gpu_attached}"
-  done