Skip to content
Snippets Groups Projects
Commit 0c515077 authored by František Řezníček
Browse files

fix: gpumon no-GPU execution fixes

parent 93b28c38
No related branches found
No related tags found
No related merge requests found
Pipeline #161300 passed
...@@ -6,6 +6,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ...@@ -6,6 +6,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ## [Unreleased]
## [1.2.1] - 2022-06-07
### Added
- gpumon: avoid crashing when no GPU is present
- gpumon: do not emit the GPU metric help text when no GPU is present
## [1.2.0] - 2022-06-07 ## [1.2.0] - 2022-06-07
### Added ### Added
- GPU presence and availability added as gpumon metric generator - GPU presence and availability added as gpumon metric generator
......
...@@ -17,7 +17,9 @@ source "${CMG_SRC_DIR}/lib.sh" ...@@ -17,7 +17,9 @@ source "${CMG_SRC_DIR}/lib.sh"
# get_gpu_devices [lspci-additional-args] # get_gpu_devices [lspci-additional-args]
# list PCI devices one per line # list PCI devices one per line
function get_gpu_devices() { function get_gpu_devices() {
lspci -D -mm "$@" | grep -i nvidia | grep -Ei "(VGA|2D|3D).+controller" if ! lspci -D -mm "$@" | grep -i nvidia | grep -Ei "(VGA|2D|3D).+controller"; then
true
fi
} }
# find_gpu_passthough_vm_manifest <gpu-location> [libvirtd-qemu-dir] # find_gpu_passthough_vm_manifest <gpu-location> [libvirtd-qemu-dir]
...@@ -30,7 +32,7 @@ function find_gpu_passthough_vm_manifest () { ...@@ -30,7 +32,7 @@ function find_gpu_passthough_vm_manifest () {
local gpu_slot=$(echo -n "${gpu_location}" | sed 's/[:.]/ /g' | awk '{printf $3}') local gpu_slot=$(echo -n "${gpu_location}" | sed 's/[:.]/ /g' | awk '{printf $3}')
local gpu_function=$(echo -n "${gpu_location}" | sed 's/[:.]/ /g' | awk '{printf $4}') local gpu_function=$(echo -n "${gpu_location}" | sed 's/[:.]/ /g' | awk '{printf $4}')
test -d ${dir} || return 0 test -d "${dir}" || return 0
for i_gpu_manifest_file in $(ls "${dir}"/*.xml); do for i_gpu_manifest_file in $(ls "${dir}"/*.xml); do
if grep -i "domain=['\"]0x${gpu_domain}['\"]" "${i_gpu_manifest_file}" | \ if grep -i "domain=['\"]0x${gpu_domain}['\"]" "${i_gpu_manifest_file}" | \
...@@ -55,7 +57,9 @@ fi ...@@ -55,7 +57,9 @@ fi
# browse the GPUs and export metrics # browse the GPUs and export metrics
METRIC_NAME="gpumon_device_state_code" METRIC_NAME="gpumon_device_state_code"
STAGE_NAME="GPU cards ${METRIC_NAME} metrics generated" STAGE_NAME="GPU cards ${METRIC_NAME} metrics generated"
get_metric_help "${METRIC_NAME}" "gauge" "GPU device state code (0/1 ~ available free/unavailable used)." if [ -n "${GPU_DEVICES}" ]; then
get_metric_help "${METRIC_NAME}" "gauge" "GPU device state code (0/1 ~ available free/unavailable used)."
fi
echo "${GPU_DEVICES}" | \ echo "${GPU_DEVICES}" | \
while read i_gpu_device ; do while read i_gpu_device ; do
[ -z "${i_gpu_device}" ] && \ [ -z "${i_gpu_device}" ] && \
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment