Skip to content
Snippets Groups Projects
Commit 93b28c38 authored by František Řezníček's avatar František Řezníček
Browse files

Merge branch 'feat/gpumon' into 'master'

feat: gpumon generator provides GPU availability metric

See merge request !6
parents 2fba3ecb 9a5cef88
No related branches found
No related tags found
1 merge request!6feat: gpumon generator provides GPU availability metric
Pipeline #161255 passed
......@@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
## [1.2.0] - 2022-06-07
### Added
- GPU presence and availability added as gpumon metric generator
## [1.1.1] - 2022-05-03
### Fixed
- Fix chmod for gatewaypingmon
......
......@@ -5,3 +5,4 @@ docker-client
curl
cronie
smartmontools
pciutils
......@@ -11,7 +11,7 @@ CMG_MAX_JITTER_DELAY="${CMG_MAX_JITTER_DELAY:-"3"}"
CMG_OUT_METRICS_DIR="${CMG_OUT_METRICS_DIR:-"${CMG_BASE_DIR}/out-metrics-dir"}"
CMG_STDOUT_LOG="${CMG_STDOUT_LOG:-"${CMG_BASE_DIR}/custom-metrics-generator.stdout.log"}"
CMG_STDERR_LOG="${CMG_STDERR_LOG:-"${CMG_BASE_DIR}/custom-metrics-generator.stderr.log"}"
CMG_EXEC_MODULES="${CMG_EXEC_MODULES:-"smartmon dockermon puppetmon noderolemon gatewaypingmon"}"
CMG_EXEC_MODULES="${CMG_EXEC_MODULES:-"smartmon dockermon puppetmon noderolemon gatewaypingmon gpumon"}"
CMG_NODEROLEMON_METRICS_TEXTFILE=${CMG_NODEROLEMON_METRICS_TEXTFILE:-"/etc/node-exporter/node-role.prom"}
# load the library
......
#!/usr/bin/env bash
# prometheus textfile metrics generator for node-exporter textfile collector
# Monitors: gpu device status
# abort on the first failing command or failing pipeline stage
set -eo pipefail
# parent of the directory holding this script (symlinks resolved); every
# expansion is quoted so an install path containing whitespace is not split
CMG_SRC_DIR="$(dirname "$(dirname "$(readlink -f "$0")")")"
# NOTE(review): STAGE_NAME is presumably consumed by lib.sh to report the
# stage that was reached - its consumer is not visible in this file
STAGE_NAME="configuration loaded"
source "${CMG_SRC_DIR}/../custom-metrics-generator.conf.env"
STAGE_NAME="library loaded"
source "${CMG_SRC_DIR}/lib.sh"
# local functions
# get_gpu_devices [lspci-additional-args]
#   Print the machine-readable (`lspci -D -mm`) records, one per line, that
#   mention "nvidia" and a VGA/2D/3D controller (both matched
#   case-insensitively). Extra arguments are handed to lspci unchanged.
get_gpu_devices() {
  lspci -D -mm "$@" \
    | grep -i nvidia \
    | grep -Ei "(VGA|2D|3D).+controller"
}
# find_gpu_passthough_vm_manifest <gpu-location> [libvirtd-qemu-dir]
#   Print, one per line, the path of every *.xml file in the directory
#   (default /etc/libvirt/qemu) that has a line carrying all four PCI address
#   attributes (domain=, bus=, slot=, function=) of the given GPU.
#   <gpu-location> has the form <domain>:<bus>:<slot>.<function> with hex
#   values without the 0x prefix, e.g. 0000:3b:00.0.
#   Prints nothing and returns 0 when the directory does not exist, holds no
#   *.xml file, or no file matches.
function find_gpu_passthough_vm_manifest () {
  local gpu_location="$1"
  local dir="${2:-"/etc/libvirt/qemu"}"
  local gpu_domain gpu_bus gpu_slot gpu_function gpu_rest
  local i_gpu_manifest_file

  # split the location on ':' and '.'; gpu_rest swallows unexpected extras
  IFS=':.' read -r gpu_domain gpu_bus gpu_slot gpu_function gpu_rest \
    <<< "${gpu_location}" || true

  test -d "${dir}" || return 0

  # glob instead of parsing `ls`: file names with whitespace stay intact and
  # an empty directory produces no error output
  for i_gpu_manifest_file in "${dir}"/*.xml; do
    # an unmatched glob is left as the literal pattern - skip non-files
    [ -f "${i_gpu_manifest_file}" ] || continue
    # the last grep discards its output instead of using -q, so it reads all
    # input and no upstream grep can be killed by SIGPIPE (which would turn a
    # match into a failure under `set -o pipefail`)
    if grep -i "domain=['\"]0x${gpu_domain}['\"]" "${i_gpu_manifest_file}" | \
         grep -i "bus=['\"]0x${gpu_bus}['\"]" | \
         grep -i "slot=['\"]0x${gpu_slot}['\"]" | \
         grep -i "function=['\"]0x${gpu_function}['\"]" > /dev/null; then
      printf '%s\n' "${i_gpu_manifest_file}"
    fi
  done
  return 0
}
# gather all GPU cards (assure device model name is detected)
STAGE_NAME="GPU cards successfully detected"
GPU_DEVICES="$(get_gpu_devices)"
# unique device names: third quoted field of each `lspci -mm` record
GPU_DEVICE_MODELS="$(printf '%s\n' "${GPU_DEVICES}" | \
  gawk 'BEGIN{FPAT = "(\"[^\"]+\")"}{print $3}' | \
  sort -u | tr -d '"')"
# keep only names that do NOT look like "<name> [<bracketed name>]".
# grep -v exits 1 when it prints nothing, i.e. when every name is in the
# expected form; with `set -eo pipefail` active that normal case would abort
# the script at this assignment, hence `|| true`.
# [[:blank:]] is used because '\t' inside a bracket expression is a literal
# backslash and 't', not a tab.
GPU_DEVICES_UNRECOGNIZED="$(printf '%s\n' "${GPU_DEVICE_MODELS}" | \
  grep -Ev '.+[[:blank:]]\[.+\]' || true)"
if [ -n "${GPU_DEVICES_UNRECOGNIZED}" ]; then
  # reload GPU devices and resolve proper device names with internet PCI id database
  GPU_DEVICES="$(get_gpu_devices -q)"
fi
# browse the GPUs and export metrics

# gpumon_print_device_metric <lspci-mm-device-line> [metric-name]
#   Print one metric sample for a single `lspci -D -mm` device record.
#   Labels: device, vendor, location, revision (empty when the record has no
#   -r<revision> field) and domain (name of the first VM manifest referencing
#   the GPU, empty when none is found).
#   Value: 1 when `lspci -v` reports a kernel driver in use for the device,
#   0 otherwise.
#   [metric-name] defaults to the global METRIC_NAME.
function gpumon_print_device_metric() {
  local device_line="$1"
  local metric_name="${2:-"${METRIC_NAME}"}"
  local location="" vendor="" model="" revision=""
  local attached=0 domain_file="" domain_name=""
  # <location> "<class>" "<vendor>" "<device>" [-r<revision>] ...
  local record_regex='^[[:space:]]*([^[:space:]]+)[[:space:]]+"[^"]+"[[:space:]]+"([^"]+)"[[:space:]]+"([^"]+)"([[:space:]]+-r([^[:space:]]+))?'

  if [[ "${device_line}" =~ ${record_regex} ]]; then
    location="${BASH_REMATCH[1]}"
    vendor="${BASH_REMATCH[2]}"
    model="${BASH_REMATCH[3]}"
    # the revision group is optional; a missing revision is not an error
    revision="${BASH_REMATCH[5]:-}"
  else
    # malformed record: still emit a sample, identified by its first word
    location="${device_line%%[[:space:]]*}"
  fi

  # output is discarded instead of using grep -q so lspci can never receive
  # SIGPIPE, which `set -o pipefail` would report as "no driver"
  if lspci -v -s "${location}" | grep -E 'Kernel driver in use: .+' > /dev/null; then
    attached=1
    domain_file="$(find_gpu_passthough_vm_manifest "${location}")"
    # first manifest only; every step tolerates an empty result (driver bound
    # but no VM manifest references the GPU)
    domain_file="${domain_file%%$'\n'*}"
    domain_name="${domain_file##*/}"
    domain_name="${domain_name%.xml}"
  fi

  printf '%s{device="%s",vendor="%s",location="%s",revision="%s",domain="%s"} %d\n' \
    "${metric_name}" "${model}" "${vendor}" "${location}" \
    "${revision}" "${domain_name}" "${attached}"
}

METRIC_NAME="gpumon_device_state_code"
STAGE_NAME="GPU cards ${METRIC_NAME} metrics generated"
get_metric_help "${METRIC_NAME}" "gauge" "GPU device state code (0/1 ~ available free/unavailable used)."
printf '%s\n' "${GPU_DEVICES}" | \
  while read -r i_gpu_device; do
    if [ -z "${i_gpu_device}" ]; then
      continue
    fi
    gpumon_print_device_metric "${i_gpu_device}" "${METRIC_NAME}"
  done
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment