diff --git a/CHANGELOG.md b/CHANGELOG.md index de1fc4feef7dbec5640a121c283c4bd22490fb1c..f2472161744f7f872eeec271e5a21edfd60a615d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [1.2.0] - 2022-06-07 +### Added +- GPU presence and availability added as gpumon metric generator + ## [1.1.1] - 2022-05-03 ### Fixed - Fix chmod for gatewaypingmon diff --git a/dependencies.yum.txt b/dependencies.yum.txt index 4badcc2275ed6ee32ede56d2d3023952765ac1a7..1db6d4d73c4f82221dc3e08ae252e9233d62637f 100644 --- a/dependencies.yum.txt +++ b/dependencies.yum.txt @@ -5,3 +5,4 @@ docker-client curl cronie smartmontools +pciutils diff --git a/entrypoint.sh b/entrypoint.sh index 135ac1bcb314f1912ee1bdda2afabdfc67fba643..31c81a77dc321bed4fc2e2b38c617f8dbdbfd3f3 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -11,7 +11,7 @@ CMG_MAX_JITTER_DELAY="${CMG_MAX_JITTER_DELAY:-"3"}" CMG_OUT_METRICS_DIR="${CMG_OUT_METRICS_DIR:-"${CMG_BASE_DIR}/out-metrics-dir"}" CMG_STDOUT_LOG="${CMG_STDOUT_LOG:-"${CMG_BASE_DIR}/custom-metrics-generator.stdout.log"}" CMG_STDERR_LOG="${CMG_STDERR_LOG:-"${CMG_BASE_DIR}/custom-metrics-generator.stderr.log"}" -CMG_EXEC_MODULES="${CMG_EXEC_MODULES:-"smartmon dockermon puppetmon noderolemon gatewaypingmon"}" +CMG_EXEC_MODULES="${CMG_EXEC_MODULES:-"smartmon dockermon puppetmon noderolemon gatewaypingmon gpumon"}" CMG_NODEROLEMON_METRICS_TEXTFILE=${CMG_NODEROLEMON_METRICS_TEXTFILE:-"/etc/node-exporter/node-role.prom"} # load the library diff --git a/src/metric-generators/gpumon.sh b/src/metric-generators/gpumon.sh new file mode 100755 index 0000000000000000000000000000000000000000..efff12e1e28ebdc53c43dca68a19f930fb939373 --- /dev/null +++ b/src/metric-generators/gpumon.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash + +# prometheus textfile metrics generator for node-exporter textfile collector +# Monitors: gpu device status + +set -eo pipefail + 
# Directory two levels above this script (resolved through symlinks).
CMG_SRC_DIR="$(dirname "$(dirname "$(readlink -f "$0")")")"

# STAGE_NAME labels the step currently being executed.
# NOTE(review): presumably read by an error/reporting hook in lib.sh - that
# file is not visible here, confirm.
STAGE_NAME="configuration loaded"
source "${CMG_SRC_DIR}/../custom-metrics-generator.conf.env"

STAGE_NAME="library loaded"
source "${CMG_SRC_DIR}/lib.sh"

# local functions

# get_gpu_devices [lspci-additional-args]
#   list NVIDIA VGA/2D/3D controller PCI devices, one `lspci -D -mm` line each
#   returns non-zero when lspci fails or when no line matches (the script
#   runs with `set -o pipefail`)
function get_gpu_devices() {
  lspci -D -mm "$@" \
    | grep -i nvidia \
    | grep -Ei "(VGA|2D|3D).+controller"
}

# gpu_line_field <lspci-mm-line> <index>
#   print the <index>-th (1-based) double-quoted field of an `lspci -mm` line
#   without the quotes; in the lines produced by get_gpu_devices field 1 is
#   the class, 2 the vendor and 3 the device model
#   prints nothing when there is no such field; always returns 0
function gpu_line_field() {
  local rest="$1"
  local want="$2"
  local quoted_re='"([^"]*)"(.*)'
  local seen=0

  while [[ "${rest}" =~ ${quoted_re} ]]; do
    seen=$((seen + 1))
    if [[ "${seen}" -eq "${want}" ]]; then
      # '%s' keeps a literal '%' in a device name from being read as a format
      printf '%s' "${BASH_REMATCH[1]}"
      return 0
    fi
    rest="${BASH_REMATCH[2]}"
  done
  return 0
}

# gpu_line_revision <lspci-mm-line>
#   print the value of the `-rXX` revision token (without the `-r` prefix)
#   prints nothing when the line has no such token; always returns 0 so a
#   device without a revision does not abort the script under `set -e`
function gpu_line_revision() {
  local rest="$1"
  local bare=""
  local quoted_re='^([^"]*)"[^"]*"(.*)$'
  local rev_re='(^|[[:space:]])-r([^[:space:]]+)'

  # drop the quoted fields first so "-r" inside a device name never matches
  while [[ "${rest}" =~ ${quoted_re} ]]; do
    bare+="${BASH_REMATCH[1]} "
    rest="${BASH_REMATCH[2]}"
  done
  bare+="${rest}"

  if [[ "${bare}" =~ ${rev_re} ]]; then
    printf '%s' "${BASH_REMATCH[2]}"
  fi
  return 0
}

# list_unresolved_gpu_models <lspci-mm-lines>
#   print (sorted, unique) the device models that lack a "chip [marketing
#   name]" form, i.e. names the local PCI id database could not fully resolve
#   prints nothing and still returns 0 when every model is resolved
function list_unresolved_gpu_models() {
  local devices="$1"
  local line model
  local resolved_re='.+[[:space:]]\[.+\]'

  while read -r line; do
    model="$(gpu_line_field "${line}" 3)"
    if [[ -n "${model}" && ! "${model}" =~ ${resolved_re} ]]; then
      printf '%s\n' "${model}"
    fi
  done <<<"${devices}" | sort -u
}

# find_gpu_passthough_vm_manifest <gpu-location> [libvirtd-qemu-dir]
#   print the path of every *.xml file in the directory (default
#   /etc/libvirt/qemu) holding a line that carries the domain/bus/slot/function
#   attributes of <gpu-location> (format: dddd:bb:ss.f, as printed by lspci -D)
#   prints nothing and returns 0 when the directory or a match is missing
#   NOTE(review): any line with the four attributes matches, not only a
#   hostdev <source> address - confirm this is precise enough.
function find_gpu_passthough_vm_manifest() {
  local gpu_location="$1"
  local dir="${2:-/etc/libvirt/qemu}"
  local gpu_domain gpu_bus gpu_slot gpu_function manifest

  IFS=':.' read -r gpu_domain gpu_bus gpu_slot gpu_function _ \
    <<<"${gpu_location}" || true
  if [[ -z "${gpu_domain}" || -z "${gpu_bus}" \
    || -z "${gpu_slot}" || -z "${gpu_function}" ]]; then
    return 0
  fi

  [[ -d "${dir}" ]] || return 0

  # glob instead of parsing `ls`: safe for names with spaces and silent when
  # the directory holds no manifest (the unmatched pattern fails the -f test)
  for manifest in "${dir}"/*.xml; do
    [[ -f "${manifest}" ]] || continue
    # the last stage writes to /dev/null instead of using -q: an early exit
    # could SIGPIPE the upstream greps and, with pipefail, hide a real match
    if grep -i -- "domain=['\"]0x${gpu_domain}['\"]" "${manifest}" \
      | grep -i -- "bus=['\"]0x${gpu_bus}['\"]" \
      | grep -i -- "slot=['\"]0x${gpu_slot}['\"]" \
      | grep -i -- "function=['\"]0x${gpu_function}['\"]" >/dev/null; then
      printf '%s\n' "${manifest}"
    fi
  done
  return 0
}

# vm_domain_name_from_manifest <manifest-paths>
#   print the base name, without the .xml suffix, of the first path of a
#   newline separated list; prints nothing for an empty list; always returns 0
function vm_domain_name_from_manifest() {
  local first="${1%%$'\n'*}"

  [[ -n "${first}" ]] || return 0
  first="${first##*/}"
  printf '%s' "${first%.xml}"
}

# prom_label_escape <value>
#   print <value> with backslashes and double quotes escaped as required for
#   a label value in the Prometheus text exposition format
function prom_label_escape() {
  local value="$1"

  value="${value//\\/\\\\}"
  value="${value//\"/\\\"}"
  printf '%s' "${value}"
}

# gather all GPU cards (assure device model name is detected)
# NOTE(review): with the script-wide `set -eo pipefail` a node without any
# matching device aborts here, unless lib.sh alters the shell options -
# confirm this is intended, gpumon is enabled by default in entrypoint.sh.
STAGE_NAME="GPU cards successfully detected"
GPU_DEVICES="$(get_gpu_devices)"
GPU_DEVICES_UNRECOGNIZED="$(list_unresolved_gpu_models "${GPU_DEVICES}")"
if [[ -n "${GPU_DEVICES_UNRECOGNIZED}" ]]; then
  # reload GPU devices and resolve proper device names with internet PCI id database
  GPU_DEVICES="$(get_gpu_devices -q)"
fi

# browse the GPUs and export metrics
METRIC_NAME="gpumon_device_state_code"
STAGE_NAME="GPU cards ${METRIC_NAME} metrics generated"
get_metric_help "${METRIC_NAME}" "gauge" "GPU device state code (0/1 ~ available free/unavailable used)."
while read -r i_gpu_device; do
  [[ -n "${i_gpu_device}" ]] || continue

  i_gpu_device_location="${i_gpu_device%%[[:space:]]*}"
  i_gpu_device_vendor="$(gpu_line_field "${i_gpu_device}" 2)"
  i_gpu_device_model="$(gpu_line_field "${i_gpu_device}" 3)"
  i_gpu_device_revision="$(gpu_line_revision "${i_gpu_device}")"
  i_gpu_attached=0
  i_vm_domain_name=""

  # a bound kernel driver marks the card as used; a matching libvirt manifest,
  # when present, names the VM domain it is passed through to
  if lspci -v -s "${i_gpu_device_location}" \
    | grep -E 'Kernel driver in use: .+' >/dev/null; then
    i_gpu_attached=1
    i_vm_domain_file="$(find_gpu_passthough_vm_manifest "${i_gpu_device_location}")"
    i_vm_domain_name="$(vm_domain_name_from_manifest "${i_vm_domain_file}")"
  fi

  printf '%s{device="%s",vendor="%s",location="%s",revision="%s",domain="%s"} %d\n' \
    "${METRIC_NAME}" \
    "$(prom_label_escape "${i_gpu_device_model}")" \
    "$(prom_label_escape "${i_gpu_device_vendor}")" \
    "$(prom_label_escape "${i_gpu_device_location}")" \
    "$(prom_label_escape "${i_gpu_device_revision}")" \
    "$(prom_label_escape "${i_vm_domain_name}")" \
    "${i_gpu_attached}"
done <<<"${GPU_DEVICES}"