Skip to content
Snippets Groups Projects
Commit a2abe5a0 authored by František Řezníček's avatar František Řezníček
Browse files

Merge branch 'initial-release' into 'master'

feat: initial commit

See merge request !1
parents 2e65c1f0 e37df3ea
No related branches found
No related tags found
1 merge request!1feat: initial commit
Pipeline #92930 passed
# Pipeline-wide configuration: stage order, job image, docker-in-docker
# service and the preparation steps executed before every job's script.
stages:
  - build
  - build-image
  - release-image

image: registry.gitlab.ics.muni.cz:443/cloud/container-registry/docker:latest

services:
  - name: registry.gitlab.ics.muni.cz:443/cloud/container-registry/docker:latest-dind
    alias: docker

variables:
  DOCKER_EXE: docker

before_script:
  - apk update
  - apk add $(cat ci/dependencies.apk.txt)
  # pass the password on stdin so it never appears on the command line
  - echo "${CI_REGISTRY_PASSWORD}" | ${DOCKER_EXE} login -u "${CI_REGISTRY_USER}" --password-stdin "${CI_REGISTRY}"
  # assign first, export second: `export VAR=$(cmd)` returns the status of
  # `export` and would hide a failing ci/version.sh
  - VERSION="$(ci/version.sh CHANGELOG.md)"
  - export VERSION
# build container image and push it under the commit-SHA tag
# ---------------------------------------------------------------------------
# The pushed image URI is stored in container-release.uri and handed over to
# later stages as a job artifact.
build-image:
  stage: build-image
  script: |
    echo "${CI_REGISTRY_IMAGE}:${CI_COMMIT_SHORT_SHA}" > container-release.uri
    tar czf custom-metrics-generator_files.tgz entrypoint.sh dependencies*.txt src/*.sh src/*/*.sh CHANGELOG.md Dockerfile
    ${DOCKER_EXE} build --pull -t "$(head -1 container-release.uri)" \
      --build-arg "VERSION=${VERSION}" \
      --build-arg "BUILD_DATE=$(date +%Y-%m-%dT%H:%M:%S)" \
      --build-arg "CI_BUILD_HOSTNAME=$(hostname)" \
      --build-arg "CI_COMMIT_SHA=${CI_COMMIT_SHA}" \
      --build-arg "CI_BUILD_JOB_NAME=${CI_JOB_NAME}" \
      --build-arg "CI_BUILD_ID=${CI_JOB_ID}" .
    ${DOCKER_EXE} push "$(head -1 container-release.uri)"
  artifacts:
    expire_in: 2 mo
    # CI_JOB_NAME / CI_COMMIT_REF_NAME replace the legacy CI_BUILD_NAME /
    # CI_BUILD_REF_NAME names, consistent with the CI_JOB_* variables used in
    # the script above
    name: "${CI_JOB_NAME}-${CI_COMMIT_REF_NAME}-container-uri"
    paths:
      - container-release.uri
# release container image
# ---------------------------------------------------------------------------
# Runs for version tags only (vX.Y.Z...): re-tags the image recorded in
# container-release.uri as ${VERSION} and latest, pushes both tags and appends
# their URIs to container-release.uri.
release-image:
  stage: release-image
  script: |
    CONTAINER_IMAGE="$(head -1 container-release.uri)"
    ${DOCKER_EXE} pull "${CONTAINER_IMAGE}"
    for i_container_image_tag in ${VERSION} latest; do
      ${DOCKER_EXE} tag "${CONTAINER_IMAGE}" "${CI_REGISTRY_IMAGE}:${i_container_image_tag}"
      ${DOCKER_EXE} push "${CI_REGISTRY_IMAGE}:${i_container_image_tag}"
      echo "${CI_REGISTRY_IMAGE}:${i_container_image_tag}" >> container-release.uri
    done
  only:
    - /^v[0-9]+\.[0-9]+\.[0-9]+/
  except:
    - branches
  artifacts:
    expire_in: 2 mo
    # CI_JOB_NAME / CI_COMMIT_REF_NAME replace the legacy CI_BUILD_NAME /
    # CI_BUILD_REF_NAME names
    name: "${CI_JOB_NAME}-${CI_COMMIT_REF_NAME}-container-release-uri"
    paths:
      - container-release.uri
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [1.0.0] - 2021-06-29
### Added
- Initial release
# custom-metrics-generator container image
FROM centos:7

# build metadata passed in with --build-arg; only used for the labels below
ARG VERSION=unknown-version
ARG BUILD_DATE=unknown-date
ARG CI_COMMIT_SHA=unknown
ARG CI_BUILD_HOSTNAME
ARG CI_BUILD_JOB_NAME
ARG CI_BUILD_ID

# yq release the binary and its checksum list are downloaded from
ARG YQ_BINARY_BASEURL="https://github.com/mikefarah/yq/releases/download/v4.9.6"
ARG YQ_BINARY_URL="${YQ_BINARY_BASEURL}/yq_linux_amd64"
ARG YQ_CHECKSUMS_URL="${YQ_BINARY_BASEURL}/checksums"

COPY dependencies.yum.txt /tmp

# Single layer: install OS packages, download yq, require that its sha256 is
# listed for the binary in the release checksum file, install it and clean up.
# curl --fail makes HTTP errors abort the build right at the download instead
# of saving the error page as the artifact.
RUN yum -y install epel-release && \
    yum -y update && \
    yum -y install $(cat /tmp/dependencies.yum.txt) && \
    curl -fL "${YQ_BINARY_URL}" --output "/tmp/$(basename "${YQ_BINARY_URL}")" && \
    curl -fL "${YQ_CHECKSUMS_URL}" --output "/tmp/$(basename "${YQ_CHECKSUMS_URL}")" && \
    bash -xc 'cd /tmp ; f="$(basename ${YQ_BINARY_URL})" ; chsum="$(sha256sum "${f}" | awk "{print \$1}")" ; grep -F "${f}" checksums | grep -F " ${chsum}"' && \
    mv -f "/tmp/$(basename "${YQ_BINARY_URL}")" /usr/local/bin/yq && \
    chmod +x /usr/local/bin/yq && \
    rm -f "/tmp/$(basename "${YQ_CHECKSUMS_URL}")" "/tmp/dependencies.yum.txt" && \
    yum clean all && \
    mkdir -p /opt/custom-metrics-generator

# ADD unpacks the tarball produced by the CI build-image job
ADD custom-metrics-generator_files.tgz /opt/custom-metrics-generator/

ENTRYPOINT ["/opt/custom-metrics-generator/entrypoint.sh"]

LABEL maintainer="MetaCentrum Cloud Team <cloud[at]ics.muni.cz>" \
      org.label-schema.schema-version="1.0.0-rc.1" \
      org.label-schema.vendor="Masaryk University, ICS" \
      org.label-schema.name="custom-metrics-generator" \
      org.label-schema.version="$VERSION" \
      org.label-schema.build-date="$BUILD_DATE" \
      org.label-schema.build-ci-job-name="$CI_BUILD_JOB_NAME" \
      org.label-schema.build-ci-build-id="$CI_BUILD_ID" \
      org.label-schema.build-ci-host-name="$CI_BUILD_HOSTNAME" \
      org.label-schema.url="https://gitlab.ics.muni.cz/cloud/custom-metrics-generator" \
      org.label-schema.vcs-url="https://gitlab.ics.muni.cz/cloud/custom-metrics-generator" \
      org.label-schema.vcs-ref="$CI_COMMIT_SHA"
bash
gawk
#!/usr/bin/env bash
# Get project version
# Usage:
#   * get-version <changelog-version-file>
#
# Prints the project version on stdout:
#   * tagged build (CI_COMMIT_TAG looks like [v]X.Y.Z): the newest released
#     changelog version; the tag has to match it (with or without leading "v")
#   * any other build: <changelog-version>_<CI_COMMIT_SHA>_<CI_PIPELINE_ID>
# Diagnostics go to stderr so that callers capturing stdout never receive an
# error message as the version. Exit code is 1 on any error.

CHANGELOG_FILE="$1"

if [[ ! -r "${CHANGELOG_FILE}" ]]; then
  echo "Cannot get project version as changelog file '${CHANGELOG_FILE}' is not readable" 1>&2
  exit 1
fi

# first "## [<version>]" heading which is not the [Unreleased] section
changelog_version=$(grep -E '^##[[:space:]]+\[.+\]' "${CHANGELOG_FILE}" | \
  awk '{print substr($2,2,length($2)-2)}' | grep -v '[Uu]nreleased' | head -1)

if [[ "${CI_COMMIT_TAG}" =~ ^v?[0-9]+\.[0-9]+\.[0-9]+ ]]; then
  if [[ "${CI_COMMIT_TAG}" != "${changelog_version}" && "${CI_COMMIT_TAG}" != "v${changelog_version}" ]]; then
    echo "Cannot get project version as tag claims version ${CI_COMMIT_TAG} but changelog version is ${changelog_version}" 1>&2
    exit 1
  fi
  echo "${changelog_version}"
else
  echo "${changelog_version}_${CI_COMMIT_SHA}_${CI_PIPELINE_ID}"
fi
jq
bash
gawk
docker-client
curl
cronie
smartmontools
#!/usr/bin/env bash
# entrypoint.sh
#
# Container entrypoint of the custom-metrics-generator:
#   * resolves the CMG_* configuration (environment overrides the defaults)
#   * starts background rotation of the stdout/stderr log files
#   * writes root's crontab with one per-minute entry per metric module
#   * dumps the CMG_* configuration to custom-metrics-generator.conf.env
#   * tails both log files to the container output and hands over to crond
set -eo pipefail

# tracing is enabled only when CMG_EP_TRACE is exactly 1, true or True
# (the group is required, otherwise each anchor binds to one alternative only)
if [[ "${CMG_EP_TRACE}" =~ ^(1|[Tt]rue)$ ]]; then
  set -x
fi

# constants
CMG_BASE_DIR=$(dirname "$(readlink -f "$0")")
CMG_MAX_JITTER_DELAY="${CMG_MAX_JITTER_DELAY:-"3"}"
CMG_OUT_METRIC_DIR="${CMG_OUT_METRIC_DIR:-"${CMG_BASE_DIR}/out-metric-dir"}"
CMG_STDOUT_LOG="${CMG_STDOUT_LOG:-"${CMG_BASE_DIR}/custom-metrics-generator.stdout.log"}"
CMG_STDERR_LOG="${CMG_STDERR_LOG:-"${CMG_BASE_DIR}/custom-metrics-generator.stderr.log"}"
CMG_EXEC_MODULES="${CMG_EXEC_MODULES:-"smartmon dockermon puppetmon noderolemon"}"
CMG_NODEROLEMON_METRICS_TEXTFILE=${CMG_NODEROLEMON_METRICS_TEXTFILE:-"/etc/node-exporter/node-role.prom"}

# load the library
source "${CMG_BASE_DIR}/src/lib.sh"

# create logs if missing and rotate them
touch "${CMG_STDOUT_LOG}" "${CMG_STDERR_LOG}"
rotate_file "${CMG_STDOUT_LOG}" &
rotate_file "${CMG_STDERR_LOG}" &

# create out directory (if not present)
test -d "${CMG_OUT_METRIC_DIR}" || \
  mkdir -p "${CMG_OUT_METRIC_DIR}"

# generate root's crontab: every module runs each minute, module N is started
# with a fixed delay of N * 5 seconds plus a random jitter
i_fixed_sleep=0
# CMG_EXEC_MODULES is a whitespace separated list, word splitting is intended
for i_module in ${CMG_EXEC_MODULES}; do
  touch "${CMG_OUT_METRIC_DIR}/${i_module}.prom" \
        "${CMG_OUT_METRIC_DIR}/custom-metrics-generator_${i_module}.prom" > /dev/null
  echo "* * * * * ${CMG_BASE_DIR}/src/delay-jitter-exec.sh ${i_fixed_sleep} ${CMG_MAX_JITTER_DELAY} ${CMG_BASE_DIR}/src/metric-generator-exec.sh ${i_module} ${CMG_OUT_METRIC_DIR}/${i_module}.prom ${CMG_OUT_METRIC_DIR}/custom-metrics-generator_${i_module}.prom"
  i_fixed_sleep=$(( i_fixed_sleep + 5 ))
done > /var/spool/cron/root

# persist the CMG_* variables as "export ..." lines and print them
echo "custom-metrics-generator configuration:"
set | grep -E '^CMG_' | awk '{print "export " $0}' > "${CMG_BASE_DIR}/custom-metrics-generator.conf.env"
awk '{$1=" ";print}' "${CMG_BASE_DIR}/custom-metrics-generator.conf.env"

# attach log files to container's stdout and stderr
tail -F --max-unchanged-stats=10 "${CMG_STDOUT_LOG}" &
tail -F --max-unchanged-stats=10 "${CMG_STDERR_LOG}" 1>&2 &

# schedule periodic actions with cron
exec crond -n
#!/usr/bin/env bash
# delay-jitter-exec.sh
#
# Usage:
#   delay-jitter-exec.sh <fixed-delay-secs> <max-jitter-secs> <cmd> [cmd-arguments]
#
# Execute specified command <cmd> [cmd-arguments]
# with the
#   * fixed delay <fixed-delay-secs>
#   * jitter delay picked randomly from range < 0, <max-jitter-secs> >
# Delays which are not non-negative decimal numbers are treated as 0.
#
# Examples:
#   # launch `smartmon.sh /dev/sda` after delay which is in range of <0.5 + 0, 0.5 + 1.15> seconds
#   $ delay-jitter-exec.sh 0.5 1.15 smartmon.sh /dev/sda

fixed_delay_secs=${1:-0}
max_jitter_secs=${2:-0}

# non-negative decimal number (3, 0.5, 1.); kept in a variable so the pattern
# is passed to =~ unquoted and identically for both arguments
number_regex='^[0-9]+(\.[0-9]*)?$'
[[ "${fixed_delay_secs}" =~ ${number_regex} ]] || \
  fixed_delay_secs=0
[[ "${max_jitter_secs}" =~ ${number_regex} ]] || \
  max_jitter_secs=0

[ "${fixed_delay_secs}" != "0" ] && \
  sleep "${fixed_delay_secs}"

# ${RANDOM} is in range 0..32767, scale it to 0..<max-jitter-secs>
[ "${max_jitter_secs}" != "0" ] && \
  awk -v "rnd=${RANDOM}" -v "max_jitter_secs=${max_jitter_secs}" \
    'BEGIN{system(sprintf("sleep %f", rnd * max_jitter_secs / 32767.0))}'

# drop the two delay arguments (or fewer when they were not given at all)
shift $(( $# < 2 ? $# : 2 ))

# execute <cmd>
"$@"
# get_metric_help ( <metric-name> <metric-type> <metric-desc> )
#   prints the "# HELP" and "# TYPE" header lines of a text-format metric
get_metric_help() {
  local name="$1" kind="$2" description="$3"

  printf '# HELP %s %s\n# TYPE %s %s\n' \
    "${name}" "${description}" \
    "${name}" "${kind}"
}
# rotate_file ( <log-path> [file-max-size] )
#   endless loop (meant to be run in background) which checks <log-path>
#   every 30 seconds; once the file is bigger than [file-max-size] bytes
#   (default 10 MiB) its content is copied to <log-path>.rotated.<timestamp>,
#   any previous <log-path>.rotated.* copy is removed and <log-path> is
#   truncated in place (so writers/readers keeping it open are not affected)
function rotate_file() {
  local file_path="$1"
  local max_file_size="${2:-$((10 * 1024 * 1024))}"
  local i_time i_file_size

  while true; do
    i_time="$(date +%Y%m%dT%H%M%S)"
    # the size is compared only when stat succeeded; a (temporarily) missing
    # file would otherwise feed an empty operand to the arithmetic test
    if i_file_size="$(stat --format=%s "${file_path}")" && \
       (( i_file_size > max_file_size )); then
      # quoted prefix + unquoted glob: safe for paths containing whitespace
      rm -f "${file_path}".rotated.*
      cat "${file_path}" > "${file_path}.rotated.${i_time}"
      truncate --size=0 "${file_path}"
      log_stdout "${file_path} rotated to ${file_path}.rotated.${i_time} (size was ${i_file_size} > ${max_file_size})"
    fi
    sleep 30
  done
}
# __log ( <file> <msg> [msg] ...)
#   appends the timestamped message to <file>;
#   when <file> is not an existing regular file the message goes to stderr
__log() {
  local target="$1"
  shift

  local entry="[$(date '+%Y-%m-%d %H:%M:%S.%N')] $@"

  if [ ! -f "${target}" ]; then
    echo "${entry}" 1>&2
    return
  fi
  echo "${entry}" >> "${target}"
}
# log_stdout ( <msg> [msg] ...)
#   logs message via __log to the file named by CMG_STDOUT_LOG
log_stdout() {
  local -r destination="${CMG_STDOUT_LOG}"
  __log "${destination}" "$@"
}
# log_stderr ( <msg> [msg] ...)
#   logs message via __log to the file named by CMG_STDERR_LOG
log_stderr() {
  local -r destination="${CMG_STDERR_LOG}"
  __log "${destination}" "$@"
}
#!/usr/bin/env bash
# metric-generator-exec.sh
#
# Usage:
#   metric-generator-exec.sh <metric-generator-module-name> <metrics-generator-output-textfile-metric-file> <runner-output-textfile-metric-file>
#
# Executes specified metric generator module <metric-generator-module-name>
# writes following text metric files:
#   * <metrics-generator-output-textfile-metric-file> generated by the <metric-generator-module-name>
#   * <runner-output-textfile-metric-file> generated by this runner (measuring <metric-generator-module-name> is successfully executed)
#
# Examples:
#   # launch `smartmon` metric generator module
#   $ metric-generator-exec.sh smartmon /tmp/smartmon.prom /tmp/cmg-smartmon.prom

set -eo pipefail

# arguments
METRICS_GENERATOR_NAME="$1"
OUTPUT_METRIC_GENERATOR_METRIC_FILE="$2"
OUTPUT_METRIC_GENERATOR_RUNNER_METRIC_FILE="$3"

# constants
# (path expansions are quoted so an install path with whitespace still works)
CMG_SRC_DIR=$(dirname "$(readlink -f "$0")")
CMG_METRIC_GENERATOR_EXEC_TIMEOUT="40"
# first file in metric-generators/ whose name starts with the module name
METRICS_GENERATOR_FILE="$(ls "${CMG_SRC_DIR}/metric-generators/${METRICS_GENERATOR_NAME}"* | head -1)"
GENERATOR_START_TIME=$(date +%s)

# STAGE_NAME names the step which is about to be performed; it is reported
# when the job fails
STAGE_NAME="configuration loaded"
source "${CMG_SRC_DIR}/../custom-metrics-generator.conf.env"

STAGE_NAME="library loaded"
source "${CMG_SRC_DIR}/lib.sh"
# get_metric_text( <metric-name> <metric-generator-name> <metric-timestamp-value> )
#   prints HELP/TYPE header and the sample line of one of the runner metrics
#   (job_last_run_timestamp, job_last_successful_run_timestamp);
#   any other metric name is logged as an error and the function fails
get_metric_text() {
  local name="$1"
  local generator="$2"
  local timestamp="$3"
  local description

  case "${name}" in
    job_last_run_timestamp)
      description="Last job execution timestamp."
      ;;
    job_last_successful_run_timestamp)
      description="Last successful job execution timestamp."
      ;;
    *)
      log_stderr "ERROR: get_metric_text() does not know how to generate ${name} metric"
      return 1
      ;;
  esac

  get_metric_help "${name}" "gauge" "${description}"
  printf '%s{app="%s",generator="%s"} %d\n' "${name}" "custom-metrics-generator" "${generator}" "${timestamp}"
}
# at_exit()
#   callback procedure executed at exit (trap)
#   * completes a partially written <runner-output-textfile-metric-file>.tmp:
#     each runner metric missing there is carried over from the previous
#     <runner-output-textfile-metric-file> (or generated with value 0 when it
#     was never written) and the file is moved in place
#   * dumps shell variables to /tmp/cmg-<module>-trace.log
#   * logs the job result; exits with 1 unless STAGE_NAME is "success"
function at_exit() {
  set +e

  local runner_file="${OUTPUT_METRIC_GENERATOR_RUNNER_METRIC_FILE}"
  local i_metric

  # handling partial <runner-output-textfile-metric-file> data
  if [ -e "${runner_file}.tmp" ]; then
    # both metrics are checked independently, so the successful-run timestamp
    # is preserved even when the last-run timestamp had to be restored as well
    for i_metric in job_last_run_timestamp job_last_successful_run_timestamp; do
      if grep -q "^${i_metric}" "${runner_file}.tmp"; then
        continue
      fi
      if grep -q "^${i_metric}" "${runner_file}"; then
        # unanchored on purpose: copies the "# HELP"/"# TYPE" lines too
        grep "${i_metric}" "${runner_file}" >> "${runner_file}.tmp"
      else
        get_metric_text "${i_metric}" "${METRICS_GENERATOR_NAME}" 0 >> "${runner_file}.tmp"
      fi
    done
    mv -f "${runner_file}.tmp" "${runner_file}"
  fi

  local env_dump_file="/tmp/cmg-${METRICS_GENERATOR_NAME}-trace.log"
  set > "${env_dump_file}"

  if [ "${STAGE_NAME}" != "success" ]; then
    log_stderr "ERROR: Job ${METRICS_GENERATOR_NAME} failed at step \"${STAGE_NAME}\" (PPID: $$, duration: ${SECONDS} sec[s], see env. dump at ${env_dump_file})."
    exit 1
  fi
  log_stdout "INFO: Job ${METRICS_GENERATOR_NAME} succeeded. (PPID: $$, duration: ${SECONDS} sec[s])"
}
# Main flow. STAGE_NAME is set before each step; as the script runs with
# `set -e`, the first failing step terminates it and at_exit reports the
# stage which did not complete.
trap at_exit EXIT

log_stdout "INFO: Job ${METRICS_GENERATOR_NAME} started. (PPID: $$)"

STAGE_NAME="job_last_run_timestamp metric generated"
rm -f "${OUTPUT_METRIC_GENERATOR_RUNNER_METRIC_FILE}.tmp"
get_metric_text "job_last_run_timestamp" "${METRICS_GENERATOR_NAME}" "${GENERATOR_START_TIME}" >> "${OUTPUT_METRIC_GENERATOR_RUNNER_METRIC_FILE}.tmp"

STAGE_NAME="metric-generator successfuly executed"
# the module gets the path of its previously generated metric file as argument
timeout "${CMG_METRIC_GENERATOR_EXEC_TIMEOUT}" "${METRICS_GENERATOR_FILE}" "${OUTPUT_METRIC_GENERATOR_METRIC_FILE}" > "${OUTPUT_METRIC_GENERATOR_METRIC_FILE}.tmp"

STAGE_NAME="metric-generator generated valid metrics"
# an empty output is treated as a failure
test -s "${OUTPUT_METRIC_GENERATOR_METRIC_FILE}.tmp"

STAGE_NAME="metric-generator atomicly wrote generated metrics"
mv -f "${OUTPUT_METRIC_GENERATOR_METRIC_FILE}.tmp" "${OUTPUT_METRIC_GENERATOR_METRIC_FILE}"

# distinct label: a failure here must not be reported as the first stage
STAGE_NAME="job_last_successful_run_timestamp metric generated"
GENERATOR_END_TIME=$(date +%s)
get_metric_text "job_last_successful_run_timestamp" "${METRICS_GENERATOR_NAME}" "${GENERATOR_END_TIME}" >> "${OUTPUT_METRIC_GENERATOR_RUNNER_METRIC_FILE}.tmp"
mv -f "${OUTPUT_METRIC_GENERATOR_RUNNER_METRIC_FILE}.tmp" "${OUTPUT_METRIC_GENERATOR_RUNNER_METRIC_FILE}"

STAGE_NAME="success"
#!/usr/bin/env bash
# prometheus textfile metrics generator for node-exporter textfile collector
# Monitors: docker container restarts
# cAdvisor does not provide container number of restarts nicely (conditionally in label only)
#
# Prints metric dockermon_container_restarts_total{id,name} for every
# container reported by `docker ps -a`. All containers are inspected and
# validated first; any failure aborts the script (set -e) before a single
# metric line is printed.

set -eo pipefail

METRIC_NAME="dockermon_container_restarts_total"
CONTAINER_IDS=()
CONTAINER_NAMES=()
CONTAINER_RESTART_COUNTS=()

CONTAINERS=$(docker ps -a -q)

for i_container in ${CONTAINERS}; do
  i_container_status="$(docker inspect "${i_container}")"
  i_container_name="$(echo "${i_container_status}" | jq -r '.[].Name')"
  i_container_restart_count="$(echo "${i_container_status}" | jq -r '.[].RestartCount')"
  i_container_id="$(echo "${i_container_status}" | jq -r '.[].Id')"

  # sanity checks, a failing one terminates the script
  test -n "${i_container_name}"
  echo -n "${i_container_restart_count}" | grep -Eq "^[0-9]+$"
  echo -n "${i_container_id}" | grep -Eq "^[a-fA-F0-9]{12,}$"

  CONTAINER_IDS+=("${i_container_id}")
  CONTAINER_NAMES+=("${i_container_name}")
  CONTAINER_RESTART_COUNTS+=("${i_container_restart_count}")
done

# metadata lines have to start with "# ", a bare "HELP ..." / "TYPE ..." line
# is not valid in the text exposition format
printf '# HELP %s Number of container restarts.\n' "${METRIC_NAME}"
printf '# TYPE %s counter\n' "${METRIC_NAME}"
for ((indx = 0; indx < ${#CONTAINER_IDS[@]}; indx++)); do
  printf '%s{id="%s",name="%s"} %d\n' "${METRIC_NAME}" "${CONTAINER_IDS[${indx}]}" \
    "${CONTAINER_NAMES[${indx}]}" "${CONTAINER_RESTART_COUNTS[${indx}]}"
done
#!/usr/bin/env bash
# Prints the content of the text file named by CMG_NODEROLEMON_METRICS_TEXTFILE.
# Fails when that file is missing or empty.

set -e

readonly ROLE_METRICS_FILE="${CMG_NODEROLEMON_METRICS_TEXTFILE}"

if [ ! -s "${ROLE_METRICS_FILE}" ]; then
  exit 1
fi

cat "${ROLE_METRICS_FILE}"
#!/usr/bin/env bash
# prometheus textfile metrics generator for node-exporter textfile collector
# Monitors: puppet host management state
# https://puppet.com/blog/puppet-monitoring-how-to-monitor-success-or-failure-of-puppet-runs/
#
# Usage: puppetmon.sh <last-known-generated-metric-file>
#
# Reads the puppet last run summary and prints:
#   * puppetmon_puppet_statefile_valid
#   * puppetmon_puppet_version{version}
#   * job_last_run_timestamp{app="puppetmon"}
#   * job_last_successful_run_timestamp{app="puppetmon"} - time of the last
#     run when it had no failed events, otherwise the value found in
#     <last-known-generated-metric-file> (0 when there is none)

set -eo pipefail

# constants
PUPPET_STATE_FILE="/var/lib/puppet/state/last_run_summary.yaml"
YQ="/usr/local/bin/yq"
# two levels up from this script (quoted: the path may contain whitespace)
CMG_SRC_DIR=$(dirname "$(dirname "$(readlink -f "$0")")")
PUPPETMON_GENERATED_METRIC_FILE="$1"

STAGE_NAME="configuration loaded"
source "${CMG_SRC_DIR}/../custom-metrics-generator.conf.env"

STAGE_NAME="library loaded"
source "${CMG_SRC_DIR}/lib.sh"

STAGE_NAME="puppet state file is valid"
puppet_state_file_valid=0
"${YQ}" eval . "${PUPPET_STATE_FILE}" > /dev/null && \
  puppet_state_file_valid=1
# NOTE(review): with `set -e` an unparsable state file makes the next yq call
# terminate the script, so the validity metric below is only ever printed
# with value 1 - confirm whether that is intended

# every value is validated right after extraction, a failed check aborts
puppet_version=$("${YQ}" eval ".version.puppet" "${PUPPET_STATE_FILE}")
echo -n "${puppet_version}" | grep -Eq "^[0-9]+\.[0-9]+\.[0-9]+"
puppet_time_last_run=$("${YQ}" eval ".time.last_run" "${PUPPET_STATE_FILE}")
echo -n "${puppet_time_last_run}" | grep -Eq "^[0-9]+$"
puppet_events_failure_count=$("${YQ}" eval ".events.failure" "${PUPPET_STATE_FILE}")
echo -n "${puppet_events_failure_count}" | grep -Eq "^[0-9]+$"

STAGE_NAME="puppet state file vality metric is generated"
METRIC_NAME="puppetmon_puppet_statefile_valid"
get_metric_help "${METRIC_NAME}" "gauge" "Puppet state file validity flag."
printf '%s %d\n' "${METRIC_NAME}" "${puppet_state_file_valid}"

STAGE_NAME="puppet version metric is generated"
METRIC_NAME="puppetmon_puppet_version"
get_metric_help "${METRIC_NAME}" "gauge" "Puppet version."
printf '%s{version="%s"} 1\n' "${METRIC_NAME}" "${puppet_version}"

STAGE_NAME="puppet last run timestamp metric is generated"
METRIC_NAME="job_last_run_timestamp"
get_metric_help "${METRIC_NAME}" "gauge" "Last job execution timestamp."
printf '%s{app="puppetmon"} %d\n' "${METRIC_NAME}" "${puppet_time_last_run}"

STAGE_NAME="puppet last successful run timestamp metric is generated"
METRIC_NAME="job_last_successful_run_timestamp"
get_metric_help "${METRIC_NAME}" "gauge" "Last successful job execution timestamp."
if [ "${puppet_events_failure_count}" == "0" ]; then
  printf '%s{app="puppetmon"} %d\n' "${METRIC_NAME}" "${puppet_time_last_run}"
else
  # last run failed: keep reporting the previously generated value
  if grep -q "^${METRIC_NAME}" "${PUPPETMON_GENERATED_METRIC_FILE}"; then
    printf '%s{app="puppetmon"} %d\n' "${METRIC_NAME}" "$(grep "^${METRIC_NAME}" "${PUPPETMON_GENERATED_METRIC_FILE}" | head -1 | awk '{print $NF}')"
  else
    printf '%s{app="puppetmon"} %d\n' "${METRIC_NAME}" "0"
  fi
fi
#!/usr/bin/env bash
# Script informed by the collectd monitoring script for smartmontools (using smartctl)
# by Samuel B. <samuel_._behan_(at)_dob_._sk> (c) 2012
# source at: http://devel.dob.sk/collectd-scripts/
# TODO: This probably needs to be a little more complex. The raw numbers can have more
# data in them than you'd think.
# http://arstechnica.com/civis/viewtopic.php?p=22062211
# Formatting done via shfmt -i 2
# https://github.com/mvdan/sh
# awk program consumed by parse_smartctl_attributes: for every row whose first
# field is a number and second field an attribute name it prints the
# <name>_value, <name>_worst, <name>_threshold and <name>_raw_value samples
# (fields 4, 5, 6 and 10) labelled with the awk variable `labels` and the
# attribute id
parse_smartctl_attributes_awk='$1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ {
gsub(/-/, "_");
printf "%s_value{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4
printf "%s_worst{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $5
printf "%s_threshold{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $6
printf "%s_raw_value{%s,smart_id=\"%s\"} %e\n", $2, labels, $1, $10
}'
# SMART attribute names (lower case) which are exported; smartmon_attrs holds
# them joined with "|" so the value can be used as a grep -E alternation
smartmon_attr_names=(
  airflow_temperature_cel
  command_timeout
  current_pending_sector
  end_to_end_error
  erase_fail_count
  g_sense_error_rate
  hardware_ecc_recovered
  host_reads_32mib
  host_reads_mib
  host_writes_32mib
  host_writes_mib
  load_cycle_count
  media_wearout_indicator
  nand_writes_1gib
  offline_uncorrectable
  power_cycle_count
  power_on_hours
  program_fail_cnt_total
  program_fail_count
  raw_read_error_rate
  reallocated_event_count
  reallocated_sector_ct
  reported_uncorrect
  runtime_bad_block
  sata_downshift_count
  seek_error_rate
  spin_retry_count
  spin_up_time
  start_stop_count
  temperature_case
  temperature_celsius
  temperature_internal
  total_lbas_read
  total_lbas_written
  udma_crc_error_count
  unsafe_shutdown_count
  unused_rsvd_blk_cnt_tot
  wear_leveling_count
  workld_host_reads_perc
  workld_media_wear_indic
  workload_minutes
)
# "${array[*]}" joins the items with the first character of IFS; the subshell
# keeps the IFS change local
smartmon_attrs="$(IFS='|'; echo "${smartmon_attr_names[*]}")"
unset smartmon_attr_names
# parse_smartctl_attributes ( <disk> <disk-type> )
#   filter: reads `smartctl -A` attribute table from stdin and prints samples
#   of the attributes listed in smartmon_attrs, labelled with disk and type;
#   the whole output is lower-cased
parse_smartctl_attributes() {
  local sample_labels
  printf -v sample_labels 'disk="%s",type="%s"' "$1" "$2"

  sed 's/^ \+//g' \
    | awk -v labels="${sample_labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null \
    | tr '[:upper:]' '[:lower:]' \
    | grep -E "(${smartmon_attrs})"
}
# parse_smartctl_scsi_attributes ( <disk> <disk-type> )
#   filter: reads "name: value" / "name = value" lines from stdin and prints
#   the raw value samples of the recognized entries, labelled with disk and
#   type. The smart_id label carries the id of the matching ATA attribute
#   (9 power-on hours, 194 temperature, 242 LBAs read, 241 LBAs written,
#   12 power cycles); grown defects have no ATA counterpart and get "-1".
parse_smartctl_scsi_attributes() {
  local disk="$1"
  local disk_type="$2"
  local labels="disk=\"${disk}\",type=\"${disk_type}\""
  # working variables are local so repeated calls cannot see stale values
  local line attr_type attr_value
  local power_on='' temp_cel='' lbas_read='' lbas_written='' power_cycle='' grown_defects=''

  while read -r line; do
    # normalize "=" to ":" so both separators are handled by one cut
    attr_type="$(echo "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')"
    attr_value="$(echo "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')"
    case "${attr_type}" in
      number_of_hours_powered_up_) power_on="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
      Current_Drive_Temperature) temp_cel="$(echo "${attr_value}" | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;;
      Blocks_sent_to_initiator_) lbas_read="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
      Blocks_received_from_initiator_) lbas_written="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
      Accumulated_start-stop_cycles) power_cycle="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
      Elements_in_grown_defect_list) grown_defects="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
    esac
  done

  [ -n "$power_on" ] && echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}"
  [ -n "$temp_cel" ] && echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}"
  [ -n "$lbas_read" ] && echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}"
  [ -n "$lbas_written" ] && echo "total_lbas_written_raw_value{${labels},smart_id=\"241\"} ${lbas_written}"
  [ -n "$power_cycle" ] && echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}"
  [ -n "$grown_defects" ] && echo "grown_defects_count_raw_value{${labels},smart_id=\"-1\"} ${grown_defects}"
  # a missing optional value is not an error
  return 0
}
# parse_smartctl_info ( <disk> <disk-type> )
#   filter: reads `smartctl -i -H` output ("name: value" lines) from stdin and
#   prints device_info, device_smart_available, device_smart_enabled and -
#   only when a health result line was present - device_smart_healthy
parse_smartctl_info() {
  local -i smart_available=0 smart_enabled=0
  # deliberately NOT declared with -i: an integer variable assigned an empty
  # value evaluates to 0, which would make "health unknown" look like
  # "unhealthy" and defeat the emptiness check at the end
  local smart_healthy=''
  local disk="$1" disk_type="$2"
  local model_family='' device_model='' serial_number='' fw_version='' vendor='' product='' revision='' lun_id=''
  local line info_type info_value

  while read -r line; do
    info_type="$(echo "${line}" | cut -f1 -d: | tr ' ' '_')"
    # escape every double quote, the value ends up inside a label value
    info_value="$(echo "${line}" | cut -f2- -d: | sed 's/^ \+//g' | sed 's/"/\\"/g')"
    case "${info_type}" in
      Model_Family) model_family="${info_value}" ;;
      Device_Model) device_model="${info_value}" ;;
      Serial_Number) serial_number="${info_value}" ;;
      Firmware_Version) fw_version="${info_value}" ;;
      Vendor) vendor="${info_value}" ;;
      Product) product="${info_value}" ;;
      Revision) revision="${info_value}" ;;
      Logical_Unit_id) lun_id="${info_value}" ;;
    esac
    if [[ "${info_type}" == 'SMART_support_is' ]]; then
      case "${info_value:0:7}" in
        Enabled) smart_available=1; smart_enabled=1 ;;
        Availab) smart_available=1; smart_enabled=0 ;;
        Unavail) smart_available=0; smart_enabled=0 ;;
      esac
    fi
    if [[ "${info_type}" == 'SMART_overall-health_self-assessment_test_result' ]]; then
      case "${info_value:0:6}" in
        PASSED) smart_healthy=1 ;;
        *) smart_healthy=0 ;;
      esac
    elif [[ "${info_type}" == 'SMART_Health_Status' ]]; then
      case "${info_value:0:2}" in
        OK) smart_healthy=1 ;;
        *) smart_healthy=0 ;;
      esac
    fi
  done

  echo "device_info{disk=\"${disk}\",type=\"${disk_type}\",vendor=\"${vendor}\",product=\"${product}\",revision=\"${revision}\",lun_id=\"${lun_id}\",model_family=\"${model_family}\",device_model=\"${device_model}\",serial_number=\"${serial_number}\",firmware_version=\"${fw_version}\"} 1"
  echo "device_smart_available{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_available}"
  echo "device_smart_enabled{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_enabled}"
  if [[ -n "${smart_healthy}" ]]; then
    echo "device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_healthy}"
  fi
}
# awk program used by format_output (field separator "{", so $1 is the metric
# name): prints "# HELP" and "# TYPE" lines whenever the metric name differs
# from the one on the previous line and prefixes every line with "smartmon_"
output_format_awk='BEGIN { v = "" }
v != $1 {
print "# HELP smartmon_" $1 " SMART metric " $1;
print "# TYPE smartmon_" $1 " gauge";
v = $1
}
{print "smartmon_" $0}'

# format_output ()
#   filter: sorts the samples read from stdin and adds the metric metadata
#   lines and the "smartmon_" name prefix
format_output() {
  sort | awk -F '{' "${output_format_awk}"
}
# Main flow: print the smartctl version metric, then the per-device metrics
# of every device reported by `smartctl --scan-open`.
smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')"

echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output

# nothing but the version metric is produced when the major version is below 6
if [[ "$(expr "${smartctl_version}" : '\([0-9]*\)\..*')" -lt 6 ]]; then
  exit
fi

# one "<device>|<type>" item per scanned device
device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}')"

for device in ${device_list}; do
  disk="$(echo "${device}" | cut -f1 -d'|')"
  type="$(echo "${device}" | cut -f2 -d'|')"
  active=1
  echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" "$(TZ=UTC date '+%s')"
  # Check if the device is in a low-power mode
  /usr/sbin/smartctl -n standby -d "${type}" "${disk}" > /dev/null || active=0
  echo "device_active{disk=\"${disk}\",type=\"${type}\"}" "${active}"
  # Skip further metrics to prevent the disk from spinning up
  test "${active}" -eq 0 && continue
  # Get the SMART information and health
  /usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}"
  # Get the SMART attributes
  case ${type} in
    sat) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;;
    sat+megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;;
    scsi) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;;
    megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;;
    nvme*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;;
    *)
      (>&2 echo "disk type is not sat, scsi, nvme or megaraid but ${type}")
      # skip only this device: terminating here would silently drop the
      # metrics of all devices which follow it in the scan list
      continue
      ;;
  esac
done | format_output
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment