From af7dc067a7174cdb0f5a55813a968629e8679658 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ji=C5=99=C3=AD=20Prokop?= <jprokop.ofc@gmail.com>
Date: Tue, 2 Jul 2024 15:05:13 +0200
Subject: [PATCH] feat: run_probes - add timeout for each check

BREAKING CHANGE: all checks must now be nested under a top-level `checks` key, and the new top-level `default_timeout` option is required
---
 README.md                            |   2 +
 config_templates/run_probes_cfg.yaml | 115 ++++++++++++++-------------
 perun/proxy/utils/run_probes.py      |  25 ++++--
 3 files changed, 80 insertions(+), 62 deletions(-)

diff --git a/README.md b/README.md
index 3de4ff9..99274e1 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,8 @@ There are several extras which are required only for some scripts:
 - script designed to execute multiple monitoring probes
 - output is compatible with CheckMK
 - it is required to put configuration file to `/etc/run_probes_cfg.yaml`
+- the default timeout (in seconds) for all checks is set by the required `default_timeout` option in the config; each check can optionally override it with its own `timeout` option
+
 
 For usage instructions, run:
 
diff --git a/config_templates/run_probes_cfg.yaml b/config_templates/run_probes_cfg.yaml
index cd972df..d8099f7 100644
--- a/config_templates/run_probes_cfg.yaml
+++ b/config_templates/run_probes_cfg.yaml
@@ -1,60 +1,63 @@
-check_mongodb:
-  # module with checks
-  module: perun.proxy.utils.nagios.check_mongodb
-  check_mongodb_shared: &check_mongodb_shared
-    host: "hostname"
-    u: "username"
-    p: "password"
-    tls: true
-    tls-ca-file: "/etc/ssl/chain.crt"
-    tls-cert-key-file: "/etc/ssl/certificate_and_key.pem"
-  runs:
-    # check with parameter
-    check_mongodb_connect:
-      <<: *check_mongodb_shared
-      A: connect
-      W: 2
-      C: 4
-    check_mongodb_connections:
-      <<: *check_mongodb_shared
-      A: connections
-      W: 70
-      C: 80
-    check_mongodb_replication_lag:
-      <<: *check_mongodb_shared
-      A: replication_lag
-      W: 15
-      C: 30
-    check_mongodb_replset_state:
-      <<: *check_mongodb_shared
-      A: replset_state
-      W: 0
-      C: 0
-
-check_rpc_status:
-  module: perun.proxy.utils.nagios.check_rpc_status
-  runs:
-    check_rpc_status:
+default_timeout: 30 # in seconds
+checks:
+  check_mongodb:
+    # Python module implementing the check (executed as `python3 -m <module>`)
+    module: perun.proxy.utils.nagios.check_mongodb
+    check_mongodb_shared: &check_mongodb_shared
+      host: "hostname"
       u: "username"
       p: "password"
-      d: "domain"
-      i: 1
+      tls: true
+      tls-ca-file: "/etc/ssl/chain.crt"
+      tls-cert-key-file: "/etc/ssl/certificate_and_key.pem"
+    runs:
+      # each entry is one check run; its keys (except `timeout`) are passed to the module as command-line options
+      check_mongodb_connect:
+        <<: *check_mongodb_shared
+        A: connect
+        W: 2
+        C: 4
+      check_mongodb_connections:
+        <<: *check_mongodb_shared
+        A: connections
+        W: 70
+        C: 80
+        timeout: 60
+      check_mongodb_replication_lag:
+        <<: *check_mongodb_shared
+        A: replication_lag
+        W: 15
+        C: 30
+      check_mongodb_replset_state:
+        <<: *check_mongodb_shared
+        A: replset_state
+        W: 0
+        C: 0
+
+  check_rpc_status:
+    module: perun.proxy.utils.nagios.check_rpc_status
+    runs:
+      check_rpc_status:
+        u: "username"
+        p: "password"
+        d: "domain"
+        i: 1
 
-check_syncrepl:
-  module: perun.proxy.utils.nagios.check_ldap_syncrepl
-  runs:
-    check_ldap_syncrepl:
-      p: "ldaps://ldapmaster.foo:636"
-      c: "ldaps://ldapslave.foo:636"
-      b: "o=example"
-      D: "uid=nagios,ou=sysaccounts,o=example"
-      P: "bind_password"
-      n:
-      only-check-contextCSN:
-      W: 900
-      C: 3600
+  check_syncrepl:
+    module: perun.proxy.utils.nagios.check_ldap_syncrepl
+    runs:
+      check_ldap_syncrepl:
+        p: "ldaps://ldapmaster.foo:636"
+        c: "ldaps://ldapslave.foo:636"
+        b: "o=example"
+        D: "uid=nagios,ou=sysaccounts,o=example"
+        P: "bind_password"
+        n:
+        only-check-contextCSN:
+        W: 900
+        C: 3600
 
-check_exabgp_propagation:
-  module: perun.proxy.utils.nagios.check_exabgp_propagation
-  runs:
-    check_exabgp_propagation:
+  check_exabgp_propagation:
+    module: perun.proxy.utils.nagios.check_exabgp_propagation
+    runs:
+      check_exabgp_propagation:
diff --git a/perun/proxy/utils/run_probes.py b/perun/proxy/utils/run_probes.py
index a232e3c..c092074 100644
--- a/perun/proxy/utils/run_probes.py
+++ b/perun/proxy/utils/run_probes.py
@@ -41,7 +41,7 @@ def get_metrics_and_new_output(output):
     return None, output
 
 
-def run_probe(probe_name, command):
+def run_probe(probe_name, command, timeout):
     """
     Runs nagios monitoring probe and prints output in following formats:
         1) return_code probe_name metrics output
@@ -50,9 +50,17 @@ def run_probe(probe_name, command):
     metrics output format:
         metric1=val;|metric2=val2|metric3=val3;val3;;;|metric4=val4
     """
-    result = subprocess.run(
-        command, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
-    )
+    try:
+        result = subprocess.run(
+            command,
+            text=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            timeout=timeout,
+        )
+    except subprocess.TimeoutExpired:
+        print(f"3 {probe_name} - probe TIMED OUT after {timeout}s")
+        return 3
     output = re.sub("[ \t\n]+", " ", result.stdout)
     search = re.search(r" - .*", output)
     if search:
@@ -71,12 +79,17 @@ def main():
     if not config:
         return
 
-    for _, options in config.items():
+    global_timeout = config["default_timeout"]
+    for _, options in config["checks"].items():
         module = options["module"]
         for name, args in options.get("runs").items():
             command = ["python3", "-m", module]
+            timeout = global_timeout
             if args is not None:
                 for arg_name, arg_val in args.items():
+                    if arg_name == "timeout":
+                        timeout = arg_val
+                        continue
                     if len(arg_name) == 1:
                         arg_name = "-" + arg_name
                     else:
@@ -88,7 +101,7 @@ def main():
                     command.append(arg_name)
                     if arg_val is not None:
                         command.append(str(arg_val))
-            Thread(target=run_probe, args=[name, command]).start()
+            Thread(target=run_probe, args=[name, command, timeout]).start()
 
 
 if __name__ == "__main__":
-- 
GitLab