diff --git a/README.md b/README.md index 3de4ff97f3724b2f959395a9c4e4d61416d47e2f..99274e159035110eb7fd9eeeec24937e951b5f53 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,8 @@ There are several extras which are required only for some scripts: - script designed to execute multiple monitoring probes - output is compatible with CheckMK - it is required to put configuration file to `/etc/run_probes_cfg.yaml` +- default timeout in seconds for all checks is set by `default_timeout` in the config, and each check can optionally have its own `timeout` setting overriding the default one. + For usage instructions, run: diff --git a/config_templates/run_probes_cfg.yaml b/config_templates/run_probes_cfg.yaml index cd972df9c743fa3b941ae1cbed6a28b78bd69f19..d8099f70123f8c7890ad87a5c8461e3b293d26b8 100644 --- a/config_templates/run_probes_cfg.yaml +++ b/config_templates/run_probes_cfg.yaml @@ -1,60 +1,63 @@ -check_mongodb: - # module with checks - module: perun.proxy.utils.nagios.check_mongodb - check_mongodb_shared: &check_mongodb_shared - host: "hostname" - u: "username" - p: "password" - tls: true - tls-ca-file: "/etc/ssl/chain.crt" - tls-cert-key-file: "/etc/ssl/certificate_and_key.pem" - runs: - # check with parameter - check_mongodb_connect: - <<: *check_mongodb_shared - A: connect - W: 2 - C: 4 - check_mongodb_connections: - <<: *check_mongodb_shared - A: connections - W: 70 - C: 80 - check_mongodb_replication_lag: - <<: *check_mongodb_shared - A: replication_lag - W: 15 - C: 30 - check_mongodb_replset_state: - <<: *check_mongodb_shared - A: replset_state - W: 0 - C: 0 - -check_rpc_status: - module: perun.proxy.utils.nagios.check_rpc_status - runs: - check_rpc_status: +default_timeout: 30 # in seconds +checks: + check_mongodb: + # module with checks + module: perun.proxy.utils.nagios.check_mongodb + check_mongodb_shared: &check_mongodb_shared + host: "hostname" u: "username" p: "password" - d: "domain" - i: 1 + tls: true + tls-ca-file: "/etc/ssl/chain.crt" + tls-cert-key-file: "/etc/ssl/certificate_and_key.pem" + runs: + # check with parameter + check_mongodb_connect: + <<: *check_mongodb_shared + A: connect + W: 2 + C: 4 + check_mongodb_connections: + <<: *check_mongodb_shared + A: connections + W: 70 + C: 80 + timeout: 60 + check_mongodb_replication_lag: + <<: *check_mongodb_shared + A: replication_lag + W: 15 + C: 30 + check_mongodb_replset_state: + <<: *check_mongodb_shared + A: replset_state + W: 0 + C: 0 + + check_rpc_status: + module: perun.proxy.utils.nagios.check_rpc_status + runs: + check_rpc_status: + u: "username" + p: "password" + d: "domain" + i: 1 -check_syncrepl: - module: perun.proxy.utils.nagios.check_ldap_syncrepl - runs: - check_ldap_syncrepl: - p: "ldaps://ldapmaster.foo:636" - c: "ldaps://ldapslave.foo:636" - b: "o=example" - D: "uid=nagios,ou=sysaccounts,o=example" - P: "bind_password" - n: - only-check-contextCSN: - W: 900 - C: 3600 + check_syncrepl: + module: perun.proxy.utils.nagios.check_ldap_syncrepl + runs: + check_ldap_syncrepl: + p: "ldaps://ldapmaster.foo:636" + c: "ldaps://ldapslave.foo:636" + b: "o=example" + D: "uid=nagios,ou=sysaccounts,o=example" + P: "bind_password" + n: + only-check-contextCSN: + W: 900 + C: 3600 -check_exabgp_propagation: - module: perun.proxy.utils.nagios.check_exabgp_propagation - runs: - check_exabgp_propagation: + check_exabgp_propagation: + module: perun.proxy.utils.nagios.check_exabgp_propagation + runs: + check_exabgp_propagation: diff --git a/perun/proxy/utils/run_probes.py b/perun/proxy/utils/run_probes.py index a232e3cafb25cf3873a3bdd05cc699d33249fd85..c092074c84386e7ed6d3680c48eeef6c277a1df4 100644 --- a/perun/proxy/utils/run_probes.py +++ b/perun/proxy/utils/run_probes.py @@ -41,7 +41,7 @@ def get_metrics_and_new_output(output): return None, output -def run_probe(probe_name, command): +def run_probe(probe_name, command, timeout): """ Runs nagios monitoring probe and prints output in following formats: 1) return_code probe_name metrics output @@ -50,9 +50,17 @@ def run_probe(probe_name, command): metrics output format: metric1=val;|metric2=val2|metric3=val3;val3;;;|metric4=val4 """ - result = subprocess.run( - command, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT - ) + try: + result = subprocess.run( + command, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + timeout=timeout, + ) + except subprocess.TimeoutExpired: + print(f"3 {probe_name} - probe TIMED OUT after {timeout}s") + return 3 output = re.sub("[ \t\n]+", " ", result.stdout) search = re.search(r" - .*", output) if search: @@ -71,12 +79,17 @@ def main(): if not config: return - for _, options in config.items(): + global_timeout = config["default_timeout"] + for _, options in config["checks"].items(): module = options["module"] for name, args in options.get("runs").items(): command = ["python3", "-m", module] + timeout = global_timeout if args is not None: for arg_name, arg_val in args.items(): + if arg_name == "timeout": + timeout = arg_val + continue if len(arg_name) == 1: arg_name = "-" + arg_name else: @@ -88,7 +101,7 @@ def main(): command.append(arg_name) if arg_val is not None: command.append(str(arg_val)) - Thread(target=run_probe, args=[name, command]).start() + Thread(target=run_probe, args=[name, command, timeout]).start() if __name__ == "__main__":