1
+ from flask import Flask , jsonify
2
+ import logging
3
+ import os
4
+ import subprocess
5
+
6
# WSGI application object; the /health route below is registered on it.
app = Flask(import_name=__name__)
7
+
8
def str_to_bool(value):
    """Interpret *value* as a boolean flag.

    The value is converted with ``str()``, stripped of surrounding whitespace
    and lower-cased; it counts as true only when the result is one of
    ``true``, ``1``, ``t``, ``y`` or ``yes``. Anything else is false.
    """
    truthy_tokens = ('true', '1', 't', 'y', 'yes')
    normalized = str(value).strip().lower()
    return normalized in truthy_tokens
16
+
17
class Config:
    """Healthcheck settings, read from environment variables when the class body runs."""

    # URL of the default Splunk HEC destination; None when the variable is unset.
    SC4S_DEST_SPLUNK_HEC_DEFAULT_URL = os.environ.get('SC4S_DEST_SPLUNK_HEC_DEFAULT_URL')

    # TCP port the healthcheck server listens on (defaults to 8080).
    HEALTHCHECK_PORT = int(os.environ.get('SC4S_LISTEN_STATUS_PORT', '8080'))

    # Whether /health also checks the syslog-ng queue size (off by default).
    CHECK_QUEUE_SIZE = str_to_bool(os.environ.get('HEALTHCHECK_CHECK_QUEUE_SIZE', "false"))

    # Largest queue size still reported as healthy (defaults to 10000).
    MAX_QUEUE_SIZE = int(os.environ.get('HEALTHCHECK_MAX_QUEUE_SIZE', '10000'))
22
+
23
# Configure the root logger's output format. The format string contains only
# logging %-style fields, so it is a plain string rather than an f-string.
# NOTE: no ``level`` is passed, so the root logger keeps its default level
# (WARNING unless configured elsewhere); INFO records are then not emitted.
logging.basicConfig(
    format="%(asctime)s - healthcheck.py - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
# Module-level logger used by the health checks in this file.
logger = logging.getLogger(__name__)
28
+
29
def check_syslog_ng_health() -> bool:
    """Run ``syslog-ng-ctl healthcheck -t 1`` and report whether it succeeded.

    Returns:
        True when the command exits with status 0. False when it exits with a
        non-zero status, does not finish within the 5-second subprocess
        timeout, or any other exception is raised. Each failure is logged.
    """
    command = ['syslog-ng-ctl', 'healthcheck', '-t', '1']
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=5
        )
        if completed.returncode != 0:
            # Surface whatever the control tool wrote to stderr.
            logger.error(f"syslog-ng healthcheck failed: {completed.stderr.strip()}")
            return False
        return True
    except subprocess.TimeoutExpired:
        logger.error("syslog-ng healthcheck timed out.")
        return False
    except Exception as exc:
        # Catch-all (e.g. the binary cannot be launched): report unhealthy
        # instead of letting the exception propagate to the caller.
        logger.exception(f"Unexpected error during syslog-ng healthcheck: {exc}")
        return False
49
+
50
def check_queue_size(
    sc4s_dest_splunk_hec_default=Config.SC4S_DEST_SPLUNK_HEC_DEFAULT_URL,
    max_queue_size=Config.MAX_QUEUE_SIZE
) -> bool:
    """Check that the queue for the default HEC destination is within the limit.

    Runs ``syslog-ng-ctl stats``, takes the first output line that contains
    both ``;queued;`` and the destination URL, and parses the text after that
    line's last ``;`` as the queue size.

    Args:
        sc4s_dest_splunk_hec_default: Destination URL to look for in the stats
            output. Defaults to the value ``Config`` held when this function
            was defined.
        max_queue_size: Largest queue size still considered healthy.

    Returns:
        True when a matching line is found and its queue size does not exceed
        ``max_queue_size``. False when the URL is empty or unset, the stats
        command fails or times out, no line matches, the size exceeds the
        limit, or any other exception occurs. Each False outcome is logged.
    """
    if not sc4s_dest_splunk_hec_default:
        logger.error(
            "SC4S_DEST_SPLUNK_HEC_DEFAULT_URL not configured. "
            "Ensure the default HEC destination is set, or disable HEALTHCHECK_CHECK_QUEUE_SIZE."
        )
        return False

    try:
        completed = subprocess.run(
            ['syslog-ng-ctl', 'stats'],
            capture_output=True,
            text=True,
            timeout=5
        )
        if completed.returncode != 0:
            logger.error(f"syslog-ng stats command failed: {completed.stderr.strip()}")
            return False

        # Only the first matching line is used.
        matching_line = None
        for line in completed.stdout.splitlines():
            if ";queued;" in line and sc4s_dest_splunk_hec_default in line:
                matching_line = line
                break

        if not matching_line:
            logger.error("No matching queue stats found for the destination URL.")
            return False

        # The queue size is the last ';'-separated field of the line.
        queued = int(matching_line.rsplit(";", 1)[-1])
        if queued > max_queue_size:
            logger.warning(
                f"Queue size {queued} exceeds the maximum limit of {max_queue_size}."
            )
            return False

        return True
    except subprocess.TimeoutExpired:
        logger.error("syslog-ng stats command timed out.")
        return False
    except Exception as exc:
        # Includes a non-numeric last field (ValueError from int()).
        logger.exception(f"Unexpected error checking queue size: {exc}")
        return False
96
+
97
@app.route('/health', methods=['GET'])
def healthcheck():
    """Handle ``GET /health``.

    The syslog-ng process check always runs; the queue-size check runs only
    when ``Config.CHECK_QUEUE_SIZE`` is true and the process check passed.

    Returns:
        A ``(JSON response, status code)`` pair: 503 with an ``unhealthy: ...``
        status on the first failing check, otherwise 200 with ``healthy``.
    """
    if not check_syslog_ng_health():
        return jsonify({'status': 'unhealthy: syslog-ng healthcheck failed'}), 503

    # Any False from check_queue_size() produces this response.
    if Config.CHECK_QUEUE_SIZE and not check_queue_size():
        return jsonify({'status': 'unhealthy: queue size exceeded limit'}), 503

    logger.info("Service is healthy.")
    return jsonify({'status': 'healthy'}), 200
110
+
111
+
112
if __name__ == '__main__':
    # Start the Flask server on all interfaces when the file is run as a script.
    listen_port = Config.HEALTHCHECK_PORT
    app.run(host='0.0.0.0', port=listen_port)
0 commit comments