Skip to content

Commit f090969

Browse files
committed
feat: Add a health check endpoint (#2670)
1 parent 4bc79a7 commit f090969

19 files changed

+1026
-351
lines changed

.github/workflows/ci-lite.yaml

+54
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,58 @@ jobs:
353353
-n 1 \
354354
-m 'name_cache'
355355
356+
test-healthcheck:
357+
runs-on: ubuntu-latest
358+
needs:
359+
- meta
360+
- build_action
361+
container:
362+
image: python:3.9-buster
363+
services:
364+
sc4s:
365+
image: ${{ needs.meta.outputs.container_base }}
366+
ports:
367+
- 8090:8090
368+
- 514:514
369+
env:
370+
SC4S_DEST_SPLUNK_HEC_DEFAULT_URL: https://splunk:8088
371+
SC4S_DEST_SPLUNK_HEC_DEFAULT_TOKEN: 00000000-0000-0000-0000-000000000000
372+
SC4S_LISTEN_STATUS_PORT: 8090 # the default is 8080
373+
HEALTHCHECK_CHECK_QUEUE_SIZE: yes
374+
HEALTHCHECK_MAX_QUEUE_SIZE: 10000
375+
steps:
376+
- name: Checkout
377+
uses: actions/checkout@v4
378+
with:
379+
submodules: false
380+
persist-credentials: false
381+
- name: Install requests
382+
run: pip3 install requests
383+
- name: Return status 'healthy'
384+
run: python3 tests/test_healthcheck_healthy.py --host sc4s --port 8090
385+
- name: Return status 'queue size exceeded limit'
386+
run: python3 tests/test_healthcheck_queue_size_limit.py --limit 10000 --host sc4s --port 8090
387+
388+
test-healthcheck-unit-tests:
389+
runs-on: ubuntu-latest
390+
needs:
391+
- meta
392+
- build_action
393+
container:
394+
image: python:3.9-buster
395+
steps:
396+
- name: Checkout
397+
uses: actions/checkout@v4
398+
with:
399+
submodules: false
400+
persist-credentials: false
401+
- name: Install dependencies
402+
run: |
403+
pip3 install poetry
404+
poetry install
405+
- name: Run tests
406+
run: poetry run pytest tests/test_healthcheck_unit_tests.py
407+
356408
release:
357409
name: Release
358410
runs-on: ubuntu-latest
@@ -362,6 +414,8 @@ jobs:
362414
- test-container
363415
- test-ipv4-name-cache
364416
- test-ipv6-name-cache
417+
- test-healthcheck
418+
- test-healthcheck-unit-tests
365419
steps:
366420
- uses: actions/checkout@v4
367421
with:

.github/workflows/ci-main.yaml

+54
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,58 @@ jobs:
355355
-n 1 \
356356
-m 'name_cache'
357357
358+
test-healthcheck:
359+
runs-on: ubuntu-latest
360+
needs:
361+
- meta
362+
- build_action
363+
container:
364+
image: python:3.9-buster
365+
services:
366+
sc4s:
367+
image: ${{ needs.meta.outputs.container_base }}
368+
ports:
369+
- 8090:8090
370+
- 514:514
371+
env:
372+
SC4S_DEST_SPLUNK_HEC_DEFAULT_URL: https://splunk:8088
373+
SC4S_DEST_SPLUNK_HEC_DEFAULT_TOKEN: 00000000-0000-0000-0000-000000000000
374+
SC4S_LISTEN_STATUS_PORT: 8090 # the default is 8080
375+
HEALTHCHECK_CHECK_QUEUE_SIZE: yes
376+
HEALTHCHECK_MAX_QUEUE_SIZE: 10000
377+
steps:
378+
- name: Checkout
379+
uses: actions/checkout@v4
380+
with:
381+
submodules: false
382+
persist-credentials: false
383+
- name: Install requests
384+
run: pip3 install requests
385+
- name: Return status 'healthy'
386+
run: python3 tests/test_healthcheck_healthy.py --host sc4s --port 8090
387+
- name: Return status 'queue size exceeded limit'
388+
run: python3 tests/test_healthcheck_queue_size_limit.py --limit 10000 --host sc4s --port 8090
389+
390+
test-healthcheck-unit-tests:
391+
runs-on: ubuntu-latest
392+
needs:
393+
- meta
394+
- build_action
395+
container:
396+
image: python:3.9-buster
397+
steps:
398+
- name: Checkout
399+
uses: actions/checkout@v4
400+
with:
401+
submodules: false
402+
persist-credentials: false
403+
- name: Install dependencies
404+
run: |
405+
pip3 install poetry
406+
poetry install
407+
- name: Run tests
408+
run: poetry run pytest tests/test_healthcheck_unit_tests.py
409+
358410
mike:
359411
runs-on: ubuntu-latest
360412
if: ${{ github.ref == 'refs/heads/main' }} || ${{ github.ref == 'refs/heads/develop' }}
@@ -387,6 +439,8 @@ jobs:
387439
- test-container
388440
- test-ipv4-name-cache
389441
- test-ipv6-name-cache
442+
- test-healthcheck
443+
- test-healthcheck-unit-tests
390444
- mike
391445
steps:
392446
- uses: actions/checkout@v4

charts/splunk-connect-for-syslog/templates/statefulset.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,9 @@ spec:
143143
- name: ietf-dflt-tls
144144
containerPort: 5425
145145
protocol: TCP
146+
- name: health
147+
containerPort: 8080
148+
protocol: TCP
146149
{{- if .Values.sc4s }}
147150
{{- if .Values.sc4s.vendor_product }}
148151
{{- range $vp := .Values.sc4s.vendor_product }}

docs/gettingstarted/ansible-docker-podman.md

+5
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,11 @@ SC4S_ENV_CHECK_HEC: Splunk HEC connection test successful to index=main for sour
7979
SC4S_ENV_CHECK_HEC: Splunk HEC connection test successful to index=main for sourcetype=sc4s:events...
8080
syslog-ng checking config
8181
sc4s version=v1.36.0
82+
Configuring the health check port to: 8080
83+
[2025-01-11 18:31:08 +0000] [135] [INFO] Starting gunicorn 23.0.0
84+
[2025-01-11 18:31:08 +0000] [135] [INFO] Listening at: http://0.0.0.0:8080 (135)
85+
[2025-01-11 18:31:08 +0000] [135] [INFO] Using worker: sync
86+
[2025-01-11 18:31:08 +0000] [138] [INFO] Booting worker with pid: 138
8287
starting syslog-ng
8388
```
8489

docs/gettingstarted/ansible-docker-swarm.md

+5
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,11 @@ SC4S_ENV_CHECK_HEC: Splunk HEC connection test successful to index=main for sour
103103
SC4S_ENV_CHECK_HEC: Splunk HEC connection test successful to index=main for sourcetype=sc4s:events...
104104
syslog-ng checking config
105105
sc4s version=v1.36.0
106+
Configuring the health check port to: 8080
107+
[2025-01-11 18:31:08 +0000] [135] [INFO] Starting gunicorn 23.0.0
108+
[2025-01-11 18:31:08 +0000] [135] [INFO] Listening at: http://0.0.0.0:8080 (135)
109+
[2025-01-11 18:31:08 +0000] [135] [INFO] Using worker: sync
110+
[2025-01-11 18:31:08 +0000] [138] [INFO] Booting worker with pid: 138
106111
starting syslog-ng
107112
```
108113

docs/gettingstarted/ansible-mk8s.md

+5
Original file line numberDiff line numberDiff line change
@@ -75,5 +75,10 @@ SC4S_ENV_CHECK_HEC: Splunk HEC connection test successful to index=main for sour
7575
SC4S_ENV_CHECK_HEC: Splunk HEC connection test successful to index=main for sourcetype=sc4s:events...
7676
syslog-ng checking config
7777
sc4s version=v1.36.0
78+
Configuring the health check port to: 8080
79+
[2025-01-11 18:31:08 +0000] [135] [INFO] Starting gunicorn 23.0.0
80+
[2025-01-11 18:31:08 +0000] [135] [INFO] Listening at: http://0.0.0.0:8080 (135)
81+
[2025-01-11 18:31:08 +0000] [135] [INFO] Using worker: sync
82+
[2025-01-11 18:31:08 +0000] [138] [INFO] Booting worker with pid: 138
7883
starting syslog-ng
7984
```

docs/gettingstarted/docker-compose-MacOS.md

+5
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,11 @@ You should see events similar to those below in the output:
120120
```ini
121121
syslog-ng checking config
122122
sc4s version=v1.36.0
123+
Configuring the health check port to: 8080
124+
[2025-01-11 18:31:08 +0000] [135] [INFO] Starting gunicorn 23.0.0
125+
[2025-01-11 18:31:08 +0000] [135] [INFO] Listening at: http://0.0.0.0:8080 (135)
126+
[2025-01-11 18:31:08 +0000] [135] [INFO] Using worker: sync
127+
[2025-01-11 18:31:08 +0000] [138] [INFO] Booting worker with pid: 138
123128
starting syslog-ng
124129
```
125130

docs/gettingstarted/docker-compose.md

+5
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,11 @@ You should see events similar to those below in the output:
112112
```ini
113113
syslog-ng checking config
114114
sc4s version=v1.36.0
115+
Configuring the health check port to: 8080
116+
[2025-01-11 18:31:08 +0000] [135] [INFO] Starting gunicorn 23.0.0
117+
[2025-01-11 18:31:08 +0000] [135] [INFO] Listening at: http://0.0.0.0:8080 (135)
118+
[2025-01-11 18:31:08 +0000] [135] [INFO] Using worker: sync
119+
[2025-01-11 18:31:08 +0000] [138] [INFO] Booting worker with pid: 138
115120
starting syslog-ng
116121
```
117122

docs/gettingstarted/docker-systemd-general.md

+5
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,11 @@ You should see events similar to those below in the output:
116116
```ini
117117
syslog-ng checking config
118118
sc4s version=v1.36.0
119+
Configuring the health check port to: 8080
120+
[2025-01-11 18:31:08 +0000] [135] [INFO] Starting gunicorn 23.0.0
121+
[2025-01-11 18:31:08 +0000] [135] [INFO] Listening at: http://0.0.0.0:8080 (135)
122+
[2025-01-11 18:31:08 +0000] [135] [INFO] Using worker: sync
123+
[2025-01-11 18:31:08 +0000] [138] [INFO] Booting worker with pid: 138
119124
starting syslog-ng
120125
```
121126

docs/gettingstarted/podman-systemd-general.md

+5
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,11 @@ You should see events similar to those below in the output:
109109
```ini
110110
syslog-ng checking config
111111
sc4s version=v1.36.0
112+
Configuring the health check port to: 8080
113+
[2025-01-11 18:31:08 +0000] [135] [INFO] Starting gunicorn 23.0.0
114+
[2025-01-11 18:31:08 +0000] [135] [INFO] Listening at: http://0.0.0.0:8080 (135)
115+
[2025-01-11 18:31:08 +0000] [135] [INFO] Using worker: sync
116+
[2025-01-11 18:31:08 +0000] [138] [INFO] Booting worker with pid: 138
112117
starting syslog-ng
113118
```
114119

package/Dockerfile

+1
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ COPY package/etc/local_config /etc/syslog-ng/local_config
7474
COPY package/etc/local_config /etc/syslog-ng/local_config
7575
COPY package/sbin/entrypoint.sh /
7676
COPY package/sbin/healthcheck.sh /
77+
COPY package/sbin/healthcheck.py /
7778
COPY package/sbin/source_ports_validator.py /
7879

7980
ENV SC4S_CONTAINER_OPTS=--no-caps

package/Dockerfile.lite

+1
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ COPY package/lite/etc/addons /etc/syslog-ng/addons
9696

9797
COPY package/sbin/entrypoint.sh /
9898
COPY package/sbin/healthcheck.sh /
99+
COPY package/sbin/healthcheck.py /
99100
COPY package/sbin/source_ports_validator.py /
100101

101102

package/sbin/entrypoint.sh

+3
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,9 @@ echo sc4s version=$(cat $SC4S_ETC/VERSION)
224224
echo sc4s version=$(cat $SC4S_ETC/VERSION) >>$SC4S_VAR/log/syslog-ng.out
225225
$SC4S_SBIN/syslog-ng --no-caps $SC4S_CONTAINER_OPTS -s >>$SC4S_VAR/log/syslog-ng.out 2>$SC4S_VAR/log/syslog-ng.err
226226

227+
# Launch the health check HTTP endpoint in the background with gunicorn.
# Fall back to port 8080 when SC4S_LISTEN_STATUS_PORT is unset or empty so the
# bind address is never the invalid "0.0.0.0:"; 8080 is also the default that
# healthcheck.py uses for the same variable. The bind argument is quoted so the
# value cannot be word-split.
echo "Configuring the health check port to: ${SC4S_LISTEN_STATUS_PORT:-8080}"
nohup gunicorn -b "0.0.0.0:${SC4S_LISTEN_STATUS_PORT:-8080}" healthcheck:app &
229+
227230
# OPTIONAL for BYOE: Comment out/remove all remaining lines and launch syslog-ng directly from systemd
228231
if [ "${SC4S_DEBUG_CONTAINER}" == "yes" ]
229232
then

package/sbin/healthcheck.py

+113
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
from flask import Flask, jsonify
2+
import logging
3+
import os
4+
import subprocess
5+
6+
# Flask application object; the /health route below is registered on it.
app = Flask(__name__)
7+
8+
def str_to_bool(value):
    """Interpret a loosely formatted flag value as a boolean.

    The value is converted with ``str()``, stripped of surrounding
    whitespace and lower-cased.  The result is ``True`` only when that
    normalized text is one of ``"true"``, ``"1"``, ``"t"``, ``"y"`` or
    ``"yes"``; every other value (including ``None`` and the empty
    string) yields ``False``.
    """
    truthy_tokens = ('true', '1', 't', 'y', 'yes')
    normalized = str(value).strip().lower()
    return normalized in truthy_tokens
16+
17+
class Config:
18+
SC4S_DEST_SPLUNK_HEC_DEFAULT_URL = os.getenv('SC4S_DEST_SPLUNK_HEC_DEFAULT_URL')
19+
HEALTHCHECK_PORT = int(os.getenv('SC4S_LISTEN_STATUS_PORT', '8080'))
20+
CHECK_QUEUE_SIZE = str_to_bool(os.getenv('HEALTHCHECK_CHECK_QUEUE_SIZE', "false"))
21+
MAX_QUEUE_SIZE = int(os.getenv('HEALTHCHECK_MAX_QUEUE_SIZE', '10000'))
22+
23+
# Configure the root logger's output format and create this module's logger.
# The format is a plain string: it contains only logging %-placeholders, so
# an f-string prefix is unnecessary.
# NOTE: no ``level`` is passed here, so unless something else configures
# logging, the root logger stays at its default WARNING threshold and
# INFO-level records from this module are not emitted.
logging.basicConfig(
    format="%(asctime)s - healthcheck.py - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)
28+
29+
def check_syslog_ng_health() -> bool:
    """Report whether ``syslog-ng-ctl healthcheck`` succeeds.

    Runs ``syslog-ng-ctl healthcheck -t 1`` as a subprocess, capturing its
    output as text, with a 5 second subprocess timeout.

    Returns:
        ``True`` when the command exits with status 0.  ``False`` when it
        exits non-zero, when the subprocess times out, or when any other
        exception is raised; every failure path is logged.
    """
    command = ['syslog-ng-ctl', 'healthcheck', '-t', '1']
    try:
        completed = subprocess.run(
            command, capture_output=True, text=True, timeout=5
        )
        if completed.returncode != 0:
            logger.error(f"syslog-ng healthcheck failed: {completed.stderr.strip()}")
            return False
        return True
    except subprocess.TimeoutExpired:
        logger.error("syslog-ng healthcheck timed out.")
        return False
    except Exception as e:
        # Covers e.g. a missing syslog-ng-ctl binary; the probe must never raise.
        logger.exception(f"Unexpected error during syslog-ng healthcheck: {e}")
        return False
49+
50+
def check_queue_size(
    sc4s_dest_splunk_hec_default=Config.SC4S_DEST_SPLUNK_HEC_DEFAULT_URL,
    max_queue_size=Config.MAX_QUEUE_SIZE
) -> bool:
    """Check that the default HEC destination's queue is within the limit.

    Runs ``syslog-ng-ctl stats`` (5 second subprocess timeout) and uses the
    first output line that contains both ``;queued;`` and the destination
    URL.  The last ``;``-separated field of that line is parsed as the
    queue size.

    Args:
        sc4s_dest_splunk_hec_default: Destination URL to look for in the
            stats output.  The default is taken from ``Config`` when this
            function is defined.
        max_queue_size: Largest queue size still considered healthy.  The
            default is taken from ``Config`` when this function is defined.

    Returns:
        ``True`` when a matching line is found and its queue size does not
        exceed ``max_queue_size``.  ``False`` when the URL is not
        configured, the command fails or times out, no matching line
        exists, the size exceeds the limit, or any other exception occurs;
        every failure path is logged.
    """
    if not sc4s_dest_splunk_hec_default:
        logger.error(
            "SC4S_DEST_SPLUNK_HEC_DEFAULT_URL not configured. "
            "Ensure the default HEC destination is set, or disable HEALTHCHECK_CHECK_QUEUE_SIZE."
        )
        return False

    try:
        completed = subprocess.run(
            ['syslog-ng-ctl', 'stats'],
            capture_output=True,
            text=True,
            timeout=5
        )
        if completed.returncode != 0:
            logger.error(f"syslog-ng stats command failed: {completed.stderr.strip()}")
            return False

        # Keep only the first line that is a "queued" counter for the destination.
        matching_line = None
        for line in completed.stdout.splitlines():
            if ";queued;" in line and sc4s_dest_splunk_hec_default in line:
                matching_line = line
                break

        if not matching_line:
            logger.error("No matching queue stats found for the destination URL.")
            return False

        # The counter value is the text after the final ';' on the line.
        queue_size = int(matching_line.rpartition(";")[2])
        if queue_size > max_queue_size:
            logger.warning(
                f"Queue size {queue_size} exceeds the maximum limit of {max_queue_size}."
            )
            return False

        return True
    except subprocess.TimeoutExpired:
        logger.error("syslog-ng stats command timed out.")
        return False
    except Exception as e:
        # Includes a non-numeric counter field (ValueError from int()).
        logger.exception(f"Unexpected error checking queue size: {e}")
        return False
96+
97+
@app.route('/health', methods=['GET'])
def healthcheck():
    """Handle ``GET /health`` and report the service status as JSON.

    The syslog-ng check always runs.  The queue size check runs only when
    ``Config.CHECK_QUEUE_SIZE`` is true and the syslog-ng check passed.

    Returns:
        A ``(response, status_code)`` tuple: HTTP 503 with an
        ``unhealthy: ...`` status when a check fails, otherwise HTTP 200
        with status ``healthy``.
    """
    if not check_syslog_ng_health():
        return jsonify({'status': 'unhealthy: syslog-ng healthcheck failed'}), 503

    # Any False from check_queue_size() is reported with this same status text.
    if Config.CHECK_QUEUE_SIZE and not check_queue_size():
        return jsonify({'status': 'unhealthy: queue size exceeded limit'}), 503

    logger.info("Service is healthy.")
    return jsonify({'status': 'healthy'}), 200
110+
111+
112+
# Direct-execution entry point: serve the app with Flask's built-in server
# on all interfaces, using the port from Config. Not reached when the module
# is imported (for example by a WSGI server loading ``healthcheck:app``).
if __name__ == '__main__':
    app.run(
        port=Config.HEALTHCHECK_PORT,
        host='0.0.0.0',
    )

0 commit comments

Comments
 (0)