@@ -66,8 +66,12 @@ public class SelfHealingNotifier implements AnomalyNotifier {
66
66
public static final String SELF_HEALING_TOPIC_ANOMALY_ENABLED_CONFIG = "self.healing.topic.anomaly.enabled" ;
67
67
public static final String SELF_HEALING_MAINTENANCE_EVENT_ENABLED_CONFIG = "self.healing.maintenance.event.enabled" ;
68
68
public static final String BROKER_FAILURE_SELF_HEALING_THRESHOLD_MS_CONFIG = "broker.failure.self.healing.threshold.ms" ; // Key read in configure() for _selfHealingThresholdMs; DEFAULT_AUTO_FIX_THRESHOLD_MS is used when unset.
69
+ public static final String BROKER_FAILURE_CHECK_WITH_DELAY_MAX_RETRY_COUNT = "broker.failure.check.with.delay.max.retry.count" ; // Key read in configure() for the retry-count bound used by onBrokerFailure() while self healing is disabled. NOTE(review): sibling keys end in _CONFIG.
70
+ public static final String BROKER_FAILURE_CHECK_WITH_DELAY_INTERVAL_MS = "broker.failure.check.with.delay.interval.ms" ; // Key read in configure() for the delay (ms) handed to AnomalyNotificationResult.check(). NOTE(review): sibling keys end in _CONFIG.
69
71
static final long DEFAULT_ALERT_THRESHOLD_MS = TimeUnit .MINUTES .toMillis (15 ); // Fallback for _brokerFailureAlertThresholdMs when no alert threshold is configured (15 minutes).
70
72
static final long DEFAULT_AUTO_FIX_THRESHOLD_MS = TimeUnit .MINUTES .toMillis (30 ); // Fallback for _selfHealingThresholdMs when the self healing threshold key is unset (30 minutes).
73
+ static final int DEFAULT_BROKER_FAILURE_CHECK_WITH_DELAY_MAX_RETRY_COUNT = 10 ; // Fallback when the max-retry-count key is unset.
74
+ static final long DEFAULT_BROKER_FAILURE_CHECK_WITH_DELAY_INTERVAL_MS = TimeUnit .MINUTES .toMillis (10 ); // Fallback when the check interval key is unset (10 minutes).
71
75
72
76
private static final Logger LOG = LoggerFactory .getLogger (SelfHealingNotifier .class ); // Logger obtained for this class.
73
77
protected final Time _time ;
@@ -77,6 +81,8 @@ public class SelfHealingNotifier implements AnomalyNotifier {
77
81
protected final Map <AnomalyType , Long > _selfHealingEnabledHistoricalDurationMs ;
78
82
protected long _brokerFailureAlertThresholdMs ; // Set in configure(); configure() throws IllegalArgumentException if it is larger than _selfHealingThresholdMs.
79
83
protected long _selfHealingThresholdMs ; // Set in configure() from BROKER_FAILURE_SELF_HEALING_THRESHOLD_MS_CONFIG, or DEFAULT_AUTO_FIX_THRESHOLD_MS when unset.
84
+ protected int _brokerFailureCheckWithDelayMaxRetryCount ; // Bound compared (with <=) against brokerFailures.brokerFailureCheckWithDelayRetryCount() in onBrokerFailure(); set in configure().
85
+ protected long _brokerFailureCheckWithDelayIntervalMs ; // Delay (ms) passed to AnomalyNotificationResult.check() while the bound above is not exceeded; set in configure().
80
86
// A cache that keeps the most recent broker failure for each broker.
81
87
protected final Map <Boolean , Map <Integer , Long >> _latestFailedBrokersByAutoFixTriggered ;
82
88
@@ -248,11 +254,36 @@ public AnomalyNotificationResult onBrokerFailure(BrokerFailures brokerFailures)
248
254
result = AnomalyNotificationResult .check (delayMs );
249
255
} else {
250
256
// Reached auto fix threshold. Alert and fix if self healing is enabled and anomaly is fixable.
251
- boolean autoFixTriggered = _selfHealingEnabled .get (KafkaAnomalyType .BROKER_FAILURE ) && brokerFailures .fixable ();
252
- if (hasNewFailureToAlert (brokerFailures , autoFixTriggered )) {
253
- alert (brokerFailures , autoFixTriggered , selfHealingTimeMs , KafkaAnomalyType .BROKER_FAILURE );
257
+ boolean selfHealingEnabled = _selfHealingEnabled .get (KafkaAnomalyType .BROKER_FAILURE );
258
+ boolean brokerFailureFixable = brokerFailures .fixable ();
259
+ boolean autoFixTriggered = selfHealingEnabled && brokerFailures .fixable ();
260
+
261
+ if (!brokerFailureFixable ) {
262
+ // If broker failure is not fixable then the anomaly can be ignored
263
+ result = AnomalyNotificationResult .ignore ();
264
+ } else if (selfHealingEnabled ) {
265
+ // If self healing is enabled and broker failure is fixable the fix should be made
266
+ if (hasNewFailureToAlert (brokerFailures , autoFixTriggered )) {
267
+ alert (brokerFailures , autoFixTriggered , selfHealingTimeMs , KafkaAnomalyType .BROKER_FAILURE );
268
+ }
269
+ result = AnomalyNotificationResult .fix ();
270
+ } else {
271
+ // In the case self healing is disabled, we keep checking the anomaly until
272
+ // we try for _brokerFailureCheckWithDelayMaxRetryCount times
273
+ // Once the retry count exceeds that limit the anomaly is ignored; the fix path is only taken when self healing is enabled.
274
+ // This check is to ensure that the broker failure is not ignored.
275
+ // The max is so that we do not keep checking forever in case self healing is disabled forever.
276
+ if (brokerFailures .brokerFailureCheckWithDelayRetryCount () <= _brokerFailureCheckWithDelayMaxRetryCount ) {
277
+ // This means that we can retry for checking with delay
278
+ if (hasNewFailureToAlert (brokerFailures , autoFixTriggered )) {
279
+ alert (brokerFailures , autoFixTriggered , selfHealingTimeMs , KafkaAnomalyType .BROKER_FAILURE );
280
+ }
281
+ result = AnomalyNotificationResult .check (_brokerFailureCheckWithDelayIntervalMs );
282
+ } else {
283
+ // This means that we have reached the max retry count and we can ignore the anomaly
284
+ result = AnomalyNotificationResult .ignore ();
285
+ }
254
286
}
255
- result = autoFixTriggered ? AnomalyNotificationResult .fix () : AnomalyNotificationResult .ignore ();
256
287
}
257
288
return result ;
258
289
}
@@ -280,6 +311,15 @@ public void configure(Map<String, ?> config) {
280
311
_brokerFailureAlertThresholdMs = alertThreshold == null ? DEFAULT_ALERT_THRESHOLD_MS : Long .parseLong (alertThreshold );
281
312
String fixThreshold = (String ) config .get (BROKER_FAILURE_SELF_HEALING_THRESHOLD_MS_CONFIG );
282
313
_selfHealingThresholdMs = fixThreshold == null ? DEFAULT_AUTO_FIX_THRESHOLD_MS : Long .parseLong (fixThreshold );
314
+
315
+ String brokerFailureCheckWithDelayMaxRetryCount = (String ) config .get (BROKER_FAILURE_CHECK_WITH_DELAY_MAX_RETRY_COUNT );
316
+ _brokerFailureCheckWithDelayMaxRetryCount = brokerFailureCheckWithDelayMaxRetryCount == null
317
+ ? DEFAULT_BROKER_FAILURE_CHECK_WITH_DELAY_MAX_RETRY_COUNT : Integer .parseInt (brokerFailureCheckWithDelayMaxRetryCount );
318
+
319
+ String brokerFailureCheckWithDelayIntervalMs = (String ) config .get (BROKER_FAILURE_CHECK_WITH_DELAY_INTERVAL_MS );
320
+ _brokerFailureCheckWithDelayIntervalMs = brokerFailureCheckWithDelayIntervalMs == null
321
+ ? DEFAULT_BROKER_FAILURE_CHECK_WITH_DELAY_INTERVAL_MS : Long .parseLong (brokerFailureCheckWithDelayIntervalMs );
322
+
283
323
if (_brokerFailureAlertThresholdMs > _selfHealingThresholdMs ) {
284
324
throw new IllegalArgumentException (String .format ("The failure detection threshold %d cannot be larger than "
285
325
+ "the auto fix threshold. %d" ,
0 commit comments