Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions API.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 20 additions & 0 deletions lib/common/monitoring/alarms/KinesisDataAnalyticsAlarmFactory.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,26 @@ export class KinesisDataAnalyticsAlarmFactory {
});
}

addFullRestartRateAlarm(
metric: MetricWithAlarmSupport,
props: ErrorRateThreshold,
disambiguator?: string,
) {
return this.alarmFactory.addAlarm(metric, {
treatMissingData:
props.treatMissingDataOverride ?? TreatMissingData.BREACHING,
comparisonOperator:
props.comparisonOperatorOverride ??
ComparisonOperator.GREATER_THAN_THRESHOLD,
...props,
disambiguator,
threshold: props.maxErrorRate,
alarmNameSuffix: "FullRestartRate",
alarmDescription: "Full restart rate is too high",
alarmDedupeStringSuffix: "KDAFullRestartRateAlarm",
});
}

addCheckpointFailureCountAlarm(
metric: MetricWithAlarmSupport,
props: ErrorCountThreshold,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import {
BaseMetricFactoryProps,
MetricFactory,
MetricStatistic,
RateComputationMethod,
} from "../../common";

export interface KinesisDataAnalyticsMetricFactoryProps
Expand Down Expand Up @@ -120,11 +119,34 @@ export class KinesisDataAnalyticsMetricFactory extends BaseMetricFactory<Kinesis
}

metricCheckpointFailureRate() {
return this.metricFactory.toRate(
this.metricNumberOfFailedCheckpointsCount(),
RateComputationMethod.PER_HOUR,
false,
"checkpoints",
// Flink reports this metric as the latest sum for the lifecycle of a job.
// Therefore, we truly care about rate of change
return this.metricFactory.createMetricMath(
"RATE(numberOfFailedCheckpoints)",
{
numberOfFailedCheckpoints: this.metricNumberOfFailedCheckpointsCount(),
},
"Checkpoint Failure Rate",
undefined,
undefined,
this.region,
this.account,
);
}

metricFullRestartRate() {
// Flink reports this metric as the latest sum for the lifecycle of a job.
// Therefore, we truly care about rate of change
return this.metricFactory.createMetricMath(
"RATE(fullRestarts)",
{
fullRestarts: this.metricFullRestartsCount(),
},
"Full Restart Rate",
undefined,
undefined,
this.region,
this.account,
);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ export interface KinesisDataAnalyticsMonitoringOptions

readonly addFullRestartCountAlarm?: Record<string, FullRestartCountThreshold>;

readonly addFullRestartRateAlarm?: Record<string, ErrorRateThreshold>;

readonly addCheckpointFailureCountAlarm?: Record<string, ErrorCountThreshold>;

readonly addCheckpointFailureRateAlarm?: Record<string, ErrorRateThreshold>;
Expand All @@ -54,6 +56,7 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
readonly kdaAlarmFactory: KinesisDataAnalyticsAlarmFactory;
readonly downtimeAnnotations: HorizontalAnnotation[];
readonly fullRestartAnnotations: HorizontalAnnotation[];
readonly fullRestartRateAnnotations: HorizontalAnnotation[];
readonly checkpointFailureCountAnnotations: HorizontalAnnotation[];
readonly checkpointFailureRateAnnotations: HorizontalAnnotation[];

Expand All @@ -68,6 +71,7 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
readonly oldGenerationGCCountMetric: MetricWithAlarmSupport;
readonly oldGenerationGCTimeMsMetric: MetricWithAlarmSupport;
readonly checkpointFailureRateMetric: MetricWithAlarmSupport;
readonly fullRestartRateMetric: MetricWithAlarmSupport;

constructor(
scope: MonitoringScope,
Expand All @@ -90,6 +94,7 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
this.kdaAlarmFactory = new KinesisDataAnalyticsAlarmFactory(alarmFactory);
this.downtimeAnnotations = [];
this.fullRestartAnnotations = [];
this.fullRestartRateAnnotations = [];
this.checkpointFailureCountAnnotations = [];
this.checkpointFailureRateAnnotations = [];

Expand Down Expand Up @@ -117,6 +122,7 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
metricFactory.metricOldGenerationGCTimeMs();
this.checkpointFailureRateMetric =
metricFactory.metricCheckpointFailureRate();
this.fullRestartRateMetric = metricFactory.metricFullRestartRate();

for (const disambiguator in props.addDowntimeAlarm) {
const alarmProps = props.addDowntimeAlarm[disambiguator];
Expand All @@ -140,6 +146,17 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
this.addAlarm(createdAlarm);
}

for (const disambiguator in props.addFullRestartRateAlarm) {
const alarmProps = props.addFullRestartRateAlarm[disambiguator];
const createdAlarm = this.kdaAlarmFactory.addFullRestartRateAlarm(
this.fullRestartRateMetric,
alarmProps,
disambiguator,
);
this.fullRestartRateAnnotations.push(createdAlarm.annotation);
this.addAlarm(createdAlarm);
}

for (const disambiguator in props.addCheckpointFailureCountAlarm) {
const alarmProps = props.addCheckpointFailureCountAlarm[disambiguator];
const createdAlarm = this.kdaAlarmFactory.addCheckpointFailureCountAlarm(
Expand Down Expand Up @@ -230,6 +247,9 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
left: [this.fullRestartsCountMetric],
leftYAxis: CountAxisFromZero,
leftAnnotations: this.fullRestartAnnotations,
right: [this.fullRestartRateMetric],
rightYAxis: RateAxisFromZero,
rightAnnotations: this.fullRestartRateAnnotations,
});
}

Expand Down
6 changes: 3 additions & 3 deletions test/facade/__snapshots__/MonitoringAspect.test.ts.snap

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading