Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ public abstract class AbstractVeniceAggVersionedStats<STATS, STATS_REPORTER exte
private final Map<String, VeniceVersionedStats<STATS, STATS_REPORTER>> aggStats;
private final boolean unregisterMetricForDeletedStoreEnabled;

/**
 * Gives subclasses access to the metrics repository held by this instance.
 *
 * @return the {@code metricsRepository} this instance holds
 */
protected MetricsRepository getMetricsRepository() {
  return this.metricsRepository;
}

public AbstractVeniceAggVersionedStats(
MetricsRepository metricsRepository,
ReadOnlyStoreRepository metadataRepository,
Expand Down Expand Up @@ -135,6 +139,9 @@ protected void updateStatsVersionInfo(String storeName, List<Version> existingVe
versionedStats.setFutureVersion(futureVersion);
}

// Notify subclasses that version info has changed
onVersionInfoUpdated(storeName, versionedStats.getCurrentVersion(), versionedStats.getFutureVersion());

/**
* Since versions are changed, update the total stats accordingly.
*/
Expand Down Expand Up @@ -186,4 +193,16 @@ protected int getCurrentVersion(String storeName) {
/**
 * Hook for refreshing the total stats of a store. The default implementation does nothing.
 *
 * @param storeName name of the store whose total stats would be refreshed
 */
protected void updateTotalStats(String storeName) {
  // Intentionally empty: nothing to update by default.
}

/**
 * Callback invoked after the version info of a store has been updated.
 * The default implementation does nothing; subclasses may override it to react to
 * version changes.
 *
 * @param storeName      name of the store whose version info changed
 * @param currentVersion current version of the store after the update
 * @param futureVersion  future version of the store after the update
 */
protected void onVersionInfoUpdated(String storeName, int currentVersion, int futureVersion) {
  // Intentionally empty: no reaction by default.
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package com.linkedin.davinci.stats;

import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CLUSTER_NAME;
import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REGION_NAME;
import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_STATE;
import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_TYPE;
import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME;
import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_VERSION_TYPE;
import static com.linkedin.venice.utils.Utils.setOf;

import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions;
import com.linkedin.venice.stats.metrics.MetricEntity;
import com.linkedin.venice.stats.metrics.MetricType;
import com.linkedin.venice.stats.metrics.MetricUnit;
import com.linkedin.venice.stats.metrics.ModuleMetricEntityInterface;
import java.util.Set;


/**
 * Metric entities for the Venice server (storage node).
 */
public enum ServerMetricEntity implements ModuleMetricEntityInterface {
  /**
   * Heartbeat replication delay: nearline replication lag, reported in milliseconds.
   */
  INGESTION_HEARTBEAT_DELAY(
      "ingestion.replication.heartbeat.delay",
      MetricType.HISTOGRAM,
      MetricUnit.MILLISECOND,
      "Nearline ingestion replication lag",
      setOf(
          VENICE_STORE_NAME,
          VENICE_CLUSTER_NAME,
          VENICE_REGION_NAME,
          VENICE_VERSION_TYPE,
          VENICE_REPLICA_TYPE,
          VENICE_REPLICA_STATE));

  /** Metric definition built once for each enum constant. */
  private final MetricEntity metricEntity;

  /**
   * Builds the {@link MetricEntity} backing an enum constant.
   *
   * @param name        metric name
   * @param metricType  type of the metric
   * @param unit        unit the metric is reported in
   * @param description human-readable description of the metric
   * @param dimensions  dimensions attached to the metric
   */
  ServerMetricEntity(
      String name,
      MetricType metricType,
      MetricUnit unit,
      String description,
      Set<VeniceMetricsDimensions> dimensions) {
    metricEntity = new MetricEntity(name, metricType, unit, description, dimensions);
  }

  /**
   * @return the {@link MetricEntity} built for this constant
   */
  @Override
  public MetricEntity getMetricEntity() {
    return metricEntity;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ public HeartbeatMonitoringService(
storeName,
regionNames),
leaderHeartbeatTimeStamps,
followerHeartbeatTimeStamps);
followerHeartbeatTimeStamps,
serverConfig.getClusterName());
this.heartbeatMonitoringServiceStats = heartbeatMonitoringServiceStats;
this.customizedViewRepositoryFuture = customizedViewRepositoryFuture;
this.nodeId = Utils.getHelixNodeIdentifier(serverConfig.getListenerHostname(), serverConfig.getListenerPort());
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
package com.linkedin.davinci.stats.ingestion.heartbeat;

import static com.linkedin.davinci.stats.ServerMetricEntity.INGESTION_HEARTBEAT_DELAY;
import static com.linkedin.venice.meta.Store.NON_EXISTING_VERSION;
import static com.linkedin.venice.stats.metrics.ModuleMetricEntityInterface.getUniqueMetricEntities;

import com.linkedin.davinci.stats.ServerMetricEntity;
import com.linkedin.venice.stats.OpenTelemetryMetricsSetup;
import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository;
import com.linkedin.venice.stats.dimensions.ReplicaState;
import com.linkedin.venice.stats.dimensions.ReplicaType;
import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions;
import com.linkedin.venice.stats.dimensions.VersionType;
import com.linkedin.venice.stats.metrics.MetricEntity;
import com.linkedin.venice.stats.metrics.MetricEntityStateThreeEnums;
import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap;
import io.tehuti.metrics.MetricsRepository;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;


/**
 * OpenTelemetry (OTel) metrics for heartbeat monitoring of a single store.
 * Note: Tehuti metrics are managed separately in {@link HeartbeatStatReporter}.
 */
public class HeartbeatOtelStats {
  /** Unique metric entities declared by {@link ServerMetricEntity}. */
  public static final Collection<MetricEntity> SERVER_METRIC_ENTITIES =
      getUniqueMetricEntities(ServerMetricEntity.class);

  private final boolean emitOtelMetrics;
  private final VeniceOpenTelemetryMetricsRepository otelRepository;
  private final Map<VeniceMetricsDimensions, String> baseDimensionsMap;

  // One metric entity state per region name, created on first use
  private final Map<String, MetricEntityStateThreeEnums<VersionType, ReplicaType, ReplicaState>> metricsByRegion;

  // Cached version numbers so the recording path needs no map lookups
  private volatile int currentVersion = NON_EXISTING_VERSION;
  private volatile int futureVersion = NON_EXISTING_VERSION;

  /**
   * Sets up OTel state for one store from the given metrics repository.
   *
   * @param metricsRepository repository handed to {@link OpenTelemetryMetricsSetup}
   * @param storeName         store name passed to the OTel setup builder
   * @param clusterName       cluster name passed to the OTel setup builder
   */
  public HeartbeatOtelStats(MetricsRepository metricsRepository, String storeName, String clusterName) {
    OpenTelemetryMetricsSetup.OpenTelemetryMetricsSetupInfo setupInfo =
        OpenTelemetryMetricsSetup.builder(metricsRepository)
            .setStoreName(storeName)
            .setClusterName(clusterName)
            .build();

    this.emitOtelMetrics = setupInfo.emitOpenTelemetryMetrics();
    this.otelRepository = setupInfo.getOtelRepository();
    this.baseDimensionsMap = setupInfo.getBaseDimensionsMap();
    this.metricsByRegion = new VeniceConcurrentHashMap<>();
  }

  /**
   * @return true if OTel metrics are emitted
   */
  public boolean emitOtelMetrics() {
    return emitOtelMetrics;
  }

  /**
   * Replaces the cached current and future version numbers for this store.
   *
   * @param currentVersion The current serving version
   * @param futureVersion  The future/upcoming version
   */
  public void updateVersionInfo(int currentVersion, int futureVersion) {
    this.currentVersion = currentVersion;
    this.futureVersion = futureVersion;
  }

  /**
   * Records a heartbeat delay, with all dimensional attributes, to OTel metrics.
   * Does nothing when OTel metrics are disabled. A version matching neither the cached
   * current nor the cached future version is recorded as {@link VersionType#OTHER}.
   *
   * @param version      The version number
   * @param region       The region name
   * @param replicaType  The replica type {@link ReplicaType}
   * @param replicaState The replica state {@link ReplicaState}
   * @param delayMs      The delay in milliseconds
   */
  public void recordHeartbeatDelayOtelMetrics(
      int version,
      String region,
      ReplicaType replicaType,
      ReplicaState replicaState,
      long delayMs) {
    if (emitOtelMetrics()) {
      VersionType versionType = classifyVersion(version, currentVersion, futureVersion);
      // Records to OTel metrics only
      getOrCreateMetricState(region).record(delayMs, versionType, replicaType, replicaState);
    }
  }

  /**
   * Returns the metric entity state for a region, creating it on first use.
   */
  private MetricEntityStateThreeEnums<VersionType, ReplicaType, ReplicaState> getOrCreateMetricState(String region) {
    return metricsByRegion.computeIfAbsent(region, this::createMetricState);
  }

  /**
   * Builds the metric entity state for a region: the base dimensions plus the region name.
   */
  private MetricEntityStateThreeEnums<VersionType, ReplicaType, ReplicaState> createMetricState(String region) {
    Map<VeniceMetricsDimensions, String> dimensionsWithRegion = new HashMap<>(baseDimensionsMap);
    dimensionsWithRegion.put(VeniceMetricsDimensions.VENICE_REGION_NAME, region);

    return MetricEntityStateThreeEnums.create(
        INGESTION_HEARTBEAT_DELAY.getMetricEntity(),
        otelRepository,
        dimensionsWithRegion,
        VersionType.class,
        ReplicaType.class,
        ReplicaState.class);
  }

  /**
   * Classifies a version as CURRENT or FUTURE; every other version, including
   * {@code NON_EXISTING_VERSION}, is OTHER.
   *
   * @param version        The version number to classify
   * @param currentVersion The current serving version (cached)
   * @param futureVersion  The future/upcoming version (cached)
   * @return {@link VersionType}
   */
  private static VersionType classifyVersion(int version, int currentVersion, int futureVersion) {
    if (version != NON_EXISTING_VERSION) {
      if (version == currentVersion) {
        return VersionType.CURRENT;
      }
      if (version == futureVersion) {
        return VersionType.FUTURE;
      }
    }
    return VersionType.OTHER;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,37 @@ public HeartbeatStat(MetricConfig metricConfig, Set<String> regions) {
defaultSensor = new WritePathLatencySensor(localRepository, metricConfig, "default-");
}

public void recordReadyToServeLeaderLag(String region, long startTime) {
long endTime = System.currentTimeMillis();
readyToServeLeaderSensors.computeIfAbsent(region, k -> defaultSensor).record(endTime - startTime, endTime);
/**
 * Records heartbeat lag for a ready-to-serve leader replica. A region with no sensor
 * registered yet is mapped to the default sensor.
 *
 * @param region  The region name
 * @param delay   The pre-calculated delay in milliseconds
 * @param endTime The pre-calculated end time
 */
public void recordReadyToServeLeaderLag(String region, long delay, long endTime) {
  readyToServeLeaderSensors
      .computeIfAbsent(region, unusedRegion -> defaultSensor)
      .record(delay, endTime);
}

public void recordReadyToServeFollowerLag(String region, long startTime) {
long endTime = System.currentTimeMillis();
readyToServeFollowerSensors.computeIfAbsent(region, k -> defaultSensor).record(endTime - startTime, endTime);
/**
 * Records heartbeat lag for a ready-to-serve follower replica. A region with no sensor
 * registered yet is mapped to the default sensor.
 *
 * @param region  The region name
 * @param delay   The pre-calculated delay in milliseconds
 * @param endTime The pre-calculated end time
 */
public void recordReadyToServeFollowerLag(String region, long delay, long endTime) {
  readyToServeFollowerSensors
      .computeIfAbsent(region, unusedRegion -> defaultSensor)
      .record(delay, endTime);
}

public void recordCatchingUpFollowerLag(String region, long startTime) {
long endTime = System.currentTimeMillis();
catchingUpFollowerSensors.computeIfAbsent(region, k -> defaultSensor).record(endTime - startTime, endTime);
/**
 * Records heartbeat lag for a catching-up follower replica. A region with no sensor
 * registered yet is mapped to the default sensor.
 *
 * @param region  The region name
 * @param delay   The pre-calculated delay in milliseconds (0 for squelching)
 * @param endTime The pre-calculated end time
 */
public void recordCatchingUpFollowerLag(String region, long delay, long endTime) {
  catchingUpFollowerSensors
      .computeIfAbsent(region, unusedRegion -> defaultSensor)
      .record(delay, endTime);
}

public WritePathLatencySensor getReadyToServeLeaderLag(String region) {
Expand Down
Loading
Loading