Skip to content

KAFKA-17411: Use shared cache for Task offset sums #17715

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: trunk
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ public String toString() {
private final FixedOrderMap<String, StateStoreMetadata> stores = new FixedOrderMap<>();
private final FixedOrderMap<String, StateStore> globalStores = new FixedOrderMap<>();

private final StateDirectory stateDirectory;
private final File baseDir;
private final OffsetCheckpoint checkpointFile;
private final boolean stateUpdaterEnabled;
Expand Down Expand Up @@ -218,6 +219,7 @@ public ProcessorStateManager(final TaskId taskId,

this.baseDir = stateDirectory.getOrCreateDirectoryForTask(taskId);
this.checkpointFile = new OffsetCheckpoint(stateDirectory.checkpointFileFor(taskId));
this.stateDirectory = stateDirectory;

log.debug("Created state store manager for task {}", taskId);
}
Expand Down Expand Up @@ -330,6 +332,8 @@ void initializeStoreOffsetsFromCheckpoint(final boolean storeDirIsEmpty) {
}
}

stateDirectory.updateTaskOffsets(taskId, changelogOffsets());

if (!loadedCheckpoints.isEmpty()) {
log.warn("Some loaded checkpoint offsets cannot find their corresponding state stores: {}", loadedCheckpoints);
}
Expand Down Expand Up @@ -512,10 +516,13 @@ void restore(final StateStoreMetadata storeMetadata, final List<ConsumerRecord<b
}

storeMetadata.setOffset(batchEndOffset);

// If null means the lag for this partition is not known yet
if (optionalLag.isPresent()) {
storeMetadata.setEndOffset(optionalLag.getAsLong() + batchEndOffset);
}

stateDirectory.updateTaskOffsets(taskId, changelogOffsets());
}
}

Expand Down Expand Up @@ -685,6 +692,8 @@ public void updateChangelogOffsets(final Map<TopicPartition, Long> writtenOffset
store.stateStore.name(), store.offset, store.changelogPartition);
}
}

stateDirectory.updateTaskOffsets(taskId, changelogOffsets());
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.apache.kafka.streams.internals.StreamsConfigUtils;
import org.apache.kafka.streams.processor.TaskId;
import org.apache.kafka.streams.processor.internals.metrics.StreamsMetricsImpl;
import org.apache.kafka.streams.state.internals.OffsetCheckpoint;
import org.apache.kafka.streams.state.internals.ThreadCache;

import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
Expand Down Expand Up @@ -114,6 +115,7 @@ public StateDirectoryProcessFile() {

private final StreamsConfig config;
private final ConcurrentMap<TaskId, Task> tasksForLocalState = new ConcurrentHashMap<>();
private final Map<TaskId, Long> taskOffsetSums = new ConcurrentHashMap<>();

/**
* Ensures that the state base directory as well as the application's sub-directory are created.
Expand Down Expand Up @@ -301,6 +303,46 @@ private void closeStartupTasks(final Predicate<Task> predicate) {
}
}

/**
 * Returns the cached changelog offset sums, restricted to the given set of task ids.
 * Tasks without a cached sum (e.g. stateless tasks) are absent from the result.
 */
public Map<TaskId, Long> taskOffsetSums(final Set<TaskId> tasks) {
    final Map<TaskId, Long> matchingOffsetSums = new ConcurrentHashMap<>();
    for (final Map.Entry<TaskId, Long> cached : taskOffsetSums.entrySet()) {
        if (tasks.contains(cached.getKey())) {
            matchingOffsetSums.put(cached.getKey(), cached.getValue());
        }
    }
    return matchingOffsetSums;
}

/**
 * Records the sum of the given changelog offsets for the task in the shared cache.
 * An empty offsets map (stateless or non-logged task) leaves the cache untouched.
 */
public void updateTaskOffsets(final TaskId taskId, final Map<TopicPartition, Long> changelogOffsets) {
    if (!changelogOffsets.isEmpty()) {
        final long offsetSum = sumOfChangelogOffsets(taskId, changelogOffsets);
        taskOffsetSums.put(taskId, offsetSum);
    }
}

/**
 * Drops the cached changelog offset sum for the given task, e.g. when its state is wiped
 * ({@code StateManagerUtil#closeStateManager}) or its state directory is deleted by the cleaner thread.
 */
public void removeTaskOffsets(final TaskId taskId) {
    taskOffsetSums.remove(taskId);
}

/**
 * Sums the given changelog offsets for a task, for caching and later use in computing task lags
 * during rebalance.
 *
 * <p>{@code OFFSET_UNKNOWN} entries are skipped. The {@code Task.LATEST_OFFSET} sentinel is forwarded
 * as-is, matching the behavior of the {@code TaskManager} implementation this method replaces — without
 * that case a {@code LATEST_OFFSET} entry would incorrectly hit the negative-offset check below and
 * throw. The running sum saturates at {@code Long.MAX_VALUE} on overflow.
 *
 * @throws StreamsException if a negative, non-sentinel offset is encountered
 */
private long sumOfChangelogOffsets(final TaskId taskId, final Map<TopicPartition, Long> changelogOffsets) {
    long offsetSum = 0L;
    for (final Map.Entry<TopicPartition, Long> changelogEntry : changelogOffsets.entrySet()) {
        final long offset = changelogEntry.getValue();

        if (offset == Task.LATEST_OFFSET) {
            // this condition can only be true for active tasks; never for standby
            // for this case, the offset of all partitions is set to `LATEST_OFFSET`
            // and we "forward" the sentinel value directly
            return Task.LATEST_OFFSET;
        } else if (offset != OffsetCheckpoint.OFFSET_UNKNOWN) {
            if (offset < 0) {
                throw new StreamsException(
                    new IllegalStateException("Expected not to get a sentinel offset, but got: " + changelogEntry),
                    taskId);
            }
            offsetSum += offset;
            if (offsetSum < 0) {
                // pin to MAX_VALUE rather than wrapping around on overflow
                log.warn("Sum of changelog offsets for task {} overflowed, pinning to Long.MAX_VALUE", taskId);
                return Long.MAX_VALUE;
            }
        }
    }

    return offsetSum;
}

public UUID initializeProcessId() {
if (!hasPersistentStores) {
final UUID processId = UUID.randomUUID();
Expand Down Expand Up @@ -509,6 +551,7 @@ Thread lockOwner(final TaskId taskId) {
public void close() {
if (hasPersistentStores) {
closeStartupTasks();
taskOffsetSums.clear();
try {
stateDirLock.release();
stateDirLockChannel.close();
Expand Down Expand Up @@ -589,6 +632,7 @@ private void cleanRemovedTasksCalledByCleanerThread(final long cleanupDelayMs) {
final long now = time.milliseconds();
final long lastModifiedMs = taskDir.file().lastModified();
if (now - cleanupDelayMs > lastModifiedMs) {
removeTaskOffsets(id);
log.info("{} Deleting obsolete state directory {} for task {} as {}ms has elapsed (cleanup delay is {}ms).",
logPrefix(), dirName, id, now - lastModifiedMs, cleanupDelayMs);
Utils.delete(taskDir.file());
Expand Down Expand Up @@ -626,7 +670,10 @@ private IOException maybeCleanEmptyNamedTopologyDirs(final boolean logExceptionA
);
if (namedTopologyDirs != null) {
for (final File namedTopologyDir : namedTopologyDirs) {
closeStartupTasks(task -> task.id().topologyName().equals(parseNamedTopologyFromDirectory(namedTopologyDir.getName())));
final String topologyName = parseNamedTopologyFromDirectory(namedTopologyDir.getName());
closeStartupTasks(task -> task.id().topologyName().equals(topologyName));
final Set<TaskId> taskKeys = taskOffsetSums.keySet();
taskKeys.removeIf(taskId -> taskId.topologyName().equals(topologyName));
final File[] contents = namedTopologyDir.listFiles();
if (contents != null && contents.length == 0) {
try {
Expand Down Expand Up @@ -665,6 +712,8 @@ public void clearLocalStateForNamedTopology(final String topologyName) {
}
try {
closeStartupTasks(task -> task.id().topologyName().equals(topologyName));
final Set<TaskId> taskKeys = taskOffsetSums.keySet();
taskKeys.removeIf(taskId -> taskId.topologyName().equals(topologyName));
Utils.delete(namedTopologyDir);
} catch (final IOException e) {
log.error("Hit an unexpected error while clearing local state for topology " + topologyName, e);
Expand All @@ -678,6 +727,7 @@ private void cleanStateAndTaskDirectoriesCalledByUser() throws Exception {
log.warn("Found some still-locked task directories when user requested to cleaning up the state, "
+ "since Streams is not running any more these will be ignored to complete the cleanup");
}
taskOffsetSums.clear();
final AtomicReference<Exception> firstException = new AtomicReference<>();
for (final TaskDirectory taskDir : listAllTaskDirectories()) {
final String dirName = taskDir.file().getName();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ static void closeStateManager(final Logger log,
try {
if (wipeStateStore) {
log.debug("Wiping state stores for {} task {}", taskType, id);
stateDirectory.removeTaskOffsets(id);
// we can just delete the whole dir of the task, including the state store images and the checkpoint files,
// and then we write an empty checkpoint file indicating that the previous close is graceful and we just
// need to re-bootstrap the restoration from the beginning
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,12 @@
import org.apache.kafka.streams.internals.StreamsConfigUtils.ProcessingMode;
import org.apache.kafka.streams.processor.TaskId;
import org.apache.kafka.streams.processor.assignment.ProcessId;
import org.apache.kafka.streams.processor.internals.StateDirectory.TaskDirectory;
import org.apache.kafka.streams.processor.internals.Task.State;
import org.apache.kafka.streams.processor.internals.tasks.DefaultTaskManager;
import org.apache.kafka.streams.state.internals.OffsetCheckpoint;

import org.slf4j.Logger;

import java.io.File;
import java.io.IOException;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collection;
Expand Down Expand Up @@ -1337,35 +1334,20 @@ public void signalResume() {
* Does not include stateless or non-logged tasks.
*/
public Map<TaskId, Long> taskOffsetSums() {
final Map<TaskId, Long> taskOffsetSums = new HashMap<>();

// Not all tasks will create directories, and there may be directories for tasks we don't currently own,
// so we consider all tasks that are either owned or on disk. This includes stateless tasks, which should
// just have an empty changelogOffsets map.
final Map<TaskId, Task> tasks = allTasks();
final Set<TaskId> lockedTaskDirectoriesOfNonOwnedTasksAndClosedAndCreatedTasks =
union(HashSet::new, lockedTaskDirectories, tasks.keySet());
for (final Task task : tasks.values()) {
if (task.state() != State.CREATED && task.state() != State.CLOSED) {
final Map<TopicPartition, Long> changelogOffsets = task.changelogOffsets();
if (changelogOffsets.isEmpty()) {
log.debug("Skipping to encode apparently stateless (or non-logged) offset sum for task {}",
task.id());
} else {
taskOffsetSums.put(task.id(), sumOfChangelogOffsets(task.id(), changelogOffsets));
}
lockedTaskDirectoriesOfNonOwnedTasksAndClosedAndCreatedTasks.remove(task.id());
}
}

for (final TaskId id : lockedTaskDirectoriesOfNonOwnedTasksAndClosedAndCreatedTasks) {
final File checkpointFile = stateDirectory.checkpointFileFor(id);
try {
if (checkpointFile.exists()) {
taskOffsetSums.put(id, sumOfChangelogOffsets(id, new OffsetCheckpoint(checkpointFile).read()));
}
} catch (final IOException e) {
log.warn(String.format("Exception caught while trying to read checkpoint for task %s:", id), e);
final Map<TaskId, Long> taskOffsetSums = stateDirectory.taskOffsetSums(lockedTaskDirectoriesOfNonOwnedTasksAndClosedAndCreatedTasks);

// overlay latest offsets from assigned tasks
for (final Task task : tasks.values()) {
if (task.isActive() && task.state() == State.RUNNING && taskOffsetSums.put(task.id(), Task.LATEST_OFFSET) == null) {
log.error("Could not find cached offset for assigned ACTIVE Task {}", task.id());
}
}

Expand All @@ -1384,7 +1366,7 @@ private void tryToLockAllNonEmptyTaskDirectories() {
lockedTaskDirectories.clear();

final Map<TaskId, Task> allTasks = allTasks();
for (final TaskDirectory taskDir : stateDirectory.listNonEmptyTaskDirectories()) {
for (final StateDirectory.TaskDirectory taskDir : stateDirectory.listNonEmptyTaskDirectories()) {
final File dir = taskDir.file();
final String namedTopology = taskDir.namedTopology();
try {
Expand Down Expand Up @@ -1438,34 +1420,6 @@ private void releaseLockedUnassignedTaskDirectories() {
}
}

/**
 * Sums the changelog offsets of a task for encoding in the subscription during rebalance.
 * {@code OFFSET_UNKNOWN} entries are skipped, the active-task {@code LATEST_OFFSET} sentinel is
 * forwarded directly, and the sum saturates at {@code Long.MAX_VALUE} on overflow.
 *
 * @throws StreamsException if a negative, non-sentinel offset is encountered
 */
private long sumOfChangelogOffsets(final TaskId id, final Map<TopicPartition, Long> changelogOffsets) {
    long offsetSum = 0L;
    for (final Map.Entry<TopicPartition, Long> entry : changelogOffsets.entrySet()) {
        final long offset = entry.getValue();

        // Only active tasks (never standbys) carry this sentinel, and when they do, every
        // partition carries it — so forward it as-is instead of summing.
        if (offset == Task.LATEST_OFFSET) {
            return Task.LATEST_OFFSET;
        }

        // Unknown offsets contribute nothing to the sum.
        if (offset == OffsetCheckpoint.OFFSET_UNKNOWN) {
            continue;
        }

        if (offset < 0) {
            throw new StreamsException(
                new IllegalStateException("Expected not to get a sentinel offset, but got: " + entry),
                id);
        }

        offsetSum += offset;
        if (offsetSum < 0) {
            log.warn("Sum of changelog offsets for task {} overflowed, pinning to Long.MAX_VALUE", id);
            return Long.MAX_VALUE;
        }
    }

    return offsetSum;
}

private void closeTaskDirty(final Task task, final boolean removeFromTasksRegistry) {
try {
// we call this function only to flush the case if necessary
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ public void testCloseStateManagerWithStateStoreWipeOut() {
"logPrefix:", false, true, stateManager, stateDirectory, TaskType.ACTIVE);

inOrder.verify(stateManager).close();
inOrder.verify(stateDirectory).removeTaskOffsets(taskId);
inOrder.verify(stateDirectory).unlock(taskId);
verifyNoMoreInteractions(stateManager, stateDirectory);
}
Expand Down Expand Up @@ -211,6 +212,7 @@ public void testCloseStateManagerWithStateStoreWipeOutRethrowWrappedIOException(
}

inOrder.verify(stateManager).close();
inOrder.verify(stateDirectory).removeTaskOffsets(taskId);
inOrder.verify(stateDirectory).unlock(taskId);
verifyNoMoreInteractions(stateManager, stateDirectory);
}
Expand Down
Loading
Loading