apache · cadonna · Jun 16, 2022 · May 13, 2022 · May 18, 2022 · May 19, 2022
diff --git a/checkstyle/suppressions.xml b/checkstyle/suppressions.xml
@@ -225,7 +225,7 @@
               files="(EosV2UpgradeIntegrationTest|KStreamKStreamJoinTest|KTableKTableForeignKeyJoinIntegrationTest|RocksDBGenericOptionsToDbOptionsColumnFamilyOptionsAdapterTest|RelationalSmokeTest|MockProcessorContextStateStoreTest).java"/>
 
     <suppress checks="JavaNCSS"
-              files="(EosV2UpgradeIntegrationTest|KStreamKStreamJoinTest|TaskManagerTest).java"/>
+              files="(EosV2UpgradeIntegrationTest|KStreamKStreamJoinTest|StreamThreadTest|TaskManagerTest).java"/>
 
     <suppress checks="NPathComplexity"
               files="(EosV2UpgradeIntegrationTest|EosTestDriver|KStreamKStreamJoinTest|KTableKTableForeignKeyJoinIntegrationTest|RelationalSmokeTest|MockProcessorContextStateStoreTest).java"/>

diff --git a/streams/src/main/java/org/apache/kafka/streams/KafkaStreams.java b/streams/src/main/java/org/apache/kafka/streams/KafkaStreams.java
@@ -56,7 +56,6 @@
 import org.apache.kafka.streams.processor.internals.ClientUtils;
 import org.apache.kafka.streams.processor.internals.DefaultKafkaClientSupplier;
 import org.apache.kafka.streams.processor.internals.GlobalStreamThread;
-import org.apache.kafka.streams.processor.internals.GlobalStreamThread.State;
 import org.apache.kafka.streams.processor.internals.StateDirectory;
 import org.apache.kafka.streams.processor.internals.StreamThread;
 import org.apache.kafka.streams.processor.internals.StreamsMetadataState;
@@ -65,6 +64,7 @@
 import org.apache.kafka.streams.processor.internals.TopologyMetadata;
 import org.apache.kafka.streams.processor.internals.assignment.AssignorError;
 import org.apache.kafka.streams.processor.internals.metrics.StreamsMetricsImpl;
+import org.apache.kafka.streams.processor.internals.namedtopology.NamedTopology;
 import org.apache.kafka.streams.query.FailureReason;
 import org.apache.kafka.streams.query.PositionBound;
 import org.apache.kafka.streams.query.QueryConfig;
@@ -111,6 +111,7 @@
 import static org.apache.kafka.streams.internals.ApiUtils.validateMillisecondDuration;
 import static org.apache.kafka.streams.internals.StreamsConfigUtils.getTotalCacheSize;
 import static org.apache.kafka.streams.processor.internals.ClientUtils.fetchEndOffsets;
+import static org.apache.kafka.streams.processor.internals.TopologyMetadata.UNNAMED_TOPOLOGY;
 
 /**
  * A Kafka client that allows for performing continuous computation on input coming from one or more input topics and
@@ -1662,6 +1663,51 @@ public <T> T store(final StoreQueryParameters<T> storeQueryParameters) {
         return queryableStoreProvider.getStore(storeQueryParameters);
     }
 
+    /**
+     *  This method pauses processing for the KafkaStreams instance.
+     *
+     *  Paused topologies will only skip over a) processing, b) punctuation, and c) standby tasks.
+     *  Notably, paused topologies will still poll Kafka consumers, and commit offsets.
+     *  This method sets transient state that is not maintained or managed among instances.
+     *  Note that pause() can be called before start() in order to start a KafkaStreams instance
+     *  in a manner where the processing is paused as described, but the consumers are started up.
+     */
+    public void pause() {
+        if (topologyMetadata.hasNamedTopologies()) {
+            for (final NamedTopology namedTopology : topologyMetadata.getAllNamedTopologies()) {
+                topologyMetadata.pauseTopology(namedTopology.name());
+            }
+        } else {
+            topologyMetadata.pauseTopology(UNNAMED_TOPOLOGY);
+        }
+    }
+
+    /**
+     * @return true when the KafkaStreams instance has its processing paused.
+     */
+    public boolean isPaused() {
+        if (topologyMetadata.hasNamedTopologies()) {
+            return topologyMetadata.getAllNamedTopologies().stream()
+                .map(NamedTopology::name)
+                .allMatch(topologyMetadata::isPaused);
+        } else {
+            return topologyMetadata.isPaused(UNNAMED_TOPOLOGY);
+        }
+    }
+
+    /**
+     * This method resumes processing for the KafkaStreams instance.
+     */
+    public void resume() {
+        if (topologyMetadata.hasNamedTopologies()) {
+            for (final NamedTopology namedTopology : topologyMetadata.getAllNamedTopologies()) {
+                topologyMetadata.resumeTopology(namedTopology.name());
+            }
+        } else {
+            topologyMetadata.resumeTopology(UNNAMED_TOPOLOGY);
+        }
+    }
+
     /**
      * handle each stream thread in a snapshot of threads.
      * noted: iteration over SynchronizedList is not thread safe so it must be manually synchronized. However, we may

diff --git a/streams/src/main/java/org/apache/kafka/streams/processor/internals/StoreChangelogReader.java b/streams/src/main/java/org/apache/kafka/streams/processor/internals/StoreChangelogReader.java
@@ -415,68 +415,111 @@ public Set<TopicPartition> completedChangelogs() {
     public void restore(final Map<TaskId, Task> tasks) {
         initializeChangelogs(tasks, registeredChangelogs());
 
-        if (!activeRestoringChangelogs().isEmpty() && state == ChangelogReaderState.STANDBY_UPDATING) {
-            throw new IllegalStateException("Should not be in standby updating state if there are still un-completed active changelogs");
+        if (!activeRestoringChangelogs().isEmpty()
+            && state == ChangelogReaderState.STANDBY_UPDATING) {
+            throw new IllegalStateException(
+                "Should not be in standby updating state if there are still un-completed active changelogs");
         }
 
         if (allChangelogsCompleted()) {
             log.debug("Finished restoring all changelogs {}", changelogs.keySet());
-            return;
-        }
+        } else {
+            final Set<TopicPartition> restoringChangelogs = restoringChangelogs();
+            if (!restoringChangelogs.isEmpty()) {
+                final ConsumerRecords<byte[], byte[]> polledRecords;
 
-        final Set<TopicPartition> restoringChangelogs = restoringChangelogs();
-        if (!restoringChangelogs.isEmpty()) {
-            final ConsumerRecords<byte[], byte[]> polledRecords;
+                try {
+                    // for restoring active and updating standby we may prefer different poll time
+                    // in order to make sure we call the main consumer#poll in time.
+                    // TODO: once we move ChangelogReader to a separate thread this may no longer be a concern
+                    // JNH: Fix this?
+                    // Update state based on paused/resumed status.
+                    for (final TopicPartition partition : restoringChangelogs) {
+                        final TaskId taskId = changelogs.get(partition).stateManager.taskId();
+                        final Task task = tasks.get(taskId);
+                        if (task != null) {
+                            restoreConsumer.resume(Collections.singleton(partition));
+                        } else {
+                            restoreConsumer.pause(Collections.singleton(partition));
+                        }
+                    }
 
-            try {
-                // for restoring active and updating standby we may prefer different poll time
-                // in order to make sure we call the main consumer#poll in time.
-                // TODO: once we move ChangelogReader to a separate thread this may no longer be a concern
-                polledRecords = restoreConsumer.poll(state == ChangelogReaderState.STANDBY_UPDATING ? Duration.ZERO : pollTime);
-
-                // TODO (?) If we cannot fetch records during restore, should we trigger `task.timeout.ms` ?
-                // TODO (?) If we cannot fetch records for standby task, should we trigger `task.timeout.ms` ?
-            } catch (final InvalidOffsetException e) {
-                log.warn("Encountered " + e.getClass().getName() +
-                    " fetching records from restore consumer for partitions " + e.partitions() + ", it is likely that " +
-                    "the consumer's position has fallen out of the topic partition offset range because the topic was " +
-                    "truncated or compacted on the broker, marking the corresponding tasks as corrupted and re-initializing" +
-                    " it later.", e);
-
-                final Set<TaskId> corruptedTasks = new HashSet<>();
-                e.partitions().forEach(partition -> corruptedTasks.add(changelogs.get(partition).stateManager.taskId()));
-                throw new TaskCorruptedException(corruptedTasks, e);
-            } catch (final KafkaException e) {
-                throw new StreamsException("Restore consumer get unexpected error polling records.", e);
-            }
+                    polledRecords = restoreConsumer.poll(
+                        state == ChangelogReaderState.STANDBY_UPDATING ? Duration.ZERO : pollTime);
+
+                    // TODO (?) If we cannot fetch records during restore, should we trigger `task.timeout.ms` ?
+                    // TODO (?) If we cannot fetch records for standby task, should we trigger `task.timeout.ms` ?
+                } catch (final InvalidOffsetException e) {
+                    log.warn("Encountered " + e.getClass().getName() +
+                        " fetching records from restore consumer for partitions " + e.partitions()
+                        + ", it is likely that " +
+                        "the consumer's position has fallen out of the topic partition offset range because the topic was "
+                        +
+                        "truncated or compacted on the broker, marking the corresponding tasks as corrupted and re-initializing"
+                        +
+                        " it later.", e);
+
+                    final Set<TaskId> corruptedTasks = new HashSet<>();
+                    e.partitions().forEach(partition -> corruptedTasks.add(
+                        changelogs.get(partition).stateManager.taskId()));
+                    throw new TaskCorruptedException(corruptedTasks, e);
+                } catch (final KafkaException e) {
+                    throw new StreamsException(
+                        "Restore consumer get unexpected error polling records.", e);
+                }
 
-            for (final TopicPartition partition : polledRecords.partitions()) {
-                bufferChangelogRecords(restoringChangelogByPartition(partition), polledRecords.records(partition));
-            }
+                // JNH: Fix this?
+                for (final TopicPartition partition : polledRecords.partitions()) {
+                    bufferChangelogRecords(restoringChangelogByPartition(partition),
+                        polledRecords.records(partition));
+                }
 
-            for (final TopicPartition partition : restoringChangelogs) {
-                // even if some partition do not have any accumulated data, we still trigger
-                // restoring since some changelog may not need to restore any at all, and the
-                // restore to end check needs to be executed still.
-                // TODO: we always try to restore as a batch when some records are accumulated, which may result in
-                //       small batches; this can be optimized in the future, e.g. wait longer for larger batches.
-                final TaskId taskId = changelogs.get(partition).stateManager.taskId();
-                try {
-                    if (restoreChangelog(changelogs.get(partition))) {
-                        tasks.get(taskId).clearTaskTimeout();
+                for (final TopicPartition partition : restoringChangelogs) {
+                    // even if some partition do not have any accumulated data, we still trigger
+                    // restoring since some changelog may not need to restore any at all, and the
+                    // restore to end check needs to be executed still.
+                    // TODO: we always try to restore as a batch when some records are accumulated, which may result in
+                    //       small batches; this can be optimized in the future, e.g. wait longer for larger batches.
+                    final TaskId taskId = changelogs.get(partition).stateManager.taskId();
+                    // JNH: Need to revisit
+                /* Skip over paused tasks
+                final Task task = tasks.get(taskId);
+                if (task != null) {
+                    try {
+                        if (restoreChangelog(changelogs.get(partition))) {
+                            task.clearTaskTimeout();
+                        }
+                    } catch (final TimeoutException timeoutException) {
+                        tasks.get(taskId).maybeInitTaskTimeoutOrThrow(
+                            time.milliseconds(),
+                            timeoutException
+                        );
+                    }
+                }
+                */
+
+                    // Read from all topics.
+                    try {
+                        if (restoreChangelog(changelogs.get(partition))) {
+                            final Task task = tasks.get(taskId);
+                            if (task != null) {
+                                task.clearTaskTimeout();
+                            }
+                        }
+                    } catch (final TimeoutException timeoutException) {
+                        tasks.get(taskId).maybeInitTaskTimeoutOrThrow(
+                            time.milliseconds(),
+                            timeoutException
+                        );
                     }
-                } catch (final TimeoutException timeoutException) {
-                    tasks.get(taskId).maybeInitTaskTimeoutOrThrow(
-                        time.milliseconds(),
-                        timeoutException
-                    );
                 }
-            }
 
-            maybeUpdateLimitOffsetsForStandbyChangelogs(tasks);
+                maybeUpdateLimitOffsetsForStandbyChangelogs(tasks);
 
-            maybeLogRestorationProgress();
+                maybeLogRestorationProgress();
+            }
         }
+
     }
 
     private void maybeLogRestorationProgress() {
@@ -633,7 +676,11 @@ private Set<Task> getTasksFromPartitions(final Map<TaskId, Task> tasks,
     }
 
     private void clearTaskTimeout(final Set<Task> tasks) {
-        tasks.forEach(Task::clearTaskTimeout);
+        tasks.forEach(t -> {
+            if (t != null) {
+                t.clearTaskTimeout();
+            }
+        });
     }
 
     private void maybeInitTaskTimeoutOrThrow(final Set<Task> tasks,

diff --git a/streams/src/main/java/org/apache/kafka/streams/processor/internals/StreamThread.java b/streams/src/main/java/org/apache/kafka/streams/processor/internals/StreamThread.java
@@ -897,7 +897,8 @@ private void initializeAndRestorePhase() {
         }
         // we can always let changelog reader try restoring in order to initialize the changelogs;
         // if there's no active restoring or standby updating it would not try to fetch any data
-        changelogReader.restore(taskManager.tasks());
+        // After KAFKA-13873, we only restore the not paused tasks.
+        changelogReader.restore(taskManager.notPausedTasks());
         log.debug("Idempotent restore call done. Thread state has not changed.");
     }
 

diff --git a/...ams/src/main/java/org/apache/kafka/streams/processor/internals/TaskExecutionMetadata.java b/...ams/src/main/java/org/apache/kafka/streams/processor/internals/TaskExecutionMetadata.java
@@ -35,21 +35,27 @@ public class TaskExecutionMetadata {
     private static final long CONSTANT_BACKOFF_MS = 5_000L;
 
     private final boolean hasNamedTopologies;
+    private final Set<String> pausedTopologies;
     // map of topologies experiencing errors/currently under backoff
     private final ConcurrentHashMap<String, NamedTopologyMetadata> topologyNameToErrorMetadata = new ConcurrentHashMap<>();
 
-    public TaskExecutionMetadata(final Set<String> allTopologyNames) {
+    public TaskExecutionMetadata(final Set<String> allTopologyNames, final Set<String> pausedTopologies) {
         this.hasNamedTopologies = !(allTopologyNames.size() == 1 && allTopologyNames.contains(UNNAMED_TOPOLOGY));
+        this.pausedTopologies = pausedTopologies;
     }
 
     public boolean canProcessTask(final Task task, final long now) {
         final String topologyName = task.id().topologyName();
         if (!hasNamedTopologies) {
             // TODO implement error handling/backoff for non-named topologies (needs KIP)
-            return true;
+            return !pausedTopologies.contains(UNNAMED_TOPOLOGY);
         } else {
-            final NamedTopologyMetadata metadata = topologyNameToErrorMetadata.get(topologyName);
-            return metadata == null || (metadata.canProcess() && metadata.canProcessTask(task, now));
+            if (pausedTopologies.contains(topologyName)) {
+                return false;
+            } else {
+                final NamedTopologyMetadata metadata = topologyNameToErrorMetadata.get(topologyName);
+                return metadata == null || (metadata.canProcess() && metadata.canProcessTask(task, now));
+            }
         }
     }
 

diff --git a/streams/src/main/java/org/apache/kafka/streams/processor/internals/TaskExecutor.java b/streams/src/main/java/org/apache/kafka/streams/processor/internals/TaskExecutor.java
@@ -275,7 +275,7 @@ private void commitSuccessfullyProcessedTasks() {
     int punctuate() {
         int punctuated = 0;
 
-        for (final Task task : tasks.activeTasks()) {
+        for (final Task task : tasks.notPausedTasks()) {
             try {
                 if (task.maybePunctuateStreamTime()) {
                     punctuated++;

diff --git a/streams/src/main/java/org/apache/kafka/streams/processor/internals/TaskManager.java b/streams/src/main/java/org/apache/kafka/streams/processor/internals/TaskManager.java
@@ -1018,6 +1018,11 @@ Map<TaskId, Task> tasks() {
         return tasks.tasksPerId();
     }
 
+    Map<TaskId, Task> notPausedTasks() {
+        return Collections.unmodifiableMap(tasks.notPausedTasks().stream()
+            .collect(Collectors.toMap(Task::id, v -> v)));
+    }
+
     Map<TaskId, Task> activeTaskMap() {
         return activeTaskStream().collect(Collectors.toMap(Task::id, t -> t));
     }

diff --git a/streams/src/main/java/org/apache/kafka/streams/processor/internals/Tasks.java b/streams/src/main/java/org/apache/kafka/streams/processor/internals/Tasks.java
@@ -16,6 +16,7 @@
  */
 package org.apache.kafka.streams.processor.internals;
 
+import java.util.ArrayList;
 import org.apache.kafka.clients.consumer.Consumer;
 import org.apache.kafka.common.Metric;
 import org.apache.kafka.common.MetricName;
@@ -273,6 +274,13 @@ Collection<Task> allTasks() {
         return readOnlyTasks;
     }
 
+    Collection<Task> notPausedTasks() {
+        return new ArrayList<>(readOnlyTasks)
+            .stream()
+            .filter(t -> !topologyMetadata.isPaused(t.id().topologyName()))
+            .collect(Collectors.toList());
+    }
+
     Set<TaskId> activeTaskIds() {
         return readOnlyActiveTaskIds;
     }