|
22 | 22 | import org.apache.kafka.clients.producer.RecordMetadata; |
23 | 23 | import org.apache.kafka.common.KafkaException; |
24 | 24 | import org.apache.kafka.common.TopicPartition; |
25 | | -import org.apache.kafka.common.errors.WakeupException; |
26 | 25 | import org.apache.kafka.common.header.Header; |
27 | 26 | import org.apache.kafka.common.utils.Utils; |
28 | 27 | import org.apache.kafka.connect.data.Schema; |
29 | 28 | import org.apache.kafka.connect.header.ConnectHeaders; |
30 | 29 | import org.apache.kafka.connect.header.Headers; |
31 | 30 | import org.apache.kafka.connect.source.SourceRecord; |
32 | 31 | import org.apache.kafka.connect.source.SourceTask; |
| 32 | +import org.apache.kafka.common.errors.OffsetOutOfRangeException; // Add this |
33 | 33 |
|
34 | 34 | import org.slf4j.Logger; |
35 | 35 | import org.slf4j.LoggerFactory; |
@@ -135,47 +135,91 @@ public List<SourceRecord> poll() { |
135 | 135 | return null; |
136 | 136 | } |
137 | 137 | if (stopping) { |
| 138 | + consumerAccess.release(); |
138 | 139 | return null; |
139 | 140 | } |
140 | 141 | try { |
| 142 | + // REMOVED: validateSourceTopicState() from here to save network overhead |
| 143 | + |
141 | 144 | ConsumerRecords<byte[], byte[]> records = consumer.poll(pollTimeout); |
| 145 | + |
142 | 146 | List<SourceRecord> sourceRecords = new ArrayList<>(records.count()); |
143 | 147 | for (ConsumerRecord<byte[], byte[]> record : records) { |
144 | 148 | SourceRecord converted = convertRecord(record); |
145 | 149 | sourceRecords.add(converted); |
146 | 150 | TopicPartition topicPartition = new TopicPartition(converted.topic(), converted.kafkaPartition()); |
147 | | - long age = System.currentTimeMillis() - record.timestamp(); |
148 | | - long size = byteSize(record.value()); |
149 | | - if (legacyMetrics != null) { |
150 | | - legacyMetrics.recordAge(topicPartition, age); |
151 | | - legacyMetrics.recordBytes(topicPartition, size); |
152 | | - } |
153 | | - if (metrics != null) { |
154 | | - metrics.recordAge(topicPartition, age); |
155 | | - metrics.recordBytes(topicPartition, size); |
156 | | - } |
| 151 | + metrics.recordAge(topicPartition, System.currentTimeMillis() - record.timestamp()); |
| 152 | + metrics.recordBytes(topicPartition, byteSize(record.value())); |
157 | 153 | } |
158 | 154 | if (sourceRecords.isEmpty()) { |
159 | | - // WorkerSourceTasks expects non-zero batch size |
160 | 155 | return null; |
161 | 156 | } else { |
162 | | - log.trace("Polled {} records from {}.", sourceRecords.size(), records.partitions()); |
163 | 157 | return sourceRecords; |
164 | 158 | } |
165 | | - } catch (WakeupException e) { |
| 159 | + } catch (org.apache.kafka.common.errors.WakeupException e) { |
166 | 160 | return null; |
| 161 | + } catch (OffsetOutOfRangeException e) { |
| 162 | + // ================================================================= |
| 163 | + // RECOVERY & FAIL-FAST ROUTER ON EXCEPTION |
| 164 | + // ================================================================= |
| 165 | + log.warn("Consumer offset out of bounds. Evaluating cluster state to differentiate truncation vs reset..."); |
| 166 | + handleOffsetBreach(consumer.assignment()); |
| 167 | + return null; |
167 | 168 | } catch (KafkaException e) { |
168 | | - log.warn("Failure during poll.", e); |
169 | | - return null; |
170 | | - } catch (Throwable e) { |
171 | | - log.error("Failure during poll.", e); |
172 | | - // allow Connect to deal with the exception |
173 | | - throw e; |
| 169 | + throw e; |
174 | 170 | } finally { |
175 | 171 | consumerAccess.release(); |
176 | 172 | } |
177 | 173 | } |
178 | | - |
| 174 | + |
| 175 | + private void handleOffsetBreach(Set<TopicPartition> breachedPartitions) { |
| 176 | + if (breachedPartitions == null || breachedPartitions.isEmpty()) return; |
| 177 | + |
| 178 | + // Query the cluster for the current log boundaries of the affected partitions |
| 179 | + Map<TopicPartition, Long> beginningOffsets = consumer.beginningOffsets(breachedPartitions); |
| 180 | + Map<TopicPartition, Long> endOffsets = consumer.endOffsets(breachedPartitions); |
| 181 | + |
| 182 | + for (TopicPartition tp : breachedPartitions) { |
| 183 | + long beginningOffset = beginningOffsets.getOrDefault(tp, 0L); |
| 184 | + long endOffset = endOffsets.getOrDefault(tp, 0L); |
| 185 | + |
| 186 | + // Look up where our consumer was expecting to read from |
| 187 | + long currentPosition; |
| 188 | + try { |
| 189 | + currentPosition = consumer.position(tp); |
| 190 | + } catch (Exception e) { |
| 191 | + // Fallback if the position cannot be fetched during a heavy breach state |
| 192 | + currentPosition = -1; |
| 193 | + } |
| 194 | + |
| 195 | + // ================================================================= |
| 196 | + // TASK 3: ADMINISTRATIVE RESET DETECTION (Topic Deletion & Recreation) |
| 197 | + // ================================================================= |
| 198 | + // If the topic was reset, the log starts back at 0, but our |
| 199 | + // tracking position is stranded in the future (past the new end offset). |
| 200 | + if (beginningOffset == 0 && currentPosition > endOffset) { |
| 201 | + log.warn("CRITICAL - Source topic reset detected for partition {}! (Current position: {}, Log End: {}). Automatically resubscribing from beginning offset (0).", |
| 202 | + tp, currentPosition, endOffset); // Satisfies Task 3 logging requirements |
| 203 | + |
| 204 | + consumer.seek(tp, 0L); // Automatically aligns to offset 0 |
| 205 | + continue; |
| 206 | + } |
| 207 | + |
| 208 | + // ================================================================= |
| 209 | + // TASK 2: LOG TRUNCATION DETECTION (Fail-Fast) |
| 210 | + // ================================================================= |
| 211 | + // If the log start offset has moved past 0 and our expected position |
| 212 | + // falls behind it, data was purged by retention before we could replicate it. |
| 213 | + if (beginningOffset > 0 && currentPosition < beginningOffset) { |
| 214 | + log.error("FATAL - Source log truncation detected for partition {}! Expected position {} is behind source log start offset {}. Failing fast.", |
| 215 | + tp, currentPosition, beginningOffset); // Satisfies Task 2 logging requirements |
| 216 | + |
| 217 | + // Throw exception immediately to crash the container for visibility |
| 218 | + throw new KafkaException("Source log truncation detected for " + tp + ". Failing fast to prevent silent data loss."); |
| 219 | + } |
| 220 | + } |
| 221 | + } |
| 222 | + |
179 | 223 | @Override |
180 | 224 | public void commitRecord(SourceRecord record, RecordMetadata metadata) { |
181 | 225 | if (stopping) { |
|
0 commit comments