Skip to content

Commit 2d563de

Browse files
authored
By default don't allow index_hadoop tasks to run on a cluster, forcing operators to acknowledge that they are using a deprecated feature (apache#18239)
* By default don't allow index_hadoop tasks to run on a cluster, forcing operators to acknowledge that they are using a deprecated feature * update unclear recommendation from log * Fixup codeql warning * fix UT
1 parent edf2162 commit 2d563de

5 files changed

Lines changed: 88 additions & 5 deletions

File tree

docs/configuration/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1414,6 +1414,7 @@ Additional Peon configs include:
14141414
|`druid.indexer.task.ignoreTimestampSpecForDruidInputSource`|If true, tasks using the [Druid input source](../ingestion/input-sources.md) will ignore the provided timestampSpec, and will use the `__time` column of the input datasource. This option is provided for compatibility with ingestion specs written before Druid 0.22.0.|false|
14151415
|`druid.indexer.task.storeEmptyColumns`|Boolean value for whether or not to store empty columns during ingestion. When set to true, Druid stores every column specified in the [`dimensionsSpec`](../ingestion/ingestion-spec.md#dimensionsspec). If you use the string-based schemaless ingestion and don't specify any dimensions to ingest, you must also set [`includeAllDimensions`](../ingestion/ingestion-spec.md#dimensionsspec) for Druid to store empty columns.<br/><br/>If you set `storeEmptyColumns` to false, Druid SQL queries referencing empty columns will fail. If you intend to leave `storeEmptyColumns` disabled, you should either ingest placeholder data for empty columns or else not query on empty columns.<br/><br/>You can overwrite this configuration by setting `storeEmptyColumns` in the [task context](../ingestion/tasks.md#context-parameters).|true|
14161416
|`druid.indexer.task.tmpStorageBytesPerTask`|Maximum number of bytes per task to be used to store temporary files on disk. This config is generally intended for internal usage. Attempts to set it are very likely to be overwritten by the TaskRunner that executes the task, so be sure of what you expect to happen before directly adjusting this configuration parameter. The config is documented here primarily to provide an understanding of what it means if/when someone sees that it has been set. A value of -1 disables this limit. |-1|
1417+
|`druid.indexer.task.allowHadoopTaskExecution`|Boolean flag controlling whether the cluster allows `index_hadoop` tasks to be executed. `index_hadoop` is deprecated; defaulting this to false forces cluster operators to acknowledge the deprecation and consciously opt in to using `index_hadoop`, with the understanding that it will be removed in a future release.|false|
14171418
|`druid.indexer.server.maxChatRequests`|Maximum number of concurrent requests served by a task's chat handler. Set to 0 to disable limiting.|0|
14181419

14191420
If the Peon is running in remote mode, there must be an Overlord up and running. Peons in remote mode can set the following configurations:

indexing-hadoop/src/main/java/org/apache/druid/indexer/HadoopIndexTask.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,22 @@ public TaskStatus runTask(TaskToolbox toolbox)
295295
{
296296
try {
297297
taskConfig = toolbox.getConfig();
298+
if (!taskConfig.isAllowHadoopTaskExecution()) {
299+
errorMsg = StringUtils.format(
300+
"Hadoop tasks are deprecated and will be removed in a future release. "
301+
+ "Currently, they are not allowed to run on this cluster. If you wish to run them despite deprecation, "
302+
+ "please set [%s] to true.",
303+
TaskConfig.ALLOW_HADOOP_TASK_EXECUTION_KEY
304+
);
305+
log.error(errorMsg);
306+
toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports());
307+
return TaskStatus.failure(getId(), errorMsg);
308+
}
309+
log.warn("Running deprecated index_hadoop task [%s]. "
310+
+ "Hadoop batch indexing is deprecated and will be removed in a future release. "
311+
+ "Please plan your migration to one of Druid's supported indexing patterns.",
312+
getId()
313+
);
298314
if (chatHandlerProvider.isPresent()) {
299315
log.info("Found chat handler of class[%s]", chatHandlerProvider.get().getClass().getName());
300316
chatHandlerProvider.get().register(getId(), this, false);

indexing-hadoop/src/test/java/org/apache/druid/indexer/HadoopIndexTaskTest.java

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@
2323
import com.google.common.collect.ImmutableList;
2424
import com.google.common.collect.ImmutableMap;
2525
import org.apache.druid.indexer.granularity.UniformGranularitySpec;
26+
import org.apache.druid.indexing.common.TaskToolbox;
27+
import org.apache.druid.indexing.common.config.TaskConfigBuilder;
28+
import org.apache.druid.indexing.overlord.TestTaskToolboxFactory;
2629
import org.apache.druid.jackson.DefaultObjectMapper;
2730
import org.apache.druid.java.util.common.Intervals;
2831
import org.apache.druid.java.util.common.granularity.Granularities;
@@ -41,6 +44,45 @@ public class HadoopIndexTaskTest
4144
{
4245
private final ObjectMapper jsonMapper = new DefaultObjectMapper();
4346

47+
@Test
48+
public void testHadoopTaskWontRunWithDefaultTaskConfig()
49+
{
50+
final HadoopIndexTask task = new HadoopIndexTask(
51+
null,
52+
new HadoopIngestionSpec(
53+
DataSchema.builder()
54+
.withDataSource("foo")
55+
.withGranularity(
56+
new UniformGranularitySpec(
57+
Granularities.DAY,
58+
null,
59+
ImmutableList.of(Intervals.of("2010-01-01/P1D"))
60+
)
61+
)
62+
.withObjectMapper(jsonMapper)
63+
.build(),
64+
new HadoopIOConfig(ImmutableMap.of("paths", "bar"), null, null),
65+
null
66+
),
67+
null,
68+
null,
69+
"blah",
70+
jsonMapper,
71+
null,
72+
AuthTestUtils.TEST_AUTHORIZER_MAPPER,
73+
null,
74+
new HadoopTaskConfig(null, null)
75+
);
76+
77+
TestTaskToolboxFactory.Builder builder = new TestTaskToolboxFactory.Builder().setConfig(new TaskConfigBuilder().build());
78+
TaskToolbox toolbox = new TestTaskToolboxFactory(builder).build(task);
79+
80+
Assert.assertEquals("Hadoop tasks are deprecated and will be removed in a future release. Currently, "
81+
+ "they are not allowed to run on this cluster. If you wish to run them despite deprecation, "
82+
+ "please set [druid.indexer.task.allowHadoopTaskExecution] to true.",
83+
task.runTask(toolbox).getErrorMsg());
84+
}
85+
4486
@Test
4587
public void testCorrectInputSourceResources()
4688
{

indexing-service/src/main/java/org/apache/druid/indexing/common/config/TaskConfig.java

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
*/
4242
public class TaskConfig implements TaskDirectory
4343
{
44+
public static final String ALLOW_HADOOP_TASK_EXECUTION_KEY = "druid.indexer.task.allowHadoopTaskExecution";
4445
private static final Period DEFAULT_DIRECTORY_LOCK_TIMEOUT = new Period("PT10M");
4546
private static final Period DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT = new Period("PT5M");
4647
private static final boolean DEFAULT_STORE_EMPTY_COLUMNS = true;
@@ -76,6 +77,9 @@ public class TaskConfig implements TaskDirectory
7677
@JsonProperty
7778
private final long tmpStorageBytesPerTask;
7879

80+
@JsonProperty
81+
private final boolean allowHadoopTaskExecution;
82+
7983
@JsonCreator
8084
public TaskConfig(
8185
@JsonProperty("baseDir") String baseDir,
@@ -87,7 +91,8 @@ public TaskConfig(
8791
@JsonProperty("ignoreTimestampSpecForDruidInputSource") boolean ignoreTimestampSpecForDruidInputSource,
8892
@JsonProperty("storeEmptyColumns") @Nullable Boolean storeEmptyColumns,
8993
@JsonProperty("encapsulatedTask") boolean enableTaskLevelLogPush,
90-
@JsonProperty("tmpStorageBytesPerTask") @Nullable Long tmpStorageBytesPerTask
94+
@JsonProperty("tmpStorageBytesPerTask") @Nullable Long tmpStorageBytesPerTask,
95+
@JsonProperty("allowHadoopTaskExecution") boolean allowHadoopTaskExecution
9196
)
9297
{
9398
this.baseDir = Configs.valueOrDefault(baseDir, System.getProperty("java.io.tmpdir"));
@@ -113,6 +118,7 @@ public TaskConfig(
113118

114119
this.storeEmptyColumns = Configs.valueOrDefault(storeEmptyColumns, DEFAULT_STORE_EMPTY_COLUMNS);
115120
this.tmpStorageBytesPerTask = Configs.valueOrDefault(tmpStorageBytesPerTask, DEFAULT_TMP_STORAGE_BYTES_PER_TASK);
121+
this.allowHadoopTaskExecution = allowHadoopTaskExecution;
116122
}
117123

118124
private TaskConfig(
@@ -125,7 +131,8 @@ private TaskConfig(
125131
boolean ignoreTimestampSpecForDruidInputSource,
126132
boolean storeEmptyColumns,
127133
boolean encapsulatedTask,
128-
long tmpStorageBytesPerTask
134+
long tmpStorageBytesPerTask,
135+
boolean allowHadoopTaskExecution
129136
)
130137
{
131138
this.baseDir = baseDir;
@@ -138,6 +145,7 @@ private TaskConfig(
138145
this.storeEmptyColumns = storeEmptyColumns;
139146
this.encapsulatedTask = encapsulatedTask;
140147
this.tmpStorageBytesPerTask = tmpStorageBytesPerTask;
148+
this.allowHadoopTaskExecution = allowHadoopTaskExecution;
141149
}
142150

143151
@JsonProperty
@@ -230,6 +238,12 @@ public long getTmpStorageBytesPerTask()
230238
return tmpStorageBytesPerTask;
231239
}
232240

241+
@JsonProperty
242+
public boolean isAllowHadoopTaskExecution()
243+
{
244+
return allowHadoopTaskExecution;
245+
}
246+
233247
private String defaultDir(@Nullable String configParameter, final String defaultVal)
234248
{
235249
if (configParameter == null) {
@@ -251,7 +265,8 @@ public TaskConfig withBaseTaskDir(File baseTaskDir)
251265
ignoreTimestampSpecForDruidInputSource,
252266
storeEmptyColumns,
253267
encapsulatedTask,
254-
tmpStorageBytesPerTask
268+
tmpStorageBytesPerTask,
269+
allowHadoopTaskExecution
255270
);
256271
}
257272

@@ -267,7 +282,8 @@ public TaskConfig withTmpStorageBytesPerTask(long tmpStorageBytesPerTask)
267282
ignoreTimestampSpecForDruidInputSource,
268283
storeEmptyColumns,
269284
encapsulatedTask,
270-
tmpStorageBytesPerTask
285+
tmpStorageBytesPerTask,
286+
allowHadoopTaskExecution
271287
);
272288
}
273289
}

indexing-service/src/test/java/org/apache/druid/indexing/common/config/TaskConfigBuilder.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ public class TaskConfigBuilder
3636
private Boolean storeEmptyColumns;
3737
private boolean enableTaskLevelLogPush;
3838
private Long tmpStorageBytesPerTask;
39+
private boolean allowHadoopTaskExecution;
3940

4041
public TaskConfigBuilder setBaseDir(String baseDir)
4142
{
@@ -97,6 +98,12 @@ public TaskConfigBuilder setTmpStorageBytesPerTask(Long tmpStorageBytesPerTask)
9798
return this;
9899
}
99100

101+
public TaskConfigBuilder setAllowHadoopTaskExecution(boolean allowHadoopTaskExecution)
102+
{
103+
this.allowHadoopTaskExecution = allowHadoopTaskExecution;
104+
return this;
105+
}
106+
100107
public TaskConfig build()
101108
{
102109
return new TaskConfig(
@@ -109,7 +116,8 @@ public TaskConfig build()
109116
ignoreTimestampSpecForDruidInputSource,
110117
storeEmptyColumns,
111118
enableTaskLevelLogPush,
112-
tmpStorageBytesPerTask
119+
tmpStorageBytesPerTask,
120+
allowHadoopTaskExecution
113121
);
114122
}
115123
}

0 commit comments

Comments
 (0)