Commit 2f29dbd

[b/356461225] Add progress during HDFS dumping (#673)
* [b/356461225] Add progress during HDFS dumping
* Fix HDFS extraction stats, simplify ScanContext & SingleDirScanJob
* Separate PROGRESS logging vs DEBUG logging (see ScanContext.PROGRESS_DEBUG_STATS)
* Address review comments
1 parent 50d615c commit 2f29dbd

File tree: 5 files changed, +224 -48 lines


dumper/app/src/main/java/com/google/edwmigration/dumper/application/dumper/ConnectorArguments.java

Lines changed: 14 additions & 0 deletions
@@ -141,11 +141,14 @@ public class ConnectorArguments extends DefaultArguments {
   public static final String OPT_HADOOP_RPC_PROTECTION = "hadoop-rpc-protection";

   public static final String OPT_HDFS_PRINCIPAL_PREFIX = "hdfs-principal-prefix";
+
+  public static final String OPT_HDFS_SCAN_ROOT_PATH = "hdfs-scan-root-path";
   public static final String OPT_HADOOP_CORE_SITE_XML_DEFAULT =
       "/etc/hadoop/conf.cloudera.hdfs/core-site.xml";
   public static final String OPT_HADOOP_HDFS_SITE_XML_DEFAULT =
       "/etc/hadoop/conf.cloudera.hdfs/hdfs-site.xml";
   public static final String OPT_HDFS_PRINCIPAL_PREFIX_DEFAULT = "hdfs/_HOST@";
+  public static final String OPT_HDFS_SCAN_ROOT_PATH_DEFAULT = "/";
   // Ranger.
   public static final String OPT_RANGER_PORT_DEFAULT = "6080";
   public static final String OPT_RANGER_PAGE_SIZE = "ranger-page-size";

@@ -484,6 +487,13 @@ public class ConnectorArguments extends DefaultArguments {
           .ofType(String.class)
           .defaultsTo(OPT_HDFS_PRINCIPAL_PREFIX_DEFAULT);

+  private final OptionSpec<String> optionHdfsScanRootPath =
+      parser
+          .accepts(OPT_HDFS_SCAN_ROOT_PATH, "HDFS root path to be scanned recursively.")
+          .withRequiredArg()
+          .ofType(String.class)
+          .defaultsTo(OPT_HDFS_SCAN_ROOT_PATH_DEFAULT);
+
   public final OptionSpec<Void> optionKerberosAuthForHadoop =
       parser
           .accepts(OPT_KERBEROS_AUTH_FOR_HADOOP, "Use Kerberos auth for Hadoop.")

@@ -1004,6 +1014,10 @@ public String getHdfsPrincipalPrefix() {
     return getOptions().valueOf(optionHdfsPrincipalPrefix);
   }

+  public String getHdfsScanRootPath() {
+    return getOptions().valueOf(optionHdfsScanRootPath);
+  }
+
   @CheckForNull
   public String getGenericQuery() {
     return getOptions().valueOf(optionGenericQuery);
dumper/app/src/main/java/com/google/edwmigration/dumper/application/dumper/TasksRunner.java

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@
 public class TasksRunner implements TaskRunContextOps {

   private static final Logger LOG = LoggerFactory.getLogger(TasksRunner.class);
-  private static final Logger PROGRESS_LOG = LoggerFactory.getLogger("progress-logger");
+  public static final Logger PROGRESS_LOG = LoggerFactory.getLogger("progress-logger");

   private AtomicInteger numberOfCompletedTasks;
   private final int totalNumberOfTasks;
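
Making PROGRESS_LOG public lets classes outside TasksRunner (per the commit message, the HDFS ScanContext) write to the dedicated "progress-logger" logger instead of declaring their own. A hypothetical sketch of that pattern; the helper class and message format are illustrative, not code from this commit:

import org.slf4j.Logger;

// Assumed to live alongside TasksRunner (same package) purely for illustration.
final class HdfsScanProgressReporter {
  // Reuse the shared progress logger exposed by TasksRunner.
  private static final Logger PROGRESS_LOG = TasksRunner.PROGRESS_LOG;

  static void report(long scannedDirs, long scannedFiles) {
    // Progress lines go to "progress-logger", separate from the class-level DEBUG logger.
    PROGRESS_LOG.info("HDFS scan progress: {} directories, {} files", scannedDirs, scannedFiles);
  }
}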

dumper/app/src/main/java/com/google/edwmigration/dumper/application/dumper/connector/hdfs/HdfsExtractionTask.java

Lines changed: 37 additions & 11 deletions
@@ -16,6 +16,9 @@
  */
 package com.google.edwmigration.dumper.application.dumper.connector.hdfs;

+import static com.google.edwmigration.dumper.application.dumper.ConnectorArguments.OPT_HDFS_SCAN_ROOT_PATH;
+import static com.google.edwmigration.dumper.application.dumper.ConnectorArguments.OPT_THREAD_POOL_SIZE;
+import static com.google.edwmigration.dumper.application.dumper.connector.hdfs.SingleDirScanJob.trimExceptionMessage;
 import static java.lang.String.format;
 import static java.nio.charset.StandardCharsets.UTF_8;

@@ -35,6 +38,7 @@
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.TimeUnit;
 import javax.annotation.Nonnull;
+import org.apache.hadoop.fs.ContentSummary;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.DistributedFileSystem;

@@ -44,12 +48,17 @@
 public class HdfsExtractionTask extends AbstractTask<Void> implements HdfsExtractionDumpFormat {
   private static final Logger LOG = LoggerFactory.getLogger(HdfsExtractionTask.class);

-  private final int poolSize;
+  private final int threadPoolSize;
+  private final String hdfsScanRootPath;

   HdfsExtractionTask(@Nonnull ConnectorArguments args) {
     super(HdfsFormat.ZIP_ENTRY_NAME);
-    Preconditions.checkNotNull(args, "Arguments was null.");
-    poolSize = args.getThreadPoolSize();
+    threadPoolSize = args.getThreadPoolSize();
+    Preconditions.checkArgument(
+        threadPoolSize > 0, "Argument %s should be positive number", OPT_THREAD_POOL_SIZE);
+    hdfsScanRootPath = args.getHdfsScanRootPath();
+    Preconditions.checkArgument(
+        !hdfsScanRootPath.isEmpty(), "Argument %s should be non-empty", OPT_HDFS_SCAN_ROOT_PATH);
   }

   @Override

@@ -74,21 +83,38 @@ protected Void doRun(TaskRunContext context, @Nonnull ByteSink sink, @Nonnull Ha
     DistributedFileSystem fs = ((HdfsHandle) handle).getDfs();
     // Create a dedicated ExecutorService to use:
     ExecutorService execService =
-        ExecutorManager.newExecutorServiceWithBackpressure("hdfs-extraction", poolSize);
+        ExecutorManager.newExecutorServiceWithBackpressure("hdfs-extraction", threadPoolSize);
     try (Writer output = sink.asCharSink(UTF_8).openBufferedStream();
-        ScanContext scanCtx = new ScanContext(fs, output);
-        ExecutorManager execManager = new ExecutorManager(execService)) {
+        ExecutorManager execManager = new ExecutorManager(execService);
+        ScanContext scanCtx = new ScanContext(execManager, fs, output)) {

-      String hdfsPath = "/";
-      FileStatus rootDir = fs.getFileStatus(new Path(hdfsPath));
-      SingleDirScanJob rootJob = new SingleDirScanJob(scanCtx, execManager, rootDir);
-      execManager.execute(rootJob); // The root job executes immediately
+      LOG.info(
+          "Running HDFS extraction\n\t{}: {}\n\t{}: {}",
+          OPT_HDFS_SCAN_ROOT_PATH,
+          hdfsScanRootPath,
+          OPT_THREAD_POOL_SIZE,
+          threadPoolSize);
+      FileStatus rootDir = fs.getFileStatus(new Path(hdfsScanRootPath));
+      scanCtx.submitRootDirScanJob(rootDir, getContentSummaryFor(fs, rootDir));
       execManager.await(); // Wait until all (recursive) tasks are done executing
-      LOG.info(scanCtx.getFormattedStats());
+      LOG.info("Final stats:\n{}", scanCtx.getDetailedStats());
     } finally {
       // Shutdown the dedicated ExecutorService:
       MoreExecutors.shutdownAndAwaitTermination(execService, 100, TimeUnit.MILLISECONDS);
     }
     return null;
   }
+
+  private ContentSummary getContentSummaryFor(DistributedFileSystem dfs, FileStatus file) {
+    try {
+      return dfs.getContentSummary(file.getPath());
+    } catch (org.apache.hadoop.security.AccessControlException exn) {
+      LOG.info(
+          "Progress for HDFS extraction won't be displayed due to AccessControlException: {}",
+          trimExceptionMessage(exn.getMessage()));
+    } catch (IOException exn) {
+      LOG.error("Progress for HDFS extraction won't be displayed due to IOException: ", exn);
+    }
+    return null;
+  }
 }
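
ScanContext and SingleDirScanJob are not shown in this excerpt, but the nullable ContentSummary returned above hints at how progress can be reported: when the root summary is available, running counters can be expressed as a fraction of the known file count; when it is null (for example after an AccessControlException), only absolute counts remain. A hypothetical sketch of that accounting; the class and method names are illustrative, not the actual ScanContext API:

import java.util.concurrent.atomic.AtomicLong;
import javax.annotation.Nullable;
import org.apache.hadoop.fs.ContentSummary;

final class ProgressStats {
  private final AtomicLong scannedFiles = new AtomicLong();
  @Nullable private final ContentSummary rootSummary; // null means the total is unknown

  ProgressStats(@Nullable ContentSummary rootSummary) {
    this.rootSummary = rootSummary;
  }

  void onFileScanned() {
    scannedFiles.incrementAndGet();
  }

  String progressMessage() {
    long scanned = scannedFiles.get();
    if (rootSummary == null) {
      return String.format("Scanned %d files (total unknown)", scanned);
    }
    long total = Math.max(rootSummary.getFileCount(), 1); // guard against division by zero
    return String.format("Scanned %d of %d files (%.1f%%)", scanned, total, 100.0 * scanned / total);
  }
}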
