Skip to content

Commit 8ac9721

Browse files
authored
HADOOP-19795: ABFS. GetPathStatus Optimization on OpenFileForRead (apache#8212)
Contributed by Manika Joshi
1 parent 25bd0dc commit 8ac9721

17 files changed

Lines changed: 1026 additions & 47 deletions

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AbfsConfiguration.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -627,6 +627,10 @@ public class AbfsConfiguration{
627627
DefaultValue = DEFAULT_AZURE_READ_POLICY)
628628
private String abfsReadPolicy;
629629

630+
@BooleanConfigurationValidatorAnnotation(ConfigurationKey = FS_AZURE_RESTRICT_GPS_ON_OPENFILE,
631+
DefaultValue = DEFAULT_FS_AZURE_RESTRICT_GPS_ON_OPENFILE)
632+
private boolean restrictGpsOnOpenFile;
633+
630634
private String clientProvidedEncryptionKey;
631635
private String clientProvidedEncryptionKeySHA;
632636

@@ -1389,6 +1393,14 @@ public String getAbfsReadPolicy() {
13891393
return abfsReadPolicy;
13901394
}
13911395

1396+
/**
1397+
* Indicates whether GPS restriction on open file is enabled.
1398+
* @return true if GPS restriction is enabled on open file, false otherwise.
1399+
*/
1400+
public boolean shouldRestrictGpsOnOpenFile() {
1401+
return restrictGpsOnOpenFile;
1402+
}
1403+
13921404
/**
13931405
* Enum config to allow user to pick format of x-ms-client-request-id header
13941406
* @return tracingContextFormat config if valid, else default ALL_ID_FORMAT

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystem.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,21 @@ public FSDataInputStream open(final Path path, final int bufferSize) throws IOEx
402402
return open(path, Optional.empty());
403403
}
404404

405+
/**
406+
* Open a file for reading and return an {@link FSDataInputStream} that wraps
407+
* the underlying {@link InputStream}.
408+
*
409+
* Note: when the filesystem is configured with `restrictGpsOnOpenFile` enabled
410+
* (its disabled by default), existence check for the file path will be deferred
411+
* and will not occur during this open call; it will happen when the first read
412+
* is attempted on the returned stream.
413+
*
414+
* @param path the location of the file to open
415+
* @param parameters optional {@link OpenFileParameters} which can include
416+
* FileStatus, configuration, buffer size and mandatory keys
417+
* @return an {@link FSDataInputStream} wrapping the opened InputStream
418+
* @throws IOException if an I/O error occurs while opening the file
419+
*/
405420
private FSDataInputStream open(final Path path,
406421
final Optional<OpenFileParameters> parameters) throws IOException {
407422
statIncrement(CALL_OPEN);

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystemStore.java

Lines changed: 77 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@
151151
import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.CHAR_STAR;
152152
import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.CHAR_UNDERSCORE;
153153
import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.DIRECTORY;
154+
import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.EMPTY_STRING;
154155
import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.FILE;
155156
import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.ROOT_PATH;
156157
import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.SINGLE_WHITE_SPACE;
@@ -162,6 +163,7 @@
162163
import static org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.INFINITE_LEASE_DURATION;
163164
import static org.apache.hadoop.fs.azurebfs.constants.FileSystemUriSchemes.ABFS_BLOB_DOMAIN_NAME;
164165
import static org.apache.hadoop.fs.azurebfs.constants.HttpHeaderConfigurations.X_MS_ENCRYPTION_CONTEXT;
166+
import static org.apache.hadoop.fs.azurebfs.services.AbfsErrors.ERR_OPENFILE_ON_DIRECTORY;
165167
import static org.apache.hadoop.fs.azurebfs.utils.UriUtils.isKeyForDirectorySet;
166168

167169
/**
@@ -553,7 +555,7 @@ public Hashtable<String, String> getPathStatus(final Path path,
553555

554556
/**
555557
* Creates an object of {@link ContextEncryptionAdapter}
556-
* from a file path. It calls {@link org.apache.hadoop.fs.azurebfs.services.AbfsClient
558+
* from a file path. It calls {@link org.apache.hadoop.fs.azurebfs.services.AbfsClient
557559
* #getPathStatus(String, boolean, TracingContext, EncryptionAdapter)} method to get
558560
* contextValue (x-ms-encryption-context) from the server. The contextValue is passed
559561
* to the constructor of EncryptionAdapter to create the required object of
@@ -866,6 +868,53 @@ public AbfsInputStream openFileForRead(final Path path,
866868
tracingContext);
867869
}
868870

871+
/**
872+
* Creates an exception indicating that openFileForRead was called on a directory.
873+
*
874+
* @return AbfsRestOperationException with PATH_NOT_FOUND error code and a message
875+
* indicating that openFileForRead must be used with files and not directories.
876+
*/
877+
private AbfsRestOperationException openFileForReadDirectoryException() {
878+
return new AbfsRestOperationException(
879+
AzureServiceErrorCode.PATH_NOT_FOUND.getStatusCode(),
880+
AzureServiceErrorCode.PATH_NOT_FOUND.getErrorCode(),
881+
ERR_OPENFILE_ON_DIRECTORY,
882+
null);
883+
}
884+
885+
/**
886+
* Opens a file for read and returns an {@link AbfsInputStream}.
887+
*
888+
* <p>
889+
* The method decides whether to call the server's GetPathStatus based on:
890+
* <ul>
891+
* <li>the supplied {@code parameters} (if it contains a {@link VersionedFileStatus}
892+
* with a valid encryption context when required),</li>
893+
* <li>the client's encryption type ({@link EncryptionType#ENCRYPTION_CONTEXT}), and</li>
894+
* <li>the configuration flag returned by {@link AbfsConfiguration#shouldRestrictGpsOnOpenFile()}.</li>
895+
* </ul>
896+
* If the encryption type is {@code ENCRYPTION_CONTEXT} the server-supplied
897+
* X-MS-ENCRYPTION-CONTEXT header will be required and used to construct a
898+
* {@link ContextProviderEncryptionAdapter}. If that header is missing a
899+
* {@link PathIOException} is thrown.
900+
* </p>
901+
*
902+
* <p>
903+
* Note: when {@link AbfsConfiguration#shouldRestrictGpsOnOpenFile()} is enabled,
904+
* the implementation won't do the GetPathStatus call. In that case, if the file does not
905+
* actually exist or read is attempted on a directory, {@code openFileForRead} will not fail immediately.
906+
* It will only be detected when the returned stream performs its first read, at which point an appropriate error will be raised.
907+
* </p>
908+
*
909+
* @param path the path to open (may be unqualified)
910+
* @param parameters optional {@link OpenFileParameters} that may include a {@link FileStatus}
911+
* (possibly a {@link VersionedFileStatus}) and other open parameters
912+
* @param statistics filesystem statistics to associate with the returned stream
913+
* @param tracingContext tracing context for remote calls
914+
* @return an {@link AbfsInputStream} for reading the file
915+
* @throws IOException on IO or server errors. A {@link PathIOException} is thrown when
916+
* an expected encryption context header is missing.
917+
*/
869918
public AbfsInputStream openFileForRead(Path path,
870919
final Optional<OpenFileParameters> parameters,
871920
final FileSystem.Statistics statistics, TracingContext tracingContext)
@@ -878,13 +927,13 @@ public AbfsInputStream openFileForRead(Path path,
878927
FileStatus fileStatus = parameters.map(OpenFileParameters::getStatus)
879928
.orElse(null);
880929
String relativePath = getRelativePath(path);
881-
String resourceType, eTag;
882-
long contentLength;
930+
String resourceType = EMPTY_STRING, eTag = EMPTY_STRING;
931+
long contentLength = 0;
883932
ContextEncryptionAdapter contextEncryptionAdapter = NoContextEncryptionAdapter.getInstance();
884933
/*
885934
* GetPathStatus API has to be called in case of:
886-
* 1. fileStatus is null or not an object of VersionedFileStatus: as eTag
887-
* would not be there in the fileStatus object.
935+
* 1. restrictGpsOnOpenFile config is disabled AND fileStatus is null or not
936+
* an object of VersionedFileStatus: as eTag would not be there in the fileStatus object.
888937
* 2. fileStatus is an object of VersionedFileStatus and the object doesn't
889938
* have encryptionContext field when client's encryptionType is
890939
* ENCRYPTION_CONTEXT.
@@ -908,19 +957,23 @@ public AbfsInputStream openFileForRead(Path path,
908957
getClient().getEncryptionContextProvider(), getRelativePath(path),
909958
encryptionContext.getBytes(StandardCharsets.UTF_8));
910959
}
911-
} else {
960+
}
961+
/*
962+
* If file created with ENCRYPTION_CONTEXT, irrespective of whether isRestrictGpsOnOpenFile config is enabled or not,
963+
* GetPathStatus API has to be called to get the encryptionContext from the response header
964+
*/
965+
else if (getClient().getEncryptionType() == EncryptionType.ENCRYPTION_CONTEXT
966+
|| !getAbfsConfiguration().shouldRestrictGpsOnOpenFile()) {
967+
912968
AbfsHttpOperation op = getClient().getPathStatus(relativePath, false,
913-
tracingContext, null).getResult();
914-
resourceType = getClient().checkIsDir(op) ? DIRECTORY : FILE;
915-
contentLength = extractContentLength(op);
916-
eTag = op.getResponseHeader(HttpHeaderConfigurations.ETAG);
969+
tracingContext, null).getResult();
917970
/*
918971
* For file created with ENCRYPTION_CONTEXT, client shall receive
919972
* encryptionContext from header field: X_MS_ENCRYPTION_CONTEXT.
920973
*/
921974
if (getClient().getEncryptionType() == EncryptionType.ENCRYPTION_CONTEXT) {
922975
final String fileEncryptionContext = op.getResponseHeader(
923-
HttpHeaderConfigurations.X_MS_ENCRYPTION_CONTEXT);
976+
HttpHeaderConfigurations.X_MS_ENCRYPTION_CONTEXT);
924977
if (fileEncryptionContext == null) {
925978
LOG.debug("EncryptionContext missing in GetPathStatus response");
926979
throw new PathIOException(path.toString(),
@@ -930,14 +983,20 @@ public AbfsInputStream openFileForRead(Path path,
930983
getClient().getEncryptionContextProvider(), getRelativePath(path),
931984
fileEncryptionContext.getBytes(StandardCharsets.UTF_8));
932985
}
986+
resourceType = getClient().checkIsDir(op) ? DIRECTORY : FILE;
987+
contentLength = extractContentLength(op);
988+
eTag = op.getResponseHeader(HttpHeaderConfigurations.ETAG);
989+
}
990+
/* The only remaining case is:
991+
* - restrictGpsOnOpenFile config is enabled with null/wrong FileStatus and encryptionType not as ENCRYPTION_CONTEXT
992+
* In this case, we don't need to call GetPathStatus API.
993+
*/
994+
else {
995+
// do nothing
933996
}
934997

935998
if (parseIsDirectory(resourceType)) {
936-
throw new AbfsRestOperationException(
937-
AzureServiceErrorCode.PATH_NOT_FOUND.getStatusCode(),
938-
AzureServiceErrorCode.PATH_NOT_FOUND.getErrorCode(),
939-
"openFileForRead must be used with files and not directories",
940-
null);
999+
throw openFileForReadDirectoryException();
9411000
}
9421001

9431002
perfInfo.registerSuccess(true);
@@ -1003,6 +1062,7 @@ AZURE_FOOTER_READ_BUFFER_SIZE, getAbfsConfiguration().getFooterReadBufferSize())
10031062
.withStreamStatistics(new AbfsInputStreamStatisticsImpl())
10041063
.withShouldReadBufferSizeAlways(getAbfsConfiguration().shouldReadBufferSizeAlways())
10051064
.withReadAheadBlockSize(getAbfsConfiguration().getReadAheadBlockSize())
1065+
.shouldRestrictGpsOnOpenFile(getAbfsConfiguration().shouldRestrictGpsOnOpenFile())
10061066
.withBufferedPreadDisabled(bufferedPreadDisabled)
10071067
.withEncryptionAdapter(contextEncryptionAdapter)
10081068
.withAbfsBackRef(fsBackRef)
@@ -1855,7 +1915,7 @@ private AbfsClientContext populateAbfsClientContext() {
18551915
.build();
18561916
}
18571917

1858-
public String getRelativePath(final Path path) {
1918+
public static String getRelativePath(final Path path) {
18591919
Preconditions.checkNotNull(path, "path");
18601920
String relPath = path.toUri().getPath();
18611921
if (relPath.isEmpty()) {

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/constants/AbfsHttpConstants.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ public final class AbfsHttpConstants {
175175
* @see "https://learn.microsoft.com/en-us/azure/active-directory/managed-identities-azure-resources/how-to-use-vm-token#error-handling"
176176
*/
177177
public static final int HTTP_TOO_MANY_REQUESTS = 429;
178+
public static final int HTTP_INVALID_RANGE = 416;
178179

179180
public static final char CHAR_FORWARD_SLASH = '/';
180181
public static final char CHAR_EXCLAMATION_POINT = '!';

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/constants/ConfigurationKeys.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -627,5 +627,11 @@ public static String containerProperty(String property, String fsName, String ac
627627
*/
628628
public static final String FS_AZURE_TAIL_LATENCY_MAX_RETRY_COUNT = "fs.azure.tail.latency.max.retry.count";
629629

630+
/**
631+
* If true, restricts GPS (getPathStatus) calls on openFileforRead
632+
* Default: false
633+
*/
634+
public static final String FS_AZURE_RESTRICT_GPS_ON_OPENFILE = "fs.azure.restrict.gps.on.openfile";
635+
630636
private ConfigurationKeys() {}
631637
}

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/constants/FileSystemConfigurations.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,7 @@ public final class FileSystemConfigurations {
306306
public static final int MIN_FS_AZURE_TAIL_LATENCY_ANALYSIS_WINDOW_GRANULARITY = 1;
307307
public static final int DEFAULT_FS_AZURE_TAIL_LATENCY_PERCENTILE_COMPUTATION_INTERVAL_MILLIS = 500;
308308
public static final int DEFAULT_FS_AZURE_TAIL_LATENCY_MAX_RETRY_COUNT = 1;
309+
public static final boolean DEFAULT_FS_AZURE_RESTRICT_GPS_ON_OPENFILE = false;
309310

310311
private FileSystemConfigurations() {}
311312
}

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/constants/HttpHeaderConfigurations.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ public final class HttpHeaderConfigurations {
3939
public static final String CONTENT_MD5 = "Content-MD5";
4040
public static final String CONTENT_TYPE = "Content-Type";
4141
public static final String RANGE = "Range";
42+
public static final String CONTENT_RANGE = "Content-Range";
4243
public static final String TRANSFER_ENCODING = "Transfer-Encoding";
4344
public static final String USER_AGENT = "User-Agent";
4445
public static final String X_HTTP_METHOD_OVERRIDE = "X-HTTP-Method-Override";

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/contracts/services/AzureServiceErrorCode.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import java.net.HttpURLConnection;
2222
import java.util.ArrayList;
2323
import java.util.List;
24+
25+
import org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants;
2426
import org.slf4j.Logger;
2527
import org.slf4j.LoggerFactory;
2628
import org.apache.hadoop.classification.InterfaceAudience;
@@ -66,6 +68,8 @@ public enum AzureServiceErrorCode {
6668
INVALID_APPEND_OPERATION("InvalidAppendOperation", HttpURLConnection.HTTP_CONFLICT, null),
6769
UNAUTHORIZED_BLOB_OVERWRITE("UnauthorizedBlobOverwrite", HttpURLConnection.HTTP_FORBIDDEN,
6870
"This request is not authorized to perform blob overwrites."),
71+
INVALID_RANGE("InvalidRange", AbfsHttpConstants.HTTP_INVALID_RANGE,
72+
"The range specified is invalid for the current size of the resource."),
6973
UNKNOWN(null, -1, null);
7074

7175
private final String errorCode;

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/AbfsAdaptiveInputStream.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ protected int readOneBlock(final byte[] b, final int off, final int len) throws
6868
// If buffer is empty, then fill the buffer.
6969
if (getBCursor() == getLimit()) {
7070
// If EOF, then return -1
71-
if (getFCursor() >= getContentLength()) {
71+
if (!(shouldRestrictGpsOnOpenFile() && isFirstRead()) && getFCursor() >= getContentLength()) {
7272
return -1;
7373
}
7474

@@ -83,7 +83,14 @@ protected int readOneBlock(final byte[] b, final int off, final int len) throws
8383

8484
// Reset Read Type back to normal and set again based on code flow.
8585
getTracingContext().setReadType(ReadType.NORMAL_READ);
86-
if (shouldAlwaysReadBufferSize()) {
86+
87+
// If restrictGpsOnOpenFile config is enabled, skip prefetch for the first read since contentLength
88+
// is not available yet.
89+
if (shouldRestrictGpsOnOpenFile() && isFirstRead()) {
90+
LOG.debug("RestrictGpsOnOpenFile is enabled. Skip readahead for first read.");
91+
bytesRead = readInternal(getFCursor(), getBuffer(), 0, getBufferSize(), true);
92+
}
93+
else if (shouldAlwaysReadBufferSize()) {
8794
bytesRead = readInternal(getFCursor(), getBuffer(), 0, getBufferSize(), false);
8895
} else {
8996
// Enable readAhead when reading sequentially

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/AbfsDfsClient.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1347,7 +1347,7 @@ public AbfsRestOperation checkAccess(String path,
13471347
public boolean checkIsDir(AbfsHttpOperation result) {
13481348
String resourceType = result.getResponseHeader(
13491349
HttpHeaderConfigurations.X_MS_RESOURCE_TYPE);
1350-
return StringUtils.equalsIgnoreCase(resourceType, DIRECTORY);
1350+
return resourceType != null && StringUtils.equalsIgnoreCase(resourceType, DIRECTORY);
13511351
}
13521352

13531353
/**

0 commit comments

Comments
 (0)