Commit 416198a

HDDS-11463. Add SCM RPC support for DataNode volume info reporting.

Parent: b968353

File tree: 9 files changed, +163 −1 lines

hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java

Lines changed: 9 additions & 0 deletions

@@ -32,6 +32,7 @@
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto;
+import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.GetVolumeInfosResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto;
 import org.apache.hadoop.hdds.scm.DatanodeAdminError;
 import org.apache.hadoop.hdds.scm.container.ContainerID;
@@ -476,4 +477,12 @@ DecommissionScmResponseProto decommissionScm(
    * @throws IOException On error
    */
   void reconcileContainer(long containerID) throws IOException;
+
+  /**
+   * Get volume information based on query conditions.
+   *
+   * @return Volume Information List.
+   * @throws IOException On error.
+   */
+  GetVolumeInfosResponseProto getVolumeInfos() throws IOException;
 }
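For context, a minimal sketch of how an admin-side caller might consume the new ScmClient method. This is not part of the commit: the way the ScmClient instance is obtained is assumed, and the accessors on the response come from the protobuf-generated bindings for the messages added later in this commit.

import java.io.IOException;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.VolumeInfoProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.GetVolumeInfosResponseProto;
import org.apache.hadoop.hdds.scm.client.ScmClient;

public final class VolumeInfoPrinter {
  private VolumeInfoPrinter() {
  }

  // Fetches the aggregated volume list from SCM and prints one line per volume.
  public static void print(ScmClient scmClient) throws IOException {
    GetVolumeInfosResponseProto response = scmClient.getVolumeInfos();
    for (VolumeInfoProto volume : response.getVolumeInfosList()) {
      System.out.printf("%s %s capacity=%d failed=%b%n",
          volume.getHostName(), volume.getVolumeName(),
          volume.getCapacity(), volume.getFailed());
    }
  }
}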

hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java

Lines changed: 10 additions & 0 deletions

@@ -31,6 +31,7 @@
 import org.apache.hadoop.hdds.protocol.DatanodeDetails;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo;
+import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto;
@@ -499,4 +500,13 @@ DecommissionScmResponseProto decommissionScm(
    * @throws IOException On error
    */
   void reconcileContainer(long containerID) throws IOException;
+
+  /**
+   * Retrieves volume information based on the specified query parameters.
+   *
+   * @return Volume Information List.
+   * @throws IOException
+   *     I/O exceptions that may occur during the process of querying the volume.
+   */
+  StorageContainerLocationProtocolProtos.GetVolumeInfosResponseProto getVolumeInfos() throws IOException;
 }

hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java

Lines changed: 14 additions & 0 deletions

@@ -86,6 +86,8 @@
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.GetPipelineResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.GetSafeModeRuleStatusesRequestProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.GetSafeModeRuleStatusesResponseProto;
+import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.GetVolumeInfosRequestProto;
+import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.GetVolumeInfosResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.InSafeModeRequestProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ListPipelineRequestProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ListPipelineResponseProto;
@@ -1238,4 +1240,16 @@ public void reconcileContainer(long containerID) throws IOException {
     // TODO check error handling.
     submitRequest(Type.ReconcileContainer, builder -> builder.setReconcileContainerRequest(request));
   }
+
+  @Override
+  public GetVolumeInfosResponseProto getVolumeInfos() throws IOException {
+    // Prepare parameters.
+    GetVolumeInfosRequestProto.Builder requestBuilder =
+        GetVolumeInfosRequestProto.newBuilder();
+    // Submit request.
+    GetVolumeInfosResponseProto response = submitRequest(Type.GetVolumeFailureInfos,
+        builder -> builder.setGetVolumeInfosRequest(requestBuilder.build())).
+        getGetVolumeInfosResponse();
+    return response;
+  }
 }

hadoop-hdds/interface-admin/src/main/proto/ScmAdminProtocol.proto

Lines changed: 10 additions & 0 deletions

@@ -86,6 +86,7 @@ message ScmContainerLocationRequest {
   optional GetMetricsRequestProto getMetricsRequest = 47;
   optional ContainerBalancerStatusInfoRequestProto containerBalancerStatusInfoRequest = 48;
   optional ReconcileContainerRequestProto reconcileContainerRequest = 49;
+  optional GetVolumeInfosRequestProto getVolumeInfosRequest = 50;
 }

 message ScmContainerLocationResponse {
@@ -143,6 +144,7 @@ message ScmContainerLocationResponse {
   optional GetMetricsResponseProto getMetricsResponse = 47;
   optional ContainerBalancerStatusInfoResponseProto containerBalancerStatusInfoResponse = 48;
   optional ReconcileContainerResponseProto reconcileContainerResponse = 49;
+  optional GetVolumeInfosResponseProto getVolumeInfosResponse = 50;

   enum Status {
     OK = 1;
@@ -199,6 +201,7 @@ enum Type {
   GetMetrics = 43;
   GetContainerBalancerStatusInfo = 44;
   ReconcileContainer = 45;
+  GetVolumeFailureInfos = 46;
 }

 /**
@@ -685,6 +688,13 @@ message ReconcileContainerRequestProto {
 message ReconcileContainerResponseProto {
 }

+message GetVolumeInfosRequestProto {
+}
+
+message GetVolumeInfosResponseProto {
+  repeated VolumeInfoProto volumeInfos = 1;
+}
+
 /**
  * Protocol used from an HDFS node to StorageContainerManager. See the request
  * and response messages for details of the RPC calls.

hadoop-hdds/interface-client/src/main/proto/hdds.proto

Lines changed: 8 additions & 0 deletions

@@ -304,6 +304,14 @@ message RemoveScmResponseProto {
   optional string scmId = 2;
 }

+message VolumeInfoProto {
+  optional DatanodeIDProto dataNodeId = 1;
+  optional string hostName = 2;
+  optional string volumeName = 3;
+  optional bool failed = 4;
+  optional int64 capacity = 5;
+}
+
 enum ReplicationType {
   RATIS = 1;
   STAND_ALONE = 2;
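To make the new wire format concrete, here is a hedged sketch (not part of the commit) of how the protobuf-generated Java bindings for these messages would be used; the host name and volume path are hypothetical, and the optional dataNodeId field is left unset.

import org.apache.hadoop.hdds.protocol.proto.HddsProtos.VolumeInfoProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.GetVolumeInfosResponseProto;

public final class VolumeInfoProtoSample {
  private VolumeInfoProtoSample() {
  }

  // Builds a response carrying a single volume entry, as the SCM would return on the wire.
  public static GetVolumeInfosResponseProto sampleResponse() {
    VolumeInfoProto volume = VolumeInfoProto.newBuilder()
        .setHostName("dn-1.example.com")       // hypothetical datanode host
        .setVolumeName("/data/hdds/volume1")   // hypothetical storage location
        .setCapacity(4L * 1024 * 1024 * 1024 * 1024)
        .setFailed(false)
        .build();
    return GetVolumeInfosResponseProto.newBuilder()
        .addVolumeInfos(volume)
        .build();
  }
}

Every field in VolumeInfoProto is optional, which keeps the message easy to extend with further per-volume attributes later.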

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeInfo.java

Lines changed: 45 additions & 1 deletion

@@ -20,13 +20,15 @@
 import static org.apache.hadoop.ozone.container.upgrade.UpgradeUtils.toLayoutVersionProto;

 import com.google.common.annotations.VisibleForTesting;
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.locks.ReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.VolumeInfoProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.CommandQueueReportProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.LayoutVersionProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.MetadataStorageReportProto;
@@ -49,7 +51,7 @@ public class DatanodeInfo extends DatanodeDetails {
   private volatile long lastHeartbeatTime;
   private long lastStatsUpdatedTime;
   private int failedVolumeCount;
-
+  private List<VolumeInfoProto> volumeInfos;
   private List<StorageReportProto> storageReports;
   private List<MetadataStorageReportProto> metadataStorageReports;
   private LayoutVersionProto lastKnownLayoutVersion;
@@ -72,6 +74,7 @@ public DatanodeInfo(DatanodeDetails datanodeDetails, NodeStatus nodeStatus,
         layoutInfo != null ? layoutInfo.getMetadataLayoutVersion() : 0,
         layoutInfo != null ? layoutInfo.getSoftwareLayoutVersion() : 0);
     this.storageReports = Collections.emptyList();
+    this.volumeInfos = Collections.emptyList();
     this.nodeStatus = nodeStatus;
     this.metadataStorageReports = Collections.emptyList();
     this.commandCounts = new HashMap<>();
@@ -155,16 +158,57 @@ public void updateStorageReports(List<StorageReportProto> reports) {
         .filter(e -> e.hasFailed() && e.getFailed())
         .count();

+    // We choose to update the status of failed disks during the heartbeat,
+    // so we can directly retrieve it when querying.
+    List<VolumeInfoProto> volumeInfoLists = new ArrayList<>();
+    for (StorageReportProto report : reports) {
+
+      String storageLocation = report.getStorageLocation();
+      long capacity = report.getCapacity();
+
+      String hostName = getHostName();
+
+      boolean failed = false;
+      if (report.hasFailed() && report.getFailed()) {
+        failed = true;
+      }
+
+      VolumeInfoProto volumeFailure =
+          VolumeInfoProto.newBuilder().
+          setDataNodeId(getID().toProto()).
+          setHostName(hostName).
+          setVolumeName(storageLocation).
+          setCapacity(capacity).
+          setFailed(failed).
+          build();
+      volumeInfoLists.add(volumeFailure);
+    }
+
     try {
       lock.writeLock().lock();
       lastStatsUpdatedTime = Time.monotonicNow();
       failedVolumeCount = failedCount;
       storageReports = reports;
+      volumeInfos = Collections.unmodifiableList(volumeInfoLists);
     } finally {
       lock.writeLock().unlock();
     }
   }

+  /**
+   * Get all volume information.
+   *
+   * @return VolumeInfo List.
+   */
+  public List<VolumeInfoProto> getVolumeInfos() {
+    try {
+      lock.readLock().lock();
+      return volumeInfos;
+    } finally {
+      lock.readLock().unlock();
+    }
+  }
+
   /**
    * Updates the datanode metadata storage reports.
    *
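A hedged sketch of how a server-side caller might read the snapshot that updateStorageReports(...) now caches. The DatanodeInfo instance is assumed to come from the SCM's node manager, and the snapshot is only as fresh as the datanode's most recent heartbeat storage report.

import java.util.List;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.VolumeInfoProto;
import org.apache.hadoop.hdds.scm.node.DatanodeInfo;

public final class FailedVolumeChecker {
  private FailedVolumeChecker() {
  }

  // Returns true if the datanode reported at least one failed volume
  // in its most recent storage report.
  public static boolean hasFailedVolume(DatanodeInfo datanodeInfo) {
    List<VolumeInfoProto> volumes = datanodeInfo.getVolumeInfos();
    return volumes.stream().anyMatch(VolumeInfoProto::getFailed);
  }
}

Because the list is rebuilt outside the write lock and published as an unmodifiable list under it, readers get a consistent snapshot without having to hold the lock while iterating.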

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java

Lines changed: 26 additions & 0 deletions

@@ -93,6 +93,8 @@
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.GetPipelineResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.GetSafeModeRuleStatusesRequestProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.GetSafeModeRuleStatusesResponseProto;
+import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.GetVolumeInfosRequestProto;
+import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.GetVolumeInfosResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.InSafeModeRequestProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.InSafeModeResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ListPipelineRequestProto;
@@ -739,6 +741,14 @@ public ScmContainerLocationResponse processRequest(
           .setStatus(Status.OK)
           .setReconcileContainerResponse(reconcileContainer(request.getReconcileContainerRequest()))
           .build();
+    case GetVolumeFailureInfos:
+      GetVolumeInfosRequestProto getVolumeInfosRequest = request.getGetVolumeInfosRequest();
+      GetVolumeInfosResponseProto getVolumeInfosResponse = getVolumeInfos(getVolumeInfosRequest);
+      return ScmContainerLocationResponse.newBuilder()
+          .setCmdType(request.getCmdType())
+          .setStatus(Status.OK)
+          .setGetVolumeInfosResponse(getVolumeInfosResponse)
+          .build();
     default:
       throw new IllegalArgumentException(
           "Unknown command type: " + request.getCmdType());
@@ -1359,6 +1369,22 @@ public GetMetricsResponseProto getMetrics(GetMetricsRequestProto request) throws
     return GetMetricsResponseProto.newBuilder().setMetricsJson(impl.getMetrics(request.getQuery())).build();
   }

+  /**
+   * Get volume information based on query conditions.
+   *
+   * @param request The request object containing the parameters to
+   *                fetch volume information (GetVolumeInfosRequestProto).
+   * @return A response object containing the volume information
+   *         (GetVolumeInfosResponseProto).
+   * @throws IOException
+   *     If an input/output exception occurs while processing the request.
+   */
+  public GetVolumeInfosResponseProto getVolumeInfos(
+      GetVolumeInfosRequestProto request) throws IOException {
+    // Invoke the server implementation and return its result.
+    return impl.getVolumeInfos();
+  }
+
   public ReconcileContainerResponseProto reconcileContainer(ReconcileContainerRequestProto request) throws IOException {
     impl.reconcileContainer(request.getContainerID());
     return ReconcileContainerResponseProto.getDefaultInstance();
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java

Lines changed: 35 additions & 0 deletions

@@ -49,6 +49,7 @@
 import java.util.UUID;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
+import org.apache.commons.collections.CollectionUtils;
 import org.apache.commons.lang3.tuple.Pair;
 import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
 import org.apache.hadoop.hdds.client.ReplicationConfig;
@@ -58,11 +59,13 @@
 import org.apache.hadoop.hdds.protocol.DatanodeID;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.VolumeInfoProto;
 import org.apache.hadoop.hdds.protocol.proto.ReconfigureProtocolProtos.ReconfigureProtocolService;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto.Builder;
+import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.GetVolumeInfosResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto;
 import org.apache.hadoop.hdds.protocolPB.ReconfigureProtocolPB;
 import org.apache.hadoop.hdds.protocolPB.ReconfigureProtocolServerSideTranslatorPB;
@@ -90,7 +93,9 @@
 import org.apache.hadoop.hdds.scm.exceptions.SCMException.ResultCodes;
 import org.apache.hadoop.hdds.scm.ha.SCMRatisServer;
 import org.apache.hadoop.hdds.scm.ha.SCMRatisServerImpl;
+import org.apache.hadoop.hdds.scm.node.DatanodeInfo;
 import org.apache.hadoop.hdds.scm.node.DatanodeUsageInfo;
+import org.apache.hadoop.hdds.scm.node.NodeManager;
 import org.apache.hadoop.hdds.scm.node.NodeStatus;
 import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
 import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
@@ -1679,4 +1684,34 @@ public void reconcileContainer(long longContainerID) throws IOException {
       throw ex;
     }
   }
+
+  @Override
+  public StorageContainerLocationProtocolProtos.GetVolumeInfosResponseProto getVolumeInfos() throws IOException {
+    GetVolumeInfosResponseProto.Builder getVolumeInfosResponseBuilder =
+        GetVolumeInfosResponseProto.newBuilder();
+    NodeManager scmNodeManager = scm.getScmNodeManager();
+    List<? extends DatanodeDetails> allNodes = scmNodeManager.getAllNodes();
+    // If the node list is empty, return an empty response directly.
+    if (CollectionUtils.isEmpty(allNodes)) {
+      return getVolumeInfosResponseBuilder.build();
+    }
+    // Convert the nodes' cached storage reports into a list of VolumeInfoProto.
+    List<VolumeInfoProto> volumeInfos = convertToVolumeInfos(allNodes);
+    if (CollectionUtils.isNotEmpty(volumeInfos)) {
+      getVolumeInfosResponseBuilder.addAllVolumeInfos(volumeInfos);
+    }
+    return getVolumeInfosResponseBuilder.build();
+  }
+
+  private List<VolumeInfoProto> convertToVolumeInfos(List<? extends DatanodeDetails> allNodes) {
+    List<VolumeInfoProto> result = new ArrayList<>();
+    for (DatanodeDetails datanode : allNodes) {
+      DatanodeInfo detail = (DatanodeInfo) datanode;
+      List<VolumeInfoProto> volumeInfos = detail.getVolumeInfos();
+      if (CollectionUtils.isNotEmpty(volumeInfos)) {
+        result.addAll(volumeInfos);
+      }
+    }
+    return result;
+  }
 }
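Since the SCM returns one flat list across all datanodes, a consumer will typically regroup it. A small sketch (not part of the commit, using only the protobuf-generated accessors for the new response message) that counts failed volumes per host:

import java.util.Map;
import java.util.stream.Collectors;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.VolumeInfoProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.GetVolumeInfosResponseProto;

public final class FailedVolumeSummary {
  private FailedVolumeSummary() {
  }

  // Maps host name -> number of volumes reported as failed.
  public static Map<String, Long> failedVolumesPerHost(GetVolumeInfosResponseProto response) {
    return response.getVolumeInfosList().stream()
        .filter(VolumeInfoProto::getFailed)
        .collect(Collectors.groupingBy(VolumeInfoProto::getHostName, Collectors.counting()));
  }
}

Note that getVolumeInfos() above casts each DatanodeDetails to DatanodeInfo; this relies on the node manager tracking registered nodes as DatanodeInfo instances, which is what makes the cached per-volume snapshot available to the SCM client protocol server.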

hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java

Lines changed: 6 additions & 0 deletions

@@ -40,6 +40,7 @@
 import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ReadContainerResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo;
+import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto;
@@ -607,4 +608,9 @@ public String getMetrics(String query) throws IOException {
   public void reconcileContainer(long id) throws IOException {
     storageContainerLocationClient.reconcileContainer(id);
   }
+
+  @Override
+  public StorageContainerLocationProtocolProtos.GetVolumeInfosResponseProto getVolumeInfos() throws IOException {
+    return storageContainerLocationClient.getVolumeInfos();
+  }
 }
