Skip to content

Commit c68a942

Browse files
committed
[PLAT-20717]Collect File API does not show failed nodes when they are unreachable
Summary: Add a connectivity check before any of the file-collections/run-script APIs. Add a runtime config to configure this timeout Test Plan: ``` curl -X POST "http://localhost:9000/api/v2/customers/<>/universes/<>/run-script" -H "Content-Type: application/json" -H "X-AUTH-YW-API-TOKEN: <>" -d '{ > "script_options": { > "script_content": "echo TEST_INLINE; echo COUNT=$#; echo ARG1=$1; echo ARG2=$2; echo ALL=$@", > "params": ["hello", "world"], > "timeout_secs": 30 > } > }' ``` ``` {"summary":{"total_nodes":3,"successful_nodes":2,"failed_nodes":1,"total_execution_time_ms":27667,"all_succeeded":false},"results":{"yb-admin-dkumar-1-n1":{"node_name":"yb-admin-dkumar-1-n1","node_address":"<>","exit_code":0,"stdout":"Command output:TEST_INLINE\nCOUNT=2\nARG1=hello\nARG2=world\nALL=hello world\n","execution_time_ms":127,"success":true},"yb-admin-dkumar-1-n2":{"node_name":"yb-admin-dkumar-1-n2","node_address":"<>","exit_code":-1,"stdout":"","execution_time_ms":27665,"success":false,"error_message":"Node is unreachable"},"yb-admin-dkumar-1-n3":{"node_name":"yb-admin-dkumar-1-n3","node_address":"<>","exit_code":0,"stdout":"Command output:TEST_INLINE\nCOUNT=2\nARG1=hello\nARG2=world\nALL=hello world\n","execution_time_ms":133,"success":true}}} ``` Reviewers: rajiv.kumar, nsingh, anabaria Reviewed By: anabaria Subscribers: yugaware Differential Revision: https://phorge.dev.yugabyte.com/D53021
1 parent fc30f53 commit c68a942

7 files changed

Lines changed: 77 additions & 0 deletions

File tree

managed/RUNTIME-FLAGS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@
135135
| "OIDC feature enhancements" | "yb.security.oidc_feature_enhancements" | "GLOBAL" | "Enables the OIDC enhancements such as auth_token retrieval, user registration in YBA on login, etc." | "Boolean" |
136136
| "Devops command timeout" | "yb.devops.command_timeout" | "GLOBAL" | "Devops command timeout" | "Duration" |
137137
| "Node destroy command timeout" | "yb.node_ops.destroy_server_timeout" | "GLOBAL" | "Timeout for node destroy command before failing." | "Duration" |
138+
| "Node reachability check timeout" | "yb.node_script.reachability_check_timeout_sec" | "GLOBAL" | "Timeout in seconds for the pre-check that determines whether a node is reachable for per-node platform APIs such as run-script and file-collections. Unreachable nodes are reported as failed nodes rather than producing per-file/per-command errors." | "Integer" |
138139
| "YBC Compatible DB Version" | "ybc.compatible_db_version" | "GLOBAL" | "Minimum YBDB version which supports YBC" | "String" |
139140
| "Force YBC Shutdown during upgrade" | "ybc.upgrade.force_shutdown" | "GLOBAL" | "For YBC Shutdown during upgrade" | "Boolean" |
140141
| "Enable strict mode to ignore deprecated YBA APIs" | "yb.api.mode.strict" | "GLOBAL" | "Will ignore deprecated APIs" | "Boolean" |

managed/src/main/java/com/yugabyte/yw/common/FileCollectionDownloader.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,16 @@ private boolean cleanupNodeTarFile(Universe universe, NodeDetails node, String r
427427
return false;
428428
}
429429

430+
// Fail fast on unreachable nodes; otherwise rm-over-SSH will burn the full cleanup timeout
431+
// and surface as a generic "failed to clean up" instead of "node unreachable".
432+
if (!nodeUniverseManager.isNodeReachable(node, universe)) {
433+
log.warn(
434+
"Node {} is unreachable, skipping remote tar cleanup at {}",
435+
node.nodeName,
436+
remoteTarPath);
437+
return false;
438+
}
439+
430440
nodeUniverseManager.runCommand(
431441
node, universe, List.of("rm", "-f", remoteTarPath), ShellProcessContext.DEFAULT);
432442
log.info("Cleaned up remote tar file {} on node {}", remoteTarPath, node.nodeName);
@@ -505,6 +515,18 @@ private NodeDownloadResult downloadFromNode(
505515
.build();
506516
}
507517

518+
// Fail fast on unreachable nodes; otherwise the per-node probes below swallow the
519+
// connection error and surface as a misleading "Remote tar file not found".
520+
if (!nodeUniverseManager.isNodeReachable(node, universe)) {
521+
return NodeDownloadResult.builder()
522+
.nodeName(node.nodeName)
523+
.nodeAddress(node.cloudInfo.private_ip)
524+
.success(false)
525+
.errorMessage("Node is unreachable")
526+
.downloadTimeMs(System.currentTimeMillis() - startTime)
527+
.build();
528+
}
529+
508530
// Check if remote file exists
509531
if (!nodeUniverseManager.checkNodeIfFileExists(node, universe, remoteTarPath)) {
510532
return NodeDownloadResult.builder()

managed/src/main/java/com/yugabyte/yw/common/NodeFileCollector.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,24 @@ private NodeResult collectFromNode(Universe universe, NodeDetails node, Collecti
247247
int failed = 0;
248248
long bytesCollected = 0;
249249

250+
// Fail fast on unreachable nodes; otherwise per-file probes swallow the
251+
// connection error and the node is mis-reported as successful with file-level failures.
252+
if (!nodeUniverseManager.isNodeReachable(node, universe)) {
253+
long executionTime = System.currentTimeMillis() - nodeStartTime;
254+
return NodeResult.builder()
255+
.nodeName(node.nodeName)
256+
.nodeAddress(node.cloudInfo.private_ip)
257+
.success(false)
258+
.filesCollected(0)
259+
.filesSkipped(0)
260+
.filesFailed(0)
261+
.totalBytesCollected(0)
262+
.executionTimeMs(executionTime)
263+
.errorMessage("Node is unreachable")
264+
.files(new ArrayList<>())
265+
.build();
266+
}
267+
250268
try {
251269
// Collect all file paths to tar
252270
List<String> filesToCollect = new ArrayList<>();

managed/src/main/java/com/yugabyte/yw/common/NodeScriptRunner.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,19 @@ public ExecutionResult runScript(
178178
private NodeResult executeOnNode(Universe universe, NodeDetails node, ScriptParams scriptParams) {
179179
long nodeStartTime = System.currentTimeMillis();
180180

181+
if (!nodeUniverseManager.isNodeReachable(node, universe)) {
182+
long executionTime = System.currentTimeMillis() - nodeStartTime;
183+
return NodeResult.builder()
184+
.nodeName(node.nodeName)
185+
.nodeAddress(node.cloudInfo.private_ip)
186+
.exitCode(-1)
187+
.stdout("")
188+
.errorMessage("Node is unreachable")
189+
.executionTimeMs(executionTime)
190+
.success(false)
191+
.build();
192+
}
193+
181194
try {
182195
ShellProcessContext.ShellProcessContextBuilder contextBuilder =
183196
ShellProcessContext.builder()

managed/src/main/java/com/yugabyte/yw/common/NodeUniverseManager.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,17 @@ public boolean isNodeReachable(NodeDetails node, Universe universe, long timeout
801801
return false;
802802
}
803803

804+
/**
805+
* Reachability pre-check for per-node platform APIs (run-script, file-collections, ...) that uses
806+
* the shared {@code yb.node_script.reachability_check_timeout_sec} runtime flag so a single knob
807+
* controls the timeout across all callers.
808+
*/
809+
public boolean isNodeReachable(NodeDetails node, Universe universe) {
810+
long timeoutSecs =
811+
confGetter.getGlobalConf(GlobalConfKeys.nodeScriptReachabilityCheckTimeoutSec);
812+
return isNodeReachable(node, universe, timeoutSecs);
813+
}
814+
804815
/**
805816
* Gets a map of all the absolute file paths to sizes at a given remote directory
806817
*

managed/src/main/java/com/yugabyte/yw/common/config/GlobalConfKeys.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1191,6 +1191,17 @@ public class GlobalConfKeys extends RuntimeConfigKeysModule {
11911191
"Timeout for node destroy command before failing.",
11921192
ConfDataType.DurationType,
11931193
ImmutableList.of(ConfKeyTags.PUBLIC));
1194+
public static final ConfKeyInfo<Integer> nodeScriptReachabilityCheckTimeoutSec =
1195+
new ConfKeyInfo<>(
1196+
"yb.node_script.reachability_check_timeout_sec",
1197+
ScopeType.GLOBAL,
1198+
"Node reachability check timeout",
1199+
"Timeout in seconds for the pre-check that determines whether a node is reachable for"
1200+
+ " per-node platform APIs such as run-script and file-collections. Unreachable"
1201+
+ " nodes are reported as failed nodes rather than producing per-file/per-command"
1202+
+ " errors.",
1203+
ConfDataType.IntegerType,
1204+
ImmutableList.of(ConfKeyTags.PUBLIC));
11941205
public static final ConfKeyInfo<String> ybcCompatibleDbVersion =
11951206
new ConfKeyInfo<>(
11961207
"ybc.compatible_db_version",

managed/src/main/resources/reference.conf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1556,6 +1556,7 @@ yb {
15561556

15571557
node_script {
15581558
enabled = false
1559+
reachability_check_timeout_sec = 30
15591560
}
15601561

15611562
gflags {

0 commit comments

Comments
 (0)