@@ -101,8 +101,12 @@
 import io.fabric8.kubernetes.api.model.DeletionPropagation;
 import io.fabric8.kubernetes.api.model.ObjectMeta;
 import io.fabric8.kubernetes.api.model.PodList;
+import io.fabric8.kubernetes.api.model.apps.Deployment;
 import io.fabric8.kubernetes.client.KubernetesClient;
-import io.fabric8.kubernetes.client.KubernetesClientTimeoutException;
+import io.fabric8.kubernetes.client.KubernetesClientException;
+import io.fabric8.kubernetes.client.dsl.Resource;
+import io.fabric8.kubernetes.client.dsl.Waitable;
+import lombok.SneakyThrows;
 import org.apache.commons.lang3.ObjectUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -112,13 +116,15 @@
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.net.HttpURLConnection;
 import java.net.InetSocketAddress;
 import java.net.MalformedURLException;
 import java.net.Socket;
 import java.net.SocketAddress;
 import java.net.URL;
 import java.nio.file.Files;
 import java.nio.file.Paths;
+import java.time.Duration;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
@@ -127,6 +133,7 @@
 import java.util.Map;
 import java.util.Objects;
 import java.util.Optional;
+import java.util.concurrent.Callable;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.TimeUnit;
@@ -399,7 +406,6 @@ protected void cancelJob(
         // Unless we leave the jm around after savepoint, we should wait until it has finished
         // shutting down
         if (deleteClusterAfterSavepoint || upgradeMode != UpgradeMode.SAVEPOINT) {
-            waitForClusterShutdown(conf);
             deploymentStatus.setJobManagerDeploymentStatus(JobManagerDeploymentStatus.MISSING);
         }
     }
@@ -755,14 +761,6 @@ public PodList getJmPodList(FlinkDeployment deployment, Configuration conf) {
         return getJmPodList(namespace, clusterId);
     }

-    @Override
-    public void waitForClusterShutdown(Configuration conf) {
-        waitForClusterShutdown(
-                conf.getString(KubernetesConfigOptions.NAMESPACE),
-                conf.getString(KubernetesConfigOptions.CLUSTER_ID),
-                operatorConfig.getFlinkShutdownClusterTimeout().toSeconds());
-    }
-
     @Override
     public RestClusterClient<String> getClusterClient(Configuration conf) throws Exception {
         final String clusterId = conf.get(KubernetesConfigOptions.CLUSTER_ID);
@@ -899,50 +897,20 @@ public JobDetailsInfo getJobDetailsInfo(JobID jobID, Configuration conf) throws
         }
     }

-    /** Returns a list of Kubernetes Deployment names for given cluster. */
-    protected abstract List<String> getDeploymentNames(String namespace, String clusterId);
-
-    /** Wait until the FLink cluster has completely shut down. */
-    protected void waitForClusterShutdown(
-            String namespace, String clusterId, long shutdownTimeout) {
-        long timeoutAt = System.currentTimeMillis() + shutdownTimeout * 1000;
-        LOG.info("Waiting {} seconds for cluster shutdown...", shutdownTimeout);
-
-        for (var deploymentName : getDeploymentNames(namespace, clusterId)) {
-            long deploymentTimeout = timeoutAt - System.currentTimeMillis();
-
-            if (!waitForDeploymentToBeRemoved(namespace, deploymentName, deploymentTimeout)) {
-                LOG.error(
-                        "Failed to shut down cluster {} (deployment {}) in {} seconds, proceeding...",
-                        clusterId,
-                        deploymentName,
-                        shutdownTimeout);
-                return;
-            }
-        }
-    }
-
-    /** Wait until Deployment is removed, return false if timed out, otherwise return true. */
+    /** Delete the Deployment and wait until it is removed, return the remaining timeout. */
     @VisibleForTesting
-    boolean waitForDeploymentToBeRemoved(String namespace, String deploymentName, long timeout) {
-        LOG.info(
-                "Waiting for Deployment {} to shut down with {} seconds timeout...",
-                deploymentName,
-                timeout / 1000);
-
-        try {
-            kubernetesClient
-                    .apps()
-                    .deployments()
-                    .inNamespace(namespace)
-                    .withName(deploymentName)
-                    .waitUntilCondition(Objects::isNull, timeout, TimeUnit.MILLISECONDS);
-
-            LOG.info("Deployment {} successfully shut down", deploymentName);
-        } catch (KubernetesClientTimeoutException e) {
-            return false;
-        }
-        return true;
+    protected Duration deleteDeploymentBlocking(
+            String name,
+            Resource<Deployment> deployment,
+            DeletionPropagation propagation,
+            Duration timeout) {
+        return deleteBlocking(
+                String.format("Deleting %s Deployment", name),
+                () -> {
+                    deployment.withPropagationPolicy(propagation).delete();
+                    return deployment;
+                },
+                timeout);
     }

     private static List<JobStatusMessage> toJobStatusMessage(
@@ -1050,33 +1018,35 @@ public final void deleteClusterDeployment(
             Configuration conf,
             boolean deleteHaData) {

+        var namespace = meta.getNamespace();
+        var clusterId = meta.getName();
+
         var deletionPropagation = operatorConfig.getDeletionPropagation();
         LOG.info("Deleting cluster with {} propagation", deletionPropagation);
-        deleteClusterInternal(meta, conf, deleteHaData, deletionPropagation);
+        deleteClusterInternal(namespace, clusterId, conf, deletionPropagation);
+        if (deleteHaData) {
+            deleteHAData(namespace, clusterId, conf);
+        } else {
+            LOG.info("Keeping HA metadata for last-state restore");
+        }
         updateStatusAfterClusterDeletion(status);
     }

     /**
-     * Delete Flink kubernetes cluster by deleting the kubernetes resources directly. Optionally
-     * allows deleting the native kubernetes HA resources as well.
+     * Delete Flink kubernetes cluster by deleting the kubernetes resources directly.
      *
-     * @param meta ObjectMeta of the deployment
+     * @param namespace Namespace
+     * @param clusterId ClusterId
      * @param conf Configuration of the Flink application
-     * @param deleteHaData Flag to indicate whether k8s or Zookeeper HA metadata should be removed
-     *     as well
      * @param deletionPropagation Resource deletion propagation policy
      */
     protected abstract void deleteClusterInternal(
-            ObjectMeta meta,
+            String namespace,
+            String clusterId,
             Configuration conf,
-            boolean deleteHaData,
             DeletionPropagation deletionPropagation);

     protected void deleteHAData(String namespace, String clusterId, Configuration conf) {
-        // We need to wait for cluster shutdown otherwise HA data might be recreated
-        waitForClusterShutdown(
-                namespace, clusterId, operatorConfig.getFlinkShutdownClusterTimeout().toSeconds());
-
         if (FlinkUtils.isKubernetesHAActivated(conf)) {
             LOG.info("Deleting Kubernetes HA metadata");
             FlinkUtils.deleteKubernetesHAMetadata(clusterId, namespace, kubernetesClient);
@@ -1134,4 +1104,48 @@ private Configuration getOperatorRestConfig(Configuration origConfig) throws IOE
                 });
         return conf;
     }
+
+    /**
+     * Generic blocking delete operation implementation for triggering and waiting for removal of
+     * the selected resources. Returning the remaining timeout makes it easy to chain multiple
+     * delete operations under a single timeout setting.
+     *
+     * @param operation Name of the operation for logging
+     * @param delete Call that should trigger the async deletion and return the resource to be
+     *     watched
+     * @param timeout Timeout for the operation
+     * @return Remaining timeout after deletion.
+     */
+    @SneakyThrows
+    protected static Duration deleteBlocking(
+            String operation, Callable<Waitable> delete, Duration timeout) {
+        LOG.info("{} with {} seconds timeout...", operation, timeout.toSeconds());
+        long start = System.currentTimeMillis();
+
+        Waitable deleted = null;
+        try {
+            deleted = delete.call();
+        } catch (KubernetesClientException kce) {
+            // Not found simply means the resource is already gone; rethrow any other error
+            if (kce.getCode() != HttpURLConnection.HTTP_NOT_FOUND) {
+                throw kce;
+            }
+        }
+
+        if (deleted != null) {
+            try {
+                deleted.waitUntilCondition(
+                        Objects::isNull, timeout.toMillis(), TimeUnit.MILLISECONDS);
+                LOG.info("Completed {}", operation);
+            } catch (KubernetesClientException kce) {
+                // We completely ignore not found errors and simply log others
+                if (kce.getCode() != HttpURLConnection.HTTP_NOT_FOUND) {
+                    LOG.warn("Error while " + operation, kce);
+                }
+            }
+        }
+
+        long elapsedMillis = System.currentTimeMillis() - start;
+        return Duration.ofMillis(Math.max(0, timeout.toMillis() - elapsedMillis));
+    }
 }
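
For context, a rough usage sketch of the new helpers: because `deleteDeploymentBlocking` returns whatever is left of the timeout, a caller can chain several blocking deletes under a single shutdown budget. The method name and the `-taskmanager` Deployment name below are illustrative assumptions, not code from this change; `kubernetesClient` and `operatorConfig` are the fields already used in the diff above.

```java
// Illustrative sketch only: chain two blocking deletes under one shared timeout budget,
// e.g. the JobManager and TaskManager Deployments of a standalone cluster.
private void deleteStandaloneClusterBlocking(String namespace, String clusterId) {
    Duration remaining = operatorConfig.getFlinkShutdownClusterTimeout();
    DeletionPropagation propagation = operatorConfig.getDeletionPropagation();

    // Delete the (assumed) JobManager Deployment and wait for it to disappear,
    // keeping whatever part of the timeout budget is left over.
    remaining =
            deleteDeploymentBlocking(
                    "JobManager",
                    kubernetesClient
                            .apps()
                            .deployments()
                            .inNamespace(namespace)
                            .withName(clusterId),
                    propagation,
                    remaining);

    // Spend the rest of the budget on the (assumed) TaskManager Deployment.
    deleteDeploymentBlocking(
            "TaskManager",
            kubernetesClient
                    .apps()
                    .deployments()
                    .inNamespace(namespace)
                    .withName(clusterId + "-taskmanager"),
            propagation,
            remaining);
}
```

Because `deleteBlocking` clamps the remainder at zero, later calls in the chain still trigger their deletes but get essentially no waiting time once the budget is exhausted.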