1616
1717package com .google .jenkins .plugins .computeengine ;
1818
19- import static com .google .jenkins .plugins .computeengine .ComputeEngineCloud .CLOUD_ID_LABEL_KEY ;
2019import static java .util .Collections .emptyList ;
2120
2221import com .google .api .services .compute .model .Instance ;
22+ import com .google .common .annotations .VisibleForTesting ;
2323import com .google .common .collect .ImmutableMap ;
24+ import com .google .jenkins .plugins .computeengine .client .ComputeClientV2 ;
2425import hudson .Extension ;
2526import hudson .model .PeriodicWork ;
2627import hudson .model .Slave ;
2728import java .io .IOException ;
29+ import java .security .GeneralSecurityException ;
30+ import java .time .LocalDateTime ;
31+ import java .time .OffsetDateTime ;
32+ import java .time .ZoneOffset ;
33+ import java .time .format .DateTimeFormatter ;
34+ import java .time .temporal .ChronoUnit ;
2835import java .util .List ;
29- import java .util .Map ;
3036import java .util .Set ;
3137import java .util .logging .Level ;
3238import java .util .logging .Logger ;
3945@ Symbol ("cleanLostNodesWork" )
4046public class CleanLostNodesWork extends PeriodicWork {
4147 protected final Logger logger = Logger .getLogger (getClass ().getName ());
48+ public static final String NODE_IN_USE_LABEL_KEY = "jenkins_node_last_refresh" ;
49+ public static final long RECURRENCE_PERIOD = Long .parseLong (
50+ System .getProperty (CleanLostNodesWork .class .getName () + ".recurrencePeriod" , String .valueOf (HOUR )));
51+
52+ @ VisibleForTesting
53+ public static final int LOST_MULTIPLIER = 3 ;
54+ /**
55+ * The formatter for the label timestamp value as per google label format,
56+ * "The value can only contain lowercase letters, numeric characters, underscores and dashes.
57+ * The value can be at most 63 characters long. International characters are allowed".
58+ */
59+ private static final DateTimeFormatter formatter = DateTimeFormatter .ofPattern ("yyyy_MM_dd't'HH_mm_ss_SSS'z'" );
4260
4361 /** {@inheritDoc} */
4462 @ Override
4563 public long getRecurrencePeriod () {
46- return HOUR ;
64+ return RECURRENCE_PERIOD ;
65+ }
66+
67+ public static String getLastRefreshLabelVal () {
68+ return formatter .format (OffsetDateTime .now (ZoneOffset .UTC ));
4769 }
4870
4971 /** {@inheritDoc} */
@@ -55,22 +77,50 @@ protected void doRun() {
5577
5678 private void cleanCloud (ComputeEngineCloud cloud ) {
5779 logger .log (Level .FINEST , "Cleaning cloud " + cloud .getCloudName ());
58- List <Instance > remoteInstances = findRemoteInstances (cloud );
80+ ComputeClientV2 clientV2 ;
81+ try {
82+ clientV2 = cloud .getClientV2 ();
83+ } catch (GeneralSecurityException | IOException ex ) {
84+ logger .log (Level .WARNING , "Error getting clientV2 for cloud " + cloud .getCloudName (), ex );
85+ return ;
86+ }
87+ List <Instance > remoteInstances = findRunningRemoteInstances (clientV2 );
5988 Set <String > localInstances = findLocalInstances (cloud );
89+ if (!(localInstances .isEmpty () || remoteInstances .isEmpty ())) {
90+ updateLocalInstancesLabel (clientV2 , localInstances , remoteInstances );
91+ }
6092 remoteInstances .stream ()
6193 .filter (remote -> isOrphaned (remote , localInstances ))
6294 .forEach (remote -> terminateInstance (remote , cloud ));
6395 }
6496
6597 private boolean isOrphaned (Instance remote , Set <String > localInstances ) {
66- String instanceName = remote .getName ();
67- logger .log (Level .FINEST , "Checking instance " + instanceName );
68- return !localInstances .contains (instanceName );
98+ /* It is necessary to check if the remote instance is present in localInstances.
99+ The `remote` instance has an old timestamp because it hasn't been fetched again
100+ after the `updateLocalInstancesLabel` call, to avoid extra network calls.
101+ */
102+ if (localInstances .contains (remote .getName ())) {
103+ return false ;
104+ }
105+ String nodeLastRefresh = remote .getLabels ().get (NODE_IN_USE_LABEL_KEY );
106+ if (nodeLastRefresh == null ) {
107+ return false ;
108+ }
109+ OffsetDateTime lastRefresh =
110+ LocalDateTime .parse (nodeLastRefresh , formatter ).atOffset (ZoneOffset .UTC );
111+ boolean isOrphan = lastRefresh
112+ .plus (RECURRENCE_PERIOD * LOST_MULTIPLIER , ChronoUnit .MILLIS )
113+ .isBefore (OffsetDateTime .now (ZoneOffset .UTC ));
114+ logger .log (
115+ Level .FINEST ,
116+ () -> "Instance " + remote .getName () + " last_refresh label value: " + nodeLastRefresh + ", isOrphan: "
117+ + isOrphan );
118+ return isOrphan ;
69119 }
70120
71121 private void terminateInstance (Instance remote , ComputeEngineCloud cloud ) {
72122 String instanceName = remote .getName ();
73- logger .log (Level .INFO , "Remote instance " + instanceName + " not found locally, removing it" );
123+ logger .log (Level .INFO , "Removing orphaned instance: " + instanceName );
74124 try {
75125 cloud .getClient ().terminateInstanceAsync (cloud .getProjectId (), remote .getZone (), instanceName );
76126 } catch (IOException ex ) {
@@ -86,27 +136,47 @@ private List<ComputeEngineCloud> getClouds() {
86136 }
87137
88138 private Set <String > findLocalInstances (ComputeEngineCloud cloud ) {
89- return Jenkins .get ().getNodes ().stream ()
139+ var localInstances = Jenkins .get ().getNodes ().stream ()
90140 .filter (node -> node instanceof ComputeEngineInstance )
91141 .map (node -> (ComputeEngineInstance ) node )
92142 .filter (node -> node .getCloud ().equals (cloud ))
93143 .map (Slave ::getNodeName )
94144 .collect (Collectors .toSet ());
145+ logger .log (Level .FINEST , () -> "Found " + localInstances .size () + " local instances" );
146+ return localInstances ;
95147 }
96148
97- private List <Instance > findRemoteInstances (ComputeEngineCloud cloud ) {
98- Map <String , String > filterLabel = ImmutableMap .of (CLOUD_ID_LABEL_KEY , cloud .getInstanceId ());
149+ private List <Instance > findRunningRemoteInstances (ComputeClientV2 clientV2 ) {
99150 try {
100- return cloud . getClient (). listInstancesWithLabel ( cloud . getProjectId (), filterLabel ). stream ()
101- . filter ( instance -> shouldTerminateStatus ( instance . getStatus ()))
102- . collect ( Collectors . toList ()) ;
151+ var remoteInstances = clientV2 . retrieveInstanceByLabelKeyAndStatus ( NODE_IN_USE_LABEL_KEY , "RUNNING" );
152+ logger . log ( Level . FINEST , () -> "Found " + remoteInstances . size () + " running remote instances" );
153+ return remoteInstances ;
103154 } catch (IOException ex ) {
104155 logger .log (Level .WARNING , "Error finding remote instances" , ex );
105156 return emptyList ();
106157 }
107158 }
108159
109- private boolean shouldTerminateStatus (String status ) {
110- return !status .equals ("STOPPING" );
160+ /**
161+ * Updates the label of the local instances to indicate they are still in use. The method makes N network calls
162+ * for N local instances, couldn't find any bulk update apis.
163+ */
164+ private void updateLocalInstancesLabel (
165+ ComputeClientV2 clientV2 , Set <String > localInstances , List <Instance > remoteInstances ) {
166+ var remoteInstancesByName =
167+ remoteInstances .stream ().collect (Collectors .toMap (Instance ::getName , instance -> instance ));
168+ var labelToUpdate = ImmutableMap .of (NODE_IN_USE_LABEL_KEY , getLastRefreshLabelVal ());
169+ for (String instanceName : localInstances ) {
170+ var remoteInstance = remoteInstancesByName .get (instanceName );
171+ if (remoteInstance == null ) {
172+ continue ;
173+ }
174+ try {
175+ clientV2 .updateInstanceLabels (remoteInstance , labelToUpdate );
176+ logger .log (Level .FINEST , () -> "Updated label for instance " + instanceName );
177+ } catch (IOException e ) {
178+ logger .log (Level .WARNING , "Error updating label for instance " + instanceName , e );
179+ }
180+ }
111181 }
112182}
0 commit comments