@@ -141,7 +141,8 @@ def add_starter(name, port, opts, sm, hasAgency):
         self.create_tls_ca_cert()
         port = 9528
         count = 0
-        for this_node in list(range(1, self.props.cluster_nodes + 1)):
+        full_node_count = self.props.cluster_nodes + 2  # we need 2 additional nodes for hotbackup testing
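+        # the extra starters are only provisioned here; starter_run_impl() boots
+        # just the first cluster_nodes of them, the hotbackup test starts the spares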
+        for this_node in list(range(1, full_node_count + 1)):
             node = []
             node_opts.append(node)
             if this_node != 1:
@@ -153,44 +154,43 @@ def add_starter(name, port, opts, sm, hasAgency):
             add_starter(f"node{this_node}", port, node + common_opts, sm, count < 3)
             port += 100
             count += 1
-        self.backup_instance_count = count
         for instance in self.starter_instances:
             instance.is_leader = True
 
     def starter_run_impl(self):
         lh.subsection("instance setup")
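+        # spawn only the regular cluster nodes; the two spare starters
+        # provisioned for hotbackup testing stay down for now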
-        for manager in self.starter_instances:
+        for manager in self.starter_instances[: self.props.cluster_nodes]:
             logging.info("Spawning instance")
             manager.run_starter()
 
logging .info ("waiting for the starters to become alive" )
167
- not_started = self .starter_instances [:] # This is a explicit copy
167
+ not_running = self .get_running_starters () # This is a explicit copy
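+        # only the starters spawned above are polled; the spare starters
+        # are not expected to be up yet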
         count = 0
-        while not_started:
-            logging.debug("waiting for mananger with logfile:" + str(not_started[-1].log_file))
-            if not_started[-1].is_instance_up():
-                not_started.pop()
+        while not_running:
+            logging.debug("waiting for manager with logfile: " + str(not_running[-1].log_file))
+            if not_running[-1].is_instance_up():
+                not_running.pop()
             progress(".")
             time.sleep(1)
             count += 1
             if count > 120:
                 raise Exception("Cluster installation didn't come up in two minutes!")
 
logging .info ("waiting for the cluster instances to become alive" )
180
- for node in self .starter_instances :
180
+ for node in self .get_running_starters () :
181
181
node .detect_instances ()
182
182
node .detect_instance_pids ()
183
183
# self.cfg.add_frontend('http', self.cfg.publicip, str(node.get_frontend_port()))
184
184
185
185
logging .info ("instances are ready - JWT: " + self .starter_instances [0 ].get_jwt_header ())
186
186
count = 0
187
- for node in self .starter_instances :
187
+ for node in self .get_running_starters () :
188
188
node .set_passvoid ("cluster" , count == 0 )
189
189
count += 1
190
190
self .passvoid = "cluster"
191
191
192
192
     def finish_setup_impl(self):
-        self.makedata_instances = self.starter_instances[:]
+        self.makedata_instances = self.get_running_starters()
         self.set_frontend_instances()
 
     def _check_for_shards_in_sync(self):
@@ -483,12 +483,12 @@ def jam_attempt_impl(self):
         # After the jam attempt we still have a peer entry for nodeX in setup.json.
         # This unavailable peer would break further updates, so it has to be
         # removed from the json of every starter instance.
-        for instance in self.starter_instances:
+        for instance in self.get_running_starters():
             remove_node_x_from_json(instance.basedir)
 
     def shutdown_impl(self):
         ret = False
-        for node in self.starter_instances:
+        for node in self.get_running_starters():
             ret = ret or node.terminate_instance()
         logging.info("test ended")
         return ret
@@ -528,3 +528,80 @@ def generate_keyfile(self, keyfile):
                 "--host=localhost",
             ]
         )
+
+    @step
+    def test_hotbackup_impl(self):
+        """test hotbackup feature: Cluster"""
+        # step 1: create a backup
+        self.create_backup_and_upload("thy_name_is_" + self.name)
+        backup_from_step_1 = self.uploaded_backups[-1]
+
+        # step 2: create non-backup data
+        self.create_non_backup_data()
+        self.tcp_ping_all_nodes()
+
+        # step 3: add new db server
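+        # the "new" db server comes from one of the spare starters provisioned during setup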
+        new_starter = self.get_not_running_starters()[0]
+        self.run_starter_and_wait(new_starter)
+
+        # step 4: create a backup
+        self.create_backup_and_upload("thy_name_is_" + self.name + "_+1_server")
+        backup_from_step_4 = self.uploaded_backups[-1]
+
+        # step 5: remove old db server
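+        # of starters 1 and 2, pick the one that does not host the current agency leader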
+        terminate_instance = None
+        if not self.starter_instances[1].have_this_instance(self.agency.get_leader()):
+            terminate_instance = self.starter_instances[1]
+        else:
+            terminate_instance = self.starter_instances[2]
+        terminate_instance.stop_dbserver()
+
+        # step 6: create another backup
+        self.create_backup_and_upload("thy_name_is_" + self.name + "_+1_server_-1server")
+
+        # step 7: download and restore backup from step 1
+        self.download_backup(backup_from_step_1)
+        self.validate_local_backup(backup_from_step_1)
+        backups = self.list_backup()
+        if backups[-1] != backup_from_step_1:
+            raise Exception("downloaded backup has different name? " + str(backups))
+        self.restore_backup(backup_from_step_1)
+        self.tcp_ping_all_nodes()
+
+        # step 8: check data
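+        # restoring the step-1 backup must also wipe the non-backup data created in step 2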
+        self.check_data_impl()
+        if not self.check_non_backup_data():
+            raise Exception("data created after backup is still there??")
+
+        # step 9: add new db server
+        new_starter2 = self.get_not_running_starters()[0]
+        self.run_starter_and_wait(new_starter2)
+
+        # step 10: download and restore backup from step 4
+        self.download_backup(backup_from_step_4)
+        self.validate_local_backup(backup_from_step_4)
+        backups = self.list_backup()
+        if backups[-1] != backup_from_step_4:
+            raise Exception("downloaded backup has different name? " + str(backups))
+        self.restore_backup(backup_from_step_4)
+        self.tcp_ping_all_nodes()
+
+        # step 11: check data
+        self.check_data_impl()
+        if not self.check_non_backup_data():
+            raise Exception("data created after backup is still there??")
+
+    @staticmethod
+    def run_starter_and_wait(starter):
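+        """start a starter and block until its instances respond, two minutes max"""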
+        starter.run_starter()
+        count = 0
+        while not starter.is_instance_up():
+            logging.debug("waiting for manager with logfile: " + str(starter.log_file))
+            progress(".")
+            time.sleep(1)
+            count += 1
+            if count > 120:
+                raise Exception("Starter manager installation didn't come up in two minutes!")
+        starter.detect_instances()
+        starter.detect_instance_pids()
+
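Note: the diff calls get_running_starters() and get_not_running_starters() without
showing them. A minimal sketch of what they presumably look like, assuming each
starter manager carries an is_running flag (neither attribute nor the helper
bodies are confirmed by this diff):

    def get_running_starters(self):
        # starters whose processes have already been launched
        return [starter for starter in self.starter_instances if starter.is_running]

    def get_not_running_starters(self):
        # provisioned spares that have not been launched yet
        return [starter for starter in self.starter_instances if not starter.is_running]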