Commit 579545a

add special hotbackup testing scenario for cluster deployment
1 parent fd8d3ba commit 579545a

3 files changed: +169 -57 lines changed

3 files changed

+169
-57
lines changed

release_tester/arangodb/starter/deployments/cluster.py: +90 -13
@@ -141,7 +141,8 @@ def add_starter(name, port, opts, sm, hasAgency):
         self.create_tls_ca_cert()
         port = 9528
         count = 0
-        for this_node in list(range(1, self.props.cluster_nodes + 1)):
+        full_node_count = self.props.cluster_nodes + 2  # we need 2 additional nodes for hotbackup testing
+        for this_node in list(range(1, full_node_count + 1)):
             node = []
             node_opts.append(node)
             if this_node != 1:
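With this change the deployment prepares two spare starters on top of the configured cluster size; only the first self.props.cluster_nodes of them are launched during setup (see the next hunk), and the spares are held back for the hotbackup scenario. A minimal sketch of the resulting bookkeeping, assuming a cluster size of three (the variable names below are illustrative, not taken from the commit):

# illustrative only: node budget for a 3-node cluster with 2 hotbackup spares
cluster_nodes = 3
full_node_count = cluster_nodes + 2  # as introduced in the hunk above
prepared = [f"node{i}" for i in range(1, full_node_count + 1)]
started, spares = prepared[:cluster_nodes], prepared[cluster_nodes:]
assert started == ["node1", "node2", "node3"]
assert spares == ["node4", "node5"]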
@@ -153,44 +154,43 @@ def add_starter(name, port, opts, sm, hasAgency):
             add_starter(f"node{this_node}", port, node + common_opts, sm, count < 3)
             port += 100
             count += 1
-        self.backup_instance_count = count
         for instance in self.starter_instances:
             instance.is_leader = True
 
     def starter_run_impl(self):
         lh.subsection("instance setup")
-        for manager in self.starter_instances:
+        for manager in self.starter_instances[:self.props.cluster_nodes]:
             logging.info("Spawning instance")
             manager.run_starter()
 
         logging.info("waiting for the starters to become alive")
-        not_started = self.starter_instances[:] # This is a explicit copy
+        not_running = self.get_running_starters() # This is a explicit copy
         count = 0
-        while not_started:
-            logging.debug("waiting for mananger with logfile:" + str(not_started[-1].log_file))
-            if not_started[-1].is_instance_up():
-                not_started.pop()
+        while not_running:
+            logging.debug("waiting for mananger with logfile:" + str(not_running[-1].log_file))
+            if not_running[-1].is_instance_up():
+                not_running.pop()
             progress(".")
             time.sleep(1)
             count += 1
             if count > 120:
                 raise Exception("Cluster installation didn't come up in two minutes!")
 
         logging.info("waiting for the cluster instances to become alive")
-        for node in self.starter_instances:
+        for node in self.get_running_starters():
             node.detect_instances()
             node.detect_instance_pids()
             # self.cfg.add_frontend('http', self.cfg.publicip, str(node.get_frontend_port()))
 
         logging.info("instances are ready - JWT: " + self.starter_instances[0].get_jwt_header())
         count = 0
-        for node in self.starter_instances:
+        for node in self.get_running_starters():
             node.set_passvoid("cluster", count == 0)
             count += 1
         self.passvoid = "cluster"
 
     def finish_setup_impl(self):
-        self.makedata_instances = self.starter_instances[:]
+        self.makedata_instances = self.get_running_starters()
         self.set_frontend_instances()
 
     def _check_for_shards_in_sync(self):
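This hunk makes setup, password handling, and makedata operate only on the starters that were actually launched. It relies on get_running_starters() / get_not_running_starters() helpers that are not visible in this file's hunks and presumably live in one of the other two changed files. A minimal sketch of what such helpers could look like, assuming each starter exposes an is_running flag (both the flag and the bodies below are assumptions, not the commit's code):

# hypothetical helpers; the actual implementations live in the other changed files
def get_running_starters(self):
    """Starters whose processes have already been launched."""
    return [starter for starter in self.starter_instances if starter.is_running]

def get_not_running_starters(self):
    """Prepared spare starters that have not been launched yet."""
    return [starter for starter in self.starter_instances if not starter.is_running]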
@@ -483,12 +483,12 @@ def jam_attempt_impl(self):
         # After attempt of jamming, we have peer for nodeX in setup.json.
         # This peer will brake further updates because this peer is unavailable.
         # It is necessary to remove this peer from json for each starter instance
-        for instance in self.starter_instances:
+        for instance in self.get_running_starters():
             remove_node_x_from_json(instance.basedir)
 
     def shutdown_impl(self):
         ret = False
-        for node in self.starter_instances:
+        for node in self.get_running_starters():
             ret = ret or node.terminate_instance()
         logging.info("test ended")
         return ret
@@ -528,3 +528,80 @@ def generate_keyfile(self, keyfile):
                 "--host=localhost",
             ]
         )
+
+    @step
+    def test_hotbackup_impl(self):
+        """ test hotbackup feature: Cluster """
+        # step 1: create a backup
+        self.create_backup_and_upload("thy_name_is_" + self.name)
+        backup_from_step_1 = self.uploaded_backups[-1]
+
+        # step 2: create non-backup data
+        self.create_non_backup_data()
+        self.tcp_ping_all_nodes()
+
+        # step 3: add new db server
+        new_starter = self.get_not_running_starters()[0]
+        self.run_starter_and_wait(new_starter)
+
+        # step 4: create a backup
+        self.create_backup_and_upload("thy_name_is_" + self.name + "_+1_server")
+        backup_from_step_4 = self.uploaded_backups[-1]
+
+        # step 5: remove old db server
+        terminate_instance = None
+        if not self.starter_instances[1].have_this_instance(self.agency.get_leader()):
+            terminate_instance = self.starter_instances[1]
+        else:
+            terminate_instance = self.starter_instances[2]
+        terminate_instance.stop_dbserver()
+
+        # step 6: create another backup
+        self.create_backup_and_upload("thy_name_is_" + self.name + "_+1_server_-1server")
+
+        # step 7: download and restore backup from step 1
+        self.download_backup(backup_from_step_1)
+        self.validate_local_backup(backup_from_step_1)
+        backups = self.list_backup()
+        if backups[-1] != backup_from_step_1:
+            raise Exception("downloaded backup has different name? " + str(backups))
+        self.restore_backup(backup_from_step_1)
+        self.tcp_ping_all_nodes()
+
+        # step 8: check data
+        self.check_data_impl()
+        if not self.check_non_backup_data():
+            raise Exception("data created after backup is still there??")
+
+        # step 9: add new db server
+        new_starter2 = self.get_not_running_starters()[0]
+        self.run_starter_and_wait(new_starter2)
+
+        # step 10: download and restore backup from step 4
+        self.download_backup(backup_from_step_4)
+        self.validate_local_backup(backup_from_step_4)
+        backups = self.list_backup()
+        if backups[-1] != backup_from_step_4:
+            raise Exception("downloaded backup has different name? " + str(backups))
+        self.restore_backup(backup_from_step_4)
+        self.tcp_ping_all_nodes()
+
+        # step 11: check data
+        self.check_data_impl()
+        if not self.check_non_backup_data():
+            raise Exception("data created after backup is still there??")
+
+    @staticmethod
+    def run_starter_and_wait(starter):
+        starter.run_starter()
+        count = 0
+        while not starter.is_instance_up():
+            logging.debug("waiting for mananger with logfile:" + str(starter.log_file))
+            progress(".")
+            time.sleep(1)
+            count += 1
+            if count > 120:
+                raise Exception("Starter manager installation didn't come up in two minutes!")
+        starter.detect_instances()
+        starter.detect_instance_pids()
