diff --git a/cron/client.py b/cron/client.py
index a966bd283..7da3c75b2 100644
--- a/cron/client.py
+++ b/cron/client.py
@@ -21,7 +21,7 @@ from lib.configuration_check_error import ConfigurationCheckError, Status
 
 # We currently have this dynamically as it will probably change quite a bit
-STATUS_LIST = ['cooldown', 'warmup', 'job_no', 'job_start', 'job_error', 'job_end', 'cleanup_start', 'cleanup_end', 'measurement_control_start', 'measurement_control_end', 'measurement_control_error']
+STATUS_LIST = ['cooldown', 'warmup', 'job_no', 'job_start', 'job_error', 'job_end', 'maintenance_start', 'maintenance_end', 'measurement_control_start', 'measurement_control_end', 'measurement_control_error']
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
 
 def set_status(status_code, cur_temp, cooldown_time_after_job, data=None, run_id=None):
@@ -58,17 +58,17 @@ def set_status(status_code, cur_temp, cooldown_time_after_job, data=None, run_id
     )
     DB().query(query=query, params=params)
 
-def do_cleanup(cur_temp, cooldown_time_after_job):
-    set_status('cleanup_start', cur_temp, cooldown_time_after_job)
+def do_maintenance(cur_temp, cooldown_time_after_job):
+    set_status('maintenance_start', cur_temp, cooldown_time_after_job)
 
-    result = subprocess.run(['sudo',
-                             os.path.join(os.path.dirname(os.path.abspath(__file__)),'../tools/cluster/cleanup.sh')],
-                            stdout=subprocess.PIPE,
-                            stderr=subprocess.PIPE,
-                            check=True,)
+    result = subprocess.check_output(['sudo', os.path.join(os.path.dirname(os.path.abspath(__file__)),'../tools/cluster/cleanup.py')], encoding='UTF-8')
 
-    set_status('cleanup_end', cur_temp, cooldown_time_after_job, data=f"stdout: {result.stdout}, stderr: {result.stderr}")
+    set_status('maintenance_end', cur_temp, cooldown_time_after_job, data=result)
+    if '<<<< NO PACKAGES UPDATED - NO NEED TO RUN VALIDATION WORKLOAD >>>>' not in result:
+        return True # must run validation workload again; new packages were installed
+
+    return None
 
 
 if __name__ == '__main__':
     try:
@@ -93,8 +93,14 @@ def do_cleanup(cur_temp, cooldown_time_after_job):
         last_cooldown_time = 0
         current_temperature = -1
         temperature_errors = 0
+        must_revalidate_bc_new_packages = False
 
         while True:
+
+            # run periodic maintenance in between runs
+            if not args.testing:
+                must_revalidate_bc_new_packages = do_maintenance(current_temperature, last_cooldown_time) # when new packages are installed, we must revalidate
+
             job = Job.get_job('run')
             if job and job.check_job_running():
                 error_helpers.log_error('Job is still running. This is usually an error case! Continuing for now ...', machine=config_main['machine']['description'])
@@ -134,7 +140,8 @@ def do_cleanup(cur_temp, cooldown_time_after_job):
                 last_cooldown_time = cooldown_time
                 cooldown_time = 0
 
-                if not args.testing and validate.is_validation_needed(config_main['machine']['id'], client_main['time_between_control_workload_validations']):
+
+                if not args.testing and (must_revalidate_bc_new_packages or validate.is_validation_needed(config_main['machine']['id'], client_main['time_between_control_workload_validations'])):
                     set_status('measurement_control_start', current_temperature, last_cooldown_time)
                     validate.run_workload(cwl['name'], cwl['uri'], cwl['filename'], cwl['branch'])
                     set_status('measurement_control_end', current_temperature, last_cooldown_time)
@@ -152,6 +159,7 @@ def do_cleanup(cur_temp, cooldown_time_after_job):
                             name=f"{config_main['machine']['description']} is operating normally. All STDDEV fine.",
                             message='\n'.join(message)
                         )
+                        must_revalidate_bc_new_packages = False # reset after run
                     except Exception as exception: # pylint: disable=broad-except
                         validate.handle_validate_exception(exception)
                         set_status('measurement_control_error', current_temperature, last_cooldown_time)
@@ -160,8 +168,6 @@ def do_cleanup(cur_temp, cooldown_time_after_job):
                         # endlessly in validation until manually handled, which is what we want.
                         if not args.testing:
                             time.sleep(client_main['time_between_control_workload_validations'])
-                    finally:
-                        do_cleanup(current_temperature, last_cooldown_time)
 
                 elif job:
                     set_status('job_start', current_temperature, last_cooldown_time, run_id=job._run_id)
@@ -185,14 +191,12 @@ def do_cleanup(cur_temp, cooldown_time_after_job):
                     except Exception as exc: # pylint: disable=broad-except
                         set_status('job_error', current_temperature, last_cooldown_time, data=str(exc), run_id=job._run_id)
                         error_helpers.log_error('Job processing in cluster failed (client.py)', exception=exc, previous_exception=exc.__context__, run_id=job._run_id, machine=config_main['machine']['description'], name=job._name, url=job._url)
-                    finally:
-                        if not args.testing:
-                            do_cleanup(current_temperature, last_cooldown_time)
                 else:
-                    do_cleanup(current_temperature, last_cooldown_time)
                     set_status('job_no', current_temperature, last_cooldown_time)
                     if client_main['shutdown_on_job_no'] is True:
+                        subprocess.check_output(['sync'])
+                        time.sleep(60) # sleep for 60 seconds before suspending to allow logins to the cluster while systems are freshly rebooted for maintenance
                         subprocess.check_output(['sudo', 'systemctl', 'suspend'])
 
             if not args.testing:
                 time.sleep(client_main['sleep_time_no_job'])
diff --git a/lib/system_checks.py b/lib/system_checks.py
index 23ad4974d..5584026cd 100644
--- a/lib/system_checks.py
+++ b/lib/system_checks.py
@@ -47,6 +47,16 @@ def check_one_energy_and_scope_machine_provider():
 
 def check_tmpfs_mount():
     return not any(partition.mountpoint == '/tmp' and partition.fstype != 'tmpfs' for partition in psutil.disk_partitions())
 
+def check_ntp():
+    if platform.system() == 'Darwin': # no NTP check on Darwin, as this is Linux cluster-only functionality
+        return True
+
+    ntp_status = subprocess.check_output(['timedatectl', '-a'], encoding='UTF-8')
+    if 'System clock synchronized: no' not in ntp_status or 'NTP service: inactive' not in ntp_status:
+        return False
+
+    return True
+
 def check_cpu_utilization():
     return psutil.cpu_percent(0.1) < 5.0
@@ -58,11 +68,8 @@ def check_free_memory():
     return psutil.virtual_memory().available >= GMT_Resources['free_memory']
 
 def check_containers_running():
-    result = subprocess.run(['docker', 'ps', '--format', '{{.Names}}'],
-                            stdout=subprocess.PIPE,
-                            stderr=subprocess.PIPE,
-                            check=True, encoding='UTF-8')
-    return not bool(result.stdout.strip())
+    result = subprocess.check_output(['docker', 'ps', '--format', '{{.Names}}'], encoding='UTF-8')
+    return not bool(result.strip())
 
 def check_docker_daemon():
     result = subprocess.run(['docker', 'version'],
@@ -96,6 +103,7 @@ def check_swap_disabled():
     (check_db, Status.ERROR, 'db online', 'This text will never be triggered, please look in the function itself'),
     (check_one_energy_and_scope_machine_provider, Status.ERROR, 'single energy scope machine provider', 'Please only select one provider with energy and scope machine'),
     (check_tmpfs_mount, Status.INFO, 'tmpfs mount', 'We recommend to mount tmp on tmpfs'),
+    (check_ntp, Status.WARN, 'ntp', 'You have NTP time syncing active. This can create noise in runs and should be deactivated.'),
     (check_cpu_utilization, Status.WARN, '< 5% CPU utilization', 'Your system seems to be busy. Utilization is above 5%. Consider terminating some processes for a more stable measurement.'),
     (check_free_disk, Status.ERROR, '1 GiB free hdd space', 'We recommend to free up some disk space (< 1GiB available)'),
     (check_free_memory, Status.ERROR, '1 GiB free memory', 'No free memory! Please kill some programs (< 1GiB available)'),
diff --git a/tools/cluster/cleanup_original.py b/tools/cluster/cleanup_original.py
new file mode 100644
index 000000000..d69784a66
--- /dev/null
+++ b/tools/cluster/cleanup_original.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+import sys
+import faulthandler
+faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr
+
+import os
+import time
+import subprocess
+
+# We can NEVER include non-system packages here, as we rely on them all being writable by root only.
+# This will only be true for non-venv pure system packages coming with the python distribution of the OS
+
+# always
+subprocess.check_output(['sudo', '/usr/libexec/dpkg/dpkg-db-backup'])
+
+subprocess.check_output(['sudo', '/sbin/e2scrub_all'])
+
+subprocess.check_output(['sudo', '/sbin/fstrim', '--listed-in', '/etc/fstab:/proc/self/mountinfo', '--verbose', '--quiet-unsupported'])
+
+subprocess.check_output(['sudo', '/usr/bin/systemd-tmpfiles', '--clean'])
+
+subprocess.check_output(['sudo', '/usr/sbin/logrotate', '/etc/logrotate.conf'])
+
+subprocess.check_output(['sudo', 'journalctl', '--flush'])
+
+## Update time
+# May throw an exception, but we need to check that the time sync calls work, as we do not know what the actual time is.
+# Typically in cluster installations port 123 is blocked and a local time server is available. Hence the guard here.
+subprocess.check_output(['sudo', 'timedatectl', 'set-ntp', 'true']) # this will trigger an immediate update
+ntp_status = subprocess.check_output(['timedatectl', '-a'], encoding='UTF-8')
+if 'System clock synchronized: yes' not in ntp_status or 'NTP service: active' not in ntp_status:
+    raise RuntimeError('System clock could not be synchronized', ntp_status)
+
+subprocess.check_output(['sudo', 'timedatectl', 'set-ntp', 'false']) # we want NTP always off in clusters
+ntp_status = subprocess.check_output(['timedatectl', '-a'], encoding='UTF-8')
+if 'System clock synchronized: no' not in ntp_status or 'NTP service: inactive' not in ntp_status:
+    raise RuntimeError('System clock synchronization could not be turned off', ntp_status)
+
+## Do APT last, as we want to insert the Changelog
+apt_packages_upgrade = None
+now = time.time()
+if (not os.path.exists('/var/log/apt/history.log')) or ((now - os.path.getmtime('/var/log/apt/history.log')) > 86400):
+
+    print("history.log is missing or older than 24 hours")
+    subprocess.check_output(['sudo', 'apt', 'update'])
+
+    apt_packages_upgrade = subprocess.check_output(['apt', 'list', '--upgradable'], encoding='UTF-8')
+
+    subprocess.check_output(['sudo', 'apt', 'full-upgrade', '-y'])
+
+if apt_packages_upgrade:
+    print('<<<< UPDATED APT PACKAGES >>>>')
+    print(apt_packages_upgrade)
+    print('<<<< END UPDATED APT PACKAGES >>>>')
+
+else:
+    print('<<<< NO PACKAGES UPDATED - NO NEED TO RUN VALIDATION WORKLOAD >>>>')
diff --git a/tools/cluster/cleanup_original.sh b/tools/cluster/cleanup_original.sh
deleted file mode 100644
index a5fae7f3d..000000000
--- a/tools/cluster/cleanup_original.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-echo "apt-daily-upgrade"
-/usr/lib/apt/apt.systemd.daily update
-
-echo "apt-daily"
-/usr/lib/apt/apt.systemd.daily install
-
-echo "dpkg-db-backup"
-/usr/libexec/dpkg/dpkg-db-backup
-
-echo "e2scrub_all"
-/sbin/e2scrub_all
-
-echo "fstrim"
-/sbin/fstrim --listed-in /etc/fstab:/proc/self/mountinfo --verbose --quiet-unsupported
-
-echo "systemd-tmpfiles-clean"
-systemd-tmpfiles --clean
-
-echo "logrotate"
-/usr/sbin/logrotate /etc/logrotate.conf
-
-echo "systemd-journal-flush"
-journalctl --flush
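
For reviewers, a minimal sketch (not part of this patch) of the NTP-state parsing that both check_ntp() in lib/system_checks.py and the cleanup script rely on, assuming the timedatectl -a output format quoted above; the helper name ntp_fully_disabled is illustrative only:

#!/usr/bin/env python3
# Illustrative sketch only: mirrors the marker strings used by check_ntp() and cleanup_original.py.
import subprocess

def ntp_fully_disabled(ntp_status: str) -> bool:
    # The check passes only when BOTH markers are present, i.e. the clock is not
    # synchronized AND the NTP service is inactive (the state the cluster expects).
    return ('System clock synchronized: no' in ntp_status
            and 'NTP service: inactive' in ntp_status)

if __name__ == '__main__':
    status = subprocess.check_output(['timedatectl', '-a'], encoding='UTF-8')
    print('NTP fully disabled:', ntp_fully_disabled(status))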