Skip to content

Cluster Cleanup Rework #1155

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 20 additions & 16 deletions cron/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from lib.configuration_check_error import ConfigurationCheckError, Status

# We currently have this dynamically as it will probably change quite a bit
STATUS_LIST = ['cooldown', 'warmup', 'job_no', 'job_start', 'job_error', 'job_end', 'cleanup_start', 'cleanup_end', 'measurement_control_start', 'measurement_control_end', 'measurement_control_error']
STATUS_LIST = ['cooldown', 'warmup', 'job_no', 'job_start', 'job_error', 'job_end', 'maintenance_start', 'maintenance_end', 'measurement_control_start', 'measurement_control_end', 'measurement_control_error']
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

def set_status(status_code, cur_temp, cooldown_time_after_job, data=None, run_id=None):
Expand Down Expand Up @@ -58,17 +58,17 @@ def set_status(status_code, cur_temp, cooldown_time_after_job, data=None, run_id
)
DB().query(query=query, params=params)

def do_cleanup(cur_temp, cooldown_time_after_job):
set_status('cleanup_start', cur_temp, cooldown_time_after_job)
def do_maintenance(cur_temp, cooldown_time_after_job):
set_status('maintenance_start', cur_temp, cooldown_time_after_job)

result = subprocess.run(['sudo',
os.path.join(os.path.dirname(os.path.abspath(__file__)),'../tools/cluster/cleanup.sh')],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True,)
result = subprocess.check_output(['sudo', os.path.join(os.path.dirname(os.path.abspath(__file__)),'../tools/cluster/cleanup.py')], encoding='UTF-8')

set_status('cleanup_end', cur_temp, cooldown_time_after_job, data=f"stdout: {result.stdout}, stderr: {result.stderr}")
set_status('maintenance_end', cur_temp, cooldown_time_after_job, data=result)

if '<<<< NO PACKAGES UPDATED - NO NEED TO RUN VALIDATION WORKLOAD >>>>' not in result:
return True # must run validation workload again. New packages installed

return None

if __name__ == '__main__':
try:
Expand All @@ -93,8 +93,14 @@ def do_cleanup(cur_temp, cooldown_time_after_job):
last_cooldown_time = 0
current_temperature = -1
temperature_errors = 0
must_revalidated_bc_new_packages = False

while True:

# run periodic cleanup in between every run
if not args.testing:
must_revalidated_bc_new_packages = do_maintenance(current_temperature, last_cooldown_time) # when new packages are installed, we must revalidate

job = Job.get_job('run')
if job and job.check_job_running():
error_helpers.log_error('Job is still running. This is usually an error case! Continuing for now ...', machine=config_main['machine']['description'])
Expand Down Expand Up @@ -134,7 +140,8 @@ def do_cleanup(cur_temp, cooldown_time_after_job):
last_cooldown_time = cooldown_time
cooldown_time = 0

if not args.testing and validate.is_validation_needed(config_main['machine']['id'], client_main['time_between_control_workload_validations']):

if not args.testing and (must_revalidated_bc_new_packages or validate.is_validation_needed(config_main['machine']['id'], client_main['time_between_control_workload_validations'])):
set_status('measurement_control_start', current_temperature, last_cooldown_time)
validate.run_workload(cwl['name'], cwl['uri'], cwl['filename'], cwl['branch'])
set_status('measurement_control_end', current_temperature, last_cooldown_time)
Expand All @@ -152,6 +159,7 @@ def do_cleanup(cur_temp, cooldown_time_after_job):
name=f"{config_main['machine']['description']} is operating normally. All STDDEV fine.",
message='\n'.join(message)
)
must_revalidated_bc_new_packages = False # reset after run
except Exception as exception: # pylint: disable=broad-except
validate.handle_validate_exception(exception)
set_status('measurement_control_error', current_temperature, last_cooldown_time)
Expand All @@ -160,8 +168,6 @@ def do_cleanup(cur_temp, cooldown_time_after_job):
# endlessly in validation until manually handled, which is what we want.
if not args.testing:
time.sleep(client_main['time_between_control_workload_validations'])
finally:
do_cleanup(current_temperature, last_cooldown_time)

elif job:
set_status('job_start', current_temperature, last_cooldown_time, run_id=job._run_id)
Expand All @@ -185,14 +191,12 @@ def do_cleanup(cur_temp, cooldown_time_after_job):
except Exception as exc: # pylint: disable=broad-except
set_status('job_error', current_temperature, last_cooldown_time, data=str(exc), run_id=job._run_id)
error_helpers.log_error('Job processing in cluster failed (client.py)', exception=exc, previous_exception=exc.__context__, run_id=job._run_id, machine=config_main['machine']['description'], name=job._name, url=job._url)
finally:
if not args.testing:
do_cleanup(current_temperature, last_cooldown_time)

else:
do_cleanup(current_temperature, last_cooldown_time)
set_status('job_no', current_temperature, last_cooldown_time)
if client_main['shutdown_on_job_no'] is True:
subprocess.check_output(['sync'])
time.sleep(60) # sleep for 60 before going to suspend to allow logins to cluster when systems are fresh rebooted for maintenance
subprocess.check_output(['sudo', 'systemctl', 'suspend'])
if not args.testing:
time.sleep(client_main['sleep_time_no_job'])
Expand Down
18 changes: 13 additions & 5 deletions lib/system_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,16 @@ def check_one_energy_and_scope_machine_provider():
def check_tmpfs_mount():
return not any(partition.mountpoint == '/tmp' and partition.fstype != 'tmpfs' for partition in psutil.disk_partitions())

def check_ntp():
if platform.system() == 'Darwin': # no NTP for darwin, as this is linux cluster only functionality
return True

ntp_status = subprocess.check_output(['timedatectl', '-a'], encoding='UTF-8')
if 'System clock synchronized: no' not in ntp_status or 'NTP service: inactive' not in ntp_status:
return False

return True

def check_cpu_utilization():
return psutil.cpu_percent(0.1) < 5.0

Expand All @@ -58,11 +68,8 @@ def check_free_memory():
return psutil.virtual_memory().available >= GMT_Resources['free_memory']

def check_containers_running():
result = subprocess.run(['docker', 'ps', '--format', '{{.Names}}'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True, encoding='UTF-8')
return not bool(result.stdout.strip())
result = subprocess.check_output(['docker', 'ps', '--format', '{{.Names}}'], encoding='UTF-8')
return not bool(result.strip())

def check_docker_daemon():
result = subprocess.run(['docker', 'version'],
Expand Down Expand Up @@ -96,6 +103,7 @@ def check_swap_disabled():
(check_db, Status.ERROR, 'db online', 'This text will never be triggered, please look in the function itself'),
(check_one_energy_and_scope_machine_provider, Status.ERROR, 'single energy scope machine provider', 'Please only select one provider with energy and scope machine'),
(check_tmpfs_mount, Status.INFO, 'tmpfs mount', 'We recommend to mount tmp on tmpfs'),
(check_ntp, Status.WARN, 'ntp', 'You have NTP time syncing active. This can create noise in runs and should be deactivated.'),
(check_cpu_utilization, Status.WARN, '< 5% CPU utilization', 'Your system seems to be busy. Utilization is above 5%. Consider terminating some processes for a more stable measurement.'),
(check_free_disk, Status.ERROR, '1 GiB free hdd space', 'We recommend to free up some disk space (< 1GiB available)'),
(check_free_memory, Status.ERROR, '1 GiB free memory', 'No free memory! Please kill some programs (< 1GiB available)'),
Expand Down
57 changes: 57 additions & 0 deletions tools/cluster/cleanup_original.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env python3
import sys
import faulthandler
faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr

import os
import time
import subprocess

# We can NEVER include non system packages here, as we rely on them all being writeable by root only.
# This will only be true for non-venv pure system packages coming with the python distribution of the OS

# always
subprocess.check_output(['sudo', '/usr/libexec/dpkg/dpkg-db-backup'])

subprocess.check_output(['sudo', '/sbin/e2scrub_all'])

subprocess.check_output(['sudo', '/sbin/fstrim', '--listed-in', '/etc/fstab:/proc/self/mountinfo', '--verbose', '--quiet-unsupported'])

subprocess.check_output(['sudo', '/systemd-tmpfiles', '--clean'])

subprocess.check_output(['sudo', '/usr/sbin/logrotate', '/etc/logrotate.conf'])

subprocess.check_output(['sudo', 'journalctl', '--flush'])

## Update time
# may throw exception, but we need to check if time sync calls work, as we do not know what the actual time is
# Typically in cluster installations port 123 is blocked and a local time server is available. Thus the guard function here
subprocess.check_output(['sudo', 'timedatectl', 'set-ntp', 'true']) # this will trigger immediate update
ntp_status = subprocess.check_output(['timedatectl', '-a'], encoding='UTF-8')
if 'System clock synchronized: yes' not in ntp_status or 'NTP service: active' not in ntp_status:
raise RuntimeError('System clock could not be synchronized', ntp_status=ntp_status)

result = subprocess.check_output(['sudo', 'timedatectl', 'set-ntp', 'false']) # we want NTP always off in clusters
ntp_status = subprocess.check_output(['timedatectl', '-a'], encoding='UTF-8')
if 'System clock synchronized: no' not in ntp_status or 'NTP service: inactive' not in ntp_status:
raise RuntimeError('System clock synchronization could not be turned off', ntp_status=ntp_status)

## Do APT last, as we want to insert the Changelog
apt_packages_upgrade = None
now = time.time()
if (not os.path.exists('/var/log/apt/history.log')) or ((now - os.path.getmtime('/var/log/apt/history.log')) > 86400):

print("history.log is older than 24 hours")
subprocess.check_output(['sudo', 'apt', 'update'])

apt_packages_upgrade = subprocess.check_output(['apt', 'list', '--upgradable'])

subprocess.check_output(['sudo', 'apt', 'full-upgrade', '-y'])

if apt_packages_upgrade:
print('<<<< UPDATED APT PACKAGES >>>>')
print(apt_packages_upgrade)
print('<<<< END UPDATED APT PACKAGES >>>>')

else:
print('<<<< NO PACKAGES UPDATED - NO NEED TO RUN VALIDATION WORKLOAD >>>>')
25 changes: 0 additions & 25 deletions tools/cluster/cleanup_original.sh

This file was deleted.

Loading