Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
3.15.0
------

**CHANGES**
- Replace cfn-hup in compute nodes with systemd timers to support in place updates.

3.14.1
------

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[Unit]
Description=Check file modification time every minute

[Timer]
AccuracySec=1s
OnActiveSec=60sec
OnUnitActiveSec=60sec
Unit=check-update.service

[Install]
WantedBy=timers.target
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,7 @@
# limitations under the License.

include_recipe 'aws-parallelcluster-computefleet::fleet_status'

if ['ComputeFleet'].include?(node['cluster']['node_type'])
include_recipe 'aws-parallelcluster-computefleet::config_check_update_systemd_service'
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# frozen_string_literal: true

#
# Cookbook:: aws-parallelcluster-slurm
# Recipe:: config_compute
#
# Copyright:: 2013-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

template '/etc/systemd/system/check-update.service' do
source 'check_update/check-update.service.erb'
owner 'root'
group 'root'
mode '0644'
end

cookbook_file '/etc/systemd/system/check-update.timer' do
source 'check_update/check-update.timer'
owner 'root'
group 'root'
mode '0644'
action :create
end

file node['cluster']['shared_update_path'] do
content ''
owner 'root'
group 'root'
mode '0644'
action :create_if_missing
end

file node['cluster']['update_checkpoint'] do
content ''
owner 'root'
group 'root'
mode '0644'
action :create_if_missing
end

service 'check-update.timer' do
action [:enable, :start]
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
require 'spec_helper'

describe 'aws-parallelcluster-computefleet::config_check_update_systemd_service' do
for_all_oses do |platform, version|
context "on #{platform}#{version}" do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
node.override['cluster']['node_type'] = 'ComputeFleet'
end
runner.converge(described_recipe)
end
cached(:node) { chef_run.node }

it 'creates the check-update.service template' do
is_expected.to create_template('/etc/systemd/system/check-update.service')
.with(source: 'check_update/check-update.service.erb')
.with(owner: 'root')
.with(group: 'root')
.with(mode: '0644')
end

it 'creates the check-update.timer file' do
is_expected.to create_cookbook_file('/etc/systemd/system/check-update.timer')
.with(source: 'check_update/check-update.timer')
.with(owner: 'root')
.with(group: 'root')
.with(mode: '0644')
end

it 'creates the shared update path file if missing' do
is_expected.to create_file_if_missing(node['cluster']['shared_update_path'])
.with(content: '')
.with(owner: 'root')
.with(group: 'root')
.with(mode: '0644')
end

it 'creates the local update checkpoint file if missing' do
is_expected.to create_file_if_missing(node['cluster']['update_checkpoint'])
.with(content: '')
.with(owner: 'root')
.with(group: 'root')
.with(mode: '0644')
end

it 'enables and starts the check-update.timer service' do
is_expected.to enable_service('check-update.timer')
is_expected.to start_service('check-update.timer')
end

describe 'check-update.service template content' do
it 'has Type=oneshot to prevent concurrent executions' do
is_expected.to render_file('/etc/systemd/system/check-update.service')
.with_content('Type=oneshot')
end

it 'has TimeoutStartSec=30 to handle NFS hangs' do
is_expected.to render_file('/etc/systemd/system/check-update.service')
.with_content('TimeoutStartSec=30')
end

it 'exits gracefully if shared file does not exist' do
is_expected.to render_file('/etc/systemd/system/check-update.service')
.with_content('[ ! -f "$SHARED_FILE" ] && exit 0')
end

it 'exits gracefully if cat fails' do
is_expected.to render_file('/etc/systemd/system/check-update.service')
.with_content('CURRENT_UPDATE=$(cat "$SHARED_FILE") || exit 0')
end

it 'writes checkpoint before running update action' do
is_expected.to render_file('/etc/systemd/system/check-update.service')
.with_content('echo "$CURRENT_UPDATE" > "$LOCAL_CHECKPOINT" && ')
end

it 'references the correct shared update path' do
is_expected.to render_file('/etc/systemd/system/check-update.service')
.with_content("SHARED_FILE=\"#{node['cluster']['shared_update_path']}\"")
end

it 'references the correct local checkpoint path' do
is_expected.to render_file('/etc/systemd/system/check-update.service')
.with_content("LOCAL_CHECKPOINT=\"#{node['cluster']['update_checkpoint']}\"")
end

it 'calls cfn-hup-update-action.sh when update is needed' do
is_expected.to render_file('/etc/systemd/system/check-update.service')
.with_content("#{node['cluster']['scripts_dir']}/cfn-hup-update-action.sh")
end
end
end
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[Unit]
Description=Check for recent file modifications

[Service]
Type=oneshot
TimeoutStartSec=30
ExecStart=/bin/bash -c '\
SHARED_FILE="<%= node['cluster']['shared_update_path'] %>"; \
LOCAL_CHECKPOINT="<%= node['cluster']['update_checkpoint'] %>"; \
\
[ ! -f "$SHARED_FILE" ] && exit 0; \
\
CURRENT_UPDATE=$(cat "$SHARED_FILE") || exit 0; \
LAST_APPLIED=$([ -f "$LOCAL_CHECKPOINT" ] && cat "$LOCAL_CHECKPOINT" || echo ""); \
\
if [ "$CURRENT_UPDATE" != "$LAST_APPLIED" ]; then \
echo "$CURRENT_UPDATE" > "$LOCAL_CHECKPOINT" && <%= node['cluster']['scripts_dir'] %>/cfn-hup-update-action.sh; \
fi'

[Install]
WantedBy=multi-user.target
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,6 @@ triggers=post.update
<% case node['cluster']['node_type'] -%>
<% when 'HeadNode', 'LoginNode' -%>
path=Resources.<%= @launch_template_resource_id %>.Metadata.AWS::CloudFormation::Init
action=PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin; . /etc/parallelcluster/pcluster_cookbook_environment.sh; $CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -v --stack <%= @stack_id %> --resource <%= @launch_template_resource_id %> --configsets update --region <%= @region %> --url <%= @cloudformation_url %> --role <%= @cfn_init_role %>
<% when 'ComputeFleet' -%>
path=Resources.<%= @launch_template_resource_id %>
action=timeout <%= @node_bootstrap_timeout %> <%= @update_hook_script_dir %>/cfn-hup-update-action.sh
action=PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin;. /etc/parallelcluster/pcluster_cookbook_environment.sh; $CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -v --stack <%= @stack_id %> --resource <%= @launch_template_resource_id %> --configsets update --region <%= @region %> --url <%= @cloudformation_url %> --role <%= @cfn_init_role %>
<% end %>
runas=root
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
dcv_port: node['cluster']['dcv_port'],
dcv_auth_certificate: node['cluster']['dcv']['authenticator']['certificate'],
dcv_auth_private_key: node['cluster']['dcv']['authenticator']['private_key'],
dcv_auth_user: node['cluster']['dcv']['authenticator']['user'],
cfnhup_enabled: cfnhup_enabled?
dcv_auth_user: node['cluster']['dcv']['authenticator']['user']
)
end
Original file line number Diff line number Diff line change
Expand Up @@ -57,28 +57,6 @@
end
end

context "when head node and cfn-hup disabled on fleet" do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
node.override['cluster']['node_type'] = 'HeadNode'
node.override['cluster']['dcv_enabled'] = 'head_node'
node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false'
allow_any_instance_of(Object).to receive(:dcv_installed?).and_return(true)
end
runner.converge(described_recipe)
end
cached(:node) { chef_run.node }

it 'has the correct content' do
is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
.with_content("[program:cfn-hup]")
.with_content("[program:clustermgtd]")
.with_content("[program:clusterstatusmgtd]")
.with_content("[program:pcluster_dcv_authenticator]")
.with_content("--port 8444")
end
end

context "when compute fleet" do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
Expand All @@ -92,32 +70,16 @@

it 'has the correct content' do
is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
.with_content("[program:cfn-hup]")
.with_content("[program:computemgtd]")

is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
.with_content("[program:pcluster_dcv_authenticator]")
end
end

context "when compute fleet with cfn-hup disabled on fleet" do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
node.override['cluster']['node_type'] = 'ComputeFleet'
node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false'
end
runner.converge(described_recipe)
end
cached(:node) { chef_run.node }

it 'has the correct content' do
is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
.with_content("[program:computemgtd]")

is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
.with_content("[program:cfn-hup]")
end
end

context "when login node and dcv configured" do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
Expand Down Expand Up @@ -157,25 +119,6 @@
.with_content("[program:pcluster_dcv_authenticator]")
end
end

context "when login node with cfn-hup disabled on fleet" do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
node.override['cluster']['node_type'] = 'LoginNode'
node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false'
end
runner.converge(described_recipe)
end
cached(:node) { chef_run.node }

it 'has the correct content' do
is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
.with_content("[program:loginmgtd]")

is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
.with_content("[program:cfn-hup]")
end
end
end
end
end
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Generated by Chef for AWS ParallelCluster <%= node['cluster']['node_type'] -%>
# Local modifications could be overwritten.
<%# HeadNode, ComputeFleet, LoginNode -%>
<% if @cfnhup_enabled -%>
<% case node['cluster']['node_type'] -%>
<% when 'HeadNode', 'LoginNode' -%>
[program:cfn-hup]
command = <%= node['cluster']['scripts_dir']%>/cfn-hup-runner.sh
autorestart = true
Expand Down
7 changes: 4 additions & 3 deletions cookbooks/aws-parallelcluster-shared/attributes/cluster.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
default['cluster']['log_base_dir'] = '/var/log/parallelcluster'
default['cluster']['etc_dir'] = '/etc/parallelcluster'

# Shared file used to manage inplace updates
default['cluster']['shared_update_path'] = "#{node['cluster']['shared_dir']}/check_update"
default['cluster']['update_checkpoint'] = "#{node['cluster']['scripts_dir']}/update_checkpoint"

# Slurm_plugin_dir is used by slurm cookbook and custom_actions recipe
default['cluster']['slurm_plugin_dir'] = "#{node['cluster']['etc_dir']}/slurm_plugin"

Expand All @@ -34,6 +38,3 @@

# Default NFS mount options
default['cluster']['nfs']['hard_mount_options'] = 'hard,_netdev,noatime'

# Cluster Updates
default['cluster']['in_place_update_on_fleet_enabled'] = 'true'
11 changes: 0 additions & 11 deletions cookbooks/aws-parallelcluster-shared/libraries/helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -106,14 +106,3 @@ def wait_sync_file(path)
timeout 5
end
end

def cfnhup_enabled?
# cfn-hup is always enabled on the head node, as it is required to perform cluster updates.
# cfn-hup can be disabled on compute nodes and login nodes, limiting the cluster update in the sense that
# live updates on compute and login nodes are not possible.
node['cluster']['node_type'] == 'HeadNode' || node['cluster']['in_place_update_on_fleet_enabled'] == 'true'
end

def cluster_readiness_check_on_update_enabled?
node['cluster']['in_place_update_on_fleet_enabled'] == 'true'
end

This file was deleted.

Loading
Loading