aliZsync+alizsw prototype for P2 software distribution #200

Draft · wants to merge 4 commits into base: master

Changes from all commits
192 changes: 192 additions & 0 deletions hacking/aliZsync
@@ -0,0 +1,192 @@
#!/bin/bash
# === This file is part of ALICE O² ===
#
# Copyright 2021 CERN and copyright holders of ALICE O².
# Author: Teo Mrnjavac <[email protected]>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license CERN does not waive the privileges and
# immunities granted to it by virtue of its status as an
# Intergovernmental Organization or submit itself to any jurisdiction.
# =============================================================================
#
# Quickstart:
# Install `zfs-kmod` as instructed here:
# https://openzfs.github.io/openzfs-docs/Getting%20Started/RHEL%20and%20CentOS.html
# $ sudo modprobe zfs
# Install aliBuild & general dependencies as per instructions:
# https://alice-doc.github.io/alice-analysis-tutorial/building/prereq-centos7.html
# Instead of running `aliBuild init`, create an empty directory, cd into it, then:
# $ aliZsync init
# $ cd . # MANDATORY: a new ZFS pool was just created and mounted
# over this directory, so the shell must re-enter it
# Perform any alidist/aliDoctor/aliBuild operations.
# When ready to release:
# $ aliZsync tag <tag_name>
# Synchronize all tags in cluster:
# $ ALIZSYNC_INVENTORY=/path/to/inventory aliZsync sync
#   (ALIZSYNC_INVENTORY defaults to /etc/o2.d/aliZsync_inventory)
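#
# The inventory file lists one target host per line; '#' comments and
# Ansible-style '[section]' headers are skipped. A minimal example
# (hostnames are illustrative):
#   [flps]
#   flp001.cern.ch
#   flp002.cern.ch   # anything after the first word on a line is ignored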

ProgName=$(basename "$0")

POOLNAME="${ALIZSYNC_POOL_NAME:-aliZsync}"
POOLSIZE="${ALIZSYNC_POOL_SIZE:-100G}"
IMG_FILE_PATH="${ALIZSYNC_IMG_FILE_PATH:-$HOME/$POOLNAME.img}"
TRANSPORT="${ALIZSYNC_TRANSPORT:-mbuffer}"

INVENTORY_FILE="${ALIZSYNC_INVENTORY:-/etc/o2.d/aliZsync_inventory}"
TARGET_ROOT="${ALIZSYNC_TARGET_ROOT:-/opt/alizsw}"
TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
TAG_NAME="${2:-$TIMESTAMP}"
N_WORKERS="${ALIZSYNC_WORKERS:-10}"
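
# Any of the ALIZSYNC_* variables above can be overridden per invocation, e.g.:
#   ALIZSYNC_POOL_SIZE=250G aliZsync init
#   ALIZSYNC_WORKERS=20 aliZsync sync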

DATASET_SW="$POOLNAME/sw"
DATASET_BINARIES="$DATASET_SW/slc7_x86-64"
DATASET_MODULES="$DATASET_SW/MODULES"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
SYNC_CMD="$SCRIPT_DIR/aliZsync-sync-zfs-snapshots.py"

sub_help(){
echo "Usage: $ProgName <subcommand> [options]\n"
echo "Subcommands:"
echo " init initialize ZFS-backed aliBuild directory"
echo " list list available tags on this machine"
echo " tag <tagname> create new tag with the given name (default: timestamp)"
echo " sync propagate all known tags to machines in inventory"
echo ""
echo "For help with each subcommand run:"
echo "$ProgName <subcommand> -h|--help"
echo ""
}

sync_host(){
HOST="$1"
$SYNC_CMD --transport "$TRANSPORT" --sshIdentity "$HOME/.ssh/id_rsa_aliZsync" "$DATASET_BINARIES" "ssh://root@$HOST:alizsw/sw/slc7_x86-64"
echo -e "\t\t$HOST\tbinaries synchronized"
$SYNC_CMD --transport "$TRANSPORT" --sshIdentity "$HOME/.ssh/id_rsa_aliZsync" "$DATASET_MODULES" "ssh://root@$HOST:alizsw/sw/MODULES"
echo -e "\t\t$HOST\tmodules synchronized"
}

sub_build() {
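# Example: `aliZsync build O2` builds the O2 package against the o2-dataflow
# defaults, reusing prebuilt tarballs from the CERN S3 remote store where
# available (standard aliBuild --remote-store behaviour).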
aliBuild build --defaults o2-dataflow "$1" --remote-store="https://s3.cern.ch/swift/v1/alibuild-repo"
}

sub_sync(){
echo "synchronizing tags"
source "$SCRIPT_DIR/aliZsync-job_pool.sh"

job_pool_init $N_WORKERS 0 # 0="no echo from commands"
while read -r LINE; do
[[ $LINE =~ ^#.* ]] && continue
[[ $LINE =~ ^\[.* ]] && continue

# We only get the first word from the inventory line
HOST=$(echo "$LINE" | awk '{ print $1}')
echo -e "\t$HOST"
job_pool_run sync_host "$HOST"
done < "$INVENTORY_FILE"
job_pool_shutdown
}

sub_list(){
zfs list -t snapshot -r "$DATASET_BINARIES"
}

sub_init(){
if [ "$(ls -A $PWD)" ]; then
echo "cannot initialize in non-empty directory"
exit 1
fi

SSH_KEYFILE="$HOME/.ssh/id_rsa_aliZsync"
if [ ! -f "$SSH_KEYFILE" ]; then
echo "creating ssh key file"
mkdir -p "$HOME/.ssh"
ssh-keygen -t rsa -b 4096 -C "aliZsync@$(uname -n)" -f "$SSH_KEYFILE" -q -N ""

echo -n "Enter password for user root on inventory machines:"
read -s password
echo

echo "setting up passwordless authentication"
while read -r LINE; do
[[ $LINE =~ ^#.* ]] && continue
[[ $LINE =~ ^\[.* ]] && continue

# We only get the first word from the inventory line
HOST=$(echo "$LINE" | awk '{ print $1 }')
echo -e "\t$HOST"
sshpass -p "$password" ssh-copy-id -i "$SSH_KEYFILE" "root@$HOST"
ssh -n -i "$SSH_KEYFILE" "root@$HOST" "firewall-cmd --zone=public --permanent --add-port=47099/tcp"
ssh -n -i "$SSH_KEYFILE" "root@$HOST" "firewall-cmd --reload"
done < "$INVENTORY_FILE"
fi

echo "creating sparse file at $IMG_FILE_PATH with size $POOLSIZE"
truncate -s "$POOLSIZE" "$IMG_FILE_PATH"
echo "creating ZFS pool"
sudo zpool create -m "$PWD" "$POOLNAME" "$IMG_FILE_PATH"
cd .

# no need to import after create: sudo zpool import -d $HOME -a

echo "creating datasets"
zfs create "$DATASET_SW"
echo -e "\t$DATASET_SW"
zfs create "$DATASET_BINARIES"
echo -e "\t$DATASET_BINARIES"
zfs create "$DATASET_MODULES"
echo -e "\t$DATASET_MODULES"
aliBuild init "$@"
}

sub_mount(){
IMG_DIR="$(dirname "$IMG_FILE_PATH")"
sudo zpool import -d "$IMG_DIR" -a
}

sub_tag(){
echo "refreshing modules directory"
alienv q > /dev/null

SNAPSHOT_INFIX=".zfs/snapshot/$TAG_NAME"
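# ZFS exposes every snapshot read-only under the hidden .zfs/snapshot/
# directory of its dataset, which is what makes a tag addressable by path.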
# The BASEDIR of every environment module must therefore point at the
# snapshot path as it will appear on the target machines, i.e. (with defaults):
#   /opt/alizsw/sw/slc7_x86-64/.zfs/snapshot/<tag_name>
echo "setting target root to $TARGET_ROOT"
sed -i "s|^setenv BASEDIR.*|setenv BASEDIR $TARGET_ROOT/sw/slc7_x86-64/$SNAPSHOT_INFIX|g" sw/MODULES/slc7_x86-64/BASE/1.0
echo "creating snapshots"
zfs snapshot "$DATASET_BINARIES@$TAG_NAME"
echo -e "\t$DATASET_BINARIES@$TAG_NAME"
zfs snapshot "$DATASET_MODULES@$TAG_NAME"
echo -e "\t$DATASET_MODULES@$TAG_NAME"
}

subcommand=$1
case $subcommand in
"" | "-h" | "--help")
sub_help
;;
*)
shift
sub_${subcommand} "$@"
if [ $? = 127 ]; then
echo "Error: '$subcommand' is not a known subcommand." >&2
echo " Run '$ProgName --help' for a list of known subcommands." >&2
exit 1
fi
;;
esac
206 changes: 206 additions & 0 deletions hacking/aliZsync-job_pool.sh
@@ -0,0 +1,206 @@
# Job pooling for bash shell scripts
# This script provides a job pooling functionality where you can keep up to n
# processes/functions running in parallel so that you don't saturate a system
# with concurrent processes.
#
# Inspired by http://stackoverflow.com/questions/6441509/how-to-write-a-process-pool-bash-shell
#
# Copyright (c) 2012 Vince Tse
# with changes by Geoff Clements (c) 2014
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
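#
# Minimal usage sketch (public functions defined below):
#   source aliZsync-job_pool.sh
#   job_pool_init 4 0                 # 4 workers, no command echo
#   job_pool_run my_function arg1 arg2
#   job_pool_run another_command
#   job_pool_shutdown                 # waits for all jobs, then cleans up
#   echo "jobs with non-zero exit: ${job_pool_nerrors}"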

# end-of-jobs marker
job_pool_end_of_jobs="JOBPOOL_END_OF_JOBS"

# job queue used to send jobs to the workers
job_pool_job_queue=/tmp/job_pool_job_queue_$$

# where to write the results log
job_pool_result_log=/tmp/job_pool_result_log_$$

# toggle command echoing
job_pool_echo_command=0

# number of parallel jobs allowed. also used to determine if job_pool_init
# has been called when jobs are queued.
job_pool_pool_size=-1

# \brief variable to check for number of non-zero exits
job_pool_nerrors=0

################################################################################
# private functions
################################################################################

# \brief debug output
function _job_pool_echo()
{
if [[ "${job_pool_echo_command}" == "1" ]]; then
echo "$@"
fi
}

# \brief cleans up
function _job_pool_cleanup()
{
rm -f ${job_pool_job_queue} ${job_pool_result_log}
}

# \brief signal handler
function _job_pool_exit_handler()
{
_job_pool_stop_workers
_job_pool_cleanup
}

# \brief print the exit codes for each command
# \param[in] result_log the file where the exit codes are written to
function _job_pool_print_result_log()
{
job_pool_nerrors=$(grep -c '^ERROR' "${job_pool_result_log}")
sed -e 's/^ERROR//' "${job_pool_result_log}"
}

# \brief the worker function that is called when we fork off worker processes
# \param[in] id the worker ID
# \param[in] job_queue the fifo to read jobs from
# \param[in] result_log the temporary log file to write exit codes to
function _job_pool_worker()
{
local id=$1
local job_queue=$2
local result_log=$3
local cmd=
local args=

exec 7<> ${job_queue}
while [[ "${cmd}" != "${job_pool_end_of_jobs}" && -e "${job_queue}" ]]; do
# workers block on the exclusive lock to read the job queue
flock --exclusive 7
IFS=$'\v'
read cmd args <${job_queue}
set -- ${args}
unset IFS
flock --unlock 7
# the worker should exit if it sees the end-of-job marker or run the
# job otherwise and save its exit code to the result log.
if [[ "${cmd}" == "${job_pool_end_of_jobs}" ]]; then
# write it one more time for the next sibling so that everyone
# will know we are exiting.
echo "${cmd}" >&7
else
_job_pool_echo "### _job_pool_worker-${id}: ${cmd}"
# run the job
{ ${cmd} "$@" ; }
# now check the exit code and prepend "ERROR" to the result log entry
# which we will use to count errors and then strip out later.
local result=$?
local status=
if [[ "${result}" != "0" ]]; then
status=ERROR
fi
# now write the error to the log, making sure multiple processes
# don't trample over each other.
exec 8<> ${result_log}
flock --exclusive 8
_job_pool_echo "${status}job_pool: exited ${result}: ${cmd} $@" >> ${result_log}
flock --unlock 8
exec 8>&-
_job_pool_echo "### _job_pool_worker-${id}: exited ${result}: ${cmd} $@"
fi
done
exec 7>&-
}

# \brief sends message to worker processes to stop
function _job_pool_stop_workers()
{
# send message to workers to exit, and wait for them to stop before
# doing cleanup.
echo ${job_pool_end_of_jobs} >> ${job_pool_job_queue}
wait
}

# \brief fork off the workers
# \param[in] job_queue the fifo used to send jobs to the workers
# \param[in] result_log the temporary log file to write exit codes to
function _job_pool_start_workers()
{
local job_queue=$1
local result_log=$2
for ((i=0; i<${job_pool_pool_size}; i++)); do
_job_pool_worker ${i} ${job_queue} ${result_log} &
done
}

################################################################################
# public functions
################################################################################

# \brief initializes the job pool
# \param[in] pool_size number of parallel jobs allowed
# \param[in] echo_command 1 to turn on echo, 0 to turn off
function job_pool_init()
{
local pool_size=$1
local echo_command=$2

# set the global attributes
job_pool_pool_size=${pool_size:=1}
job_pool_echo_command=${echo_command:=0}

# create the fifo job queue and create the exit code log
rm -rf ${job_pool_job_queue} ${job_pool_result_log}
mkfifo ${job_pool_job_queue}
touch ${job_pool_result_log}
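# The fifo acts as the shared work queue: workers block reading it until a
# producer writes a job line, and flock serializes access among the readers.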

# fork off the workers
_job_pool_start_workers ${job_pool_job_queue} ${job_pool_result_log}
}

# \brief waits for all queued up jobs to complete and shuts down the job pool
function job_pool_shutdown()
{
_job_pool_stop_workers
_job_pool_print_result_log
_job_pool_cleanup
}

# \brief run a job in the job pool
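# Arguments are written to the fifo joined by vertical-tab (\v) separators,
# matching the IFS=$'\v' read in _job_pool_worker, so arguments containing
# spaces survive the trip through the queue.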
function job_pool_run()
{
if [[ "${job_pool_pool_size}" == "-1" ]]; then
job_pool_init
fi
printf "%s\v" "$@" >> ${job_pool_job_queue}
echo >> ${job_pool_job_queue}
}

# \brief waits for all queued up jobs to complete before starting new jobs
# This function actually fakes a wait by telling the workers to exit
# when done with the jobs and then restarting them.
function job_pool_wait()
{
_job_pool_stop_workers
_job_pool_start_workers ${job_pool_job_queue} ${job_pool_result_log}
}
#########################################
# End of Job Pool
#########################################