diff --git a/http/src/main/resources/init-resources/azure_vm_init_script.sh b/http/src/main/resources/init-resources/azure_vm_init_script.sh
index 709cfb17c2a..a64bead972a 100644
--- a/http/src/main/resources/init-resources/azure_vm_init_script.sh
+++ b/http/src/main/resources/init-resources/azure_vm_init_script.sh
@@ -8,93 +8,16 @@ set -e
 # 'debconf: unable to initialize frontend: Dialog'
 export DEBIAN_FRONTEND=noninteractive

-#create user to run jupyter
-VM_JUP_USER=jupyter
+##### JUPYTER USER SETUP #####
+# Create the jupyter user that corresponds to the jupyter user in the jupyter container
+VM_JUP_USER=jupyter-user
+VM_JUP_USER_UID=1002

-sudo useradd -m -c "Jupyter User" $VM_JUP_USER
+sudo useradd -m -c "Jupyter User" -u $VM_JUP_USER_UID $VM_JUP_USER
 sudo usermod -a -G $VM_JUP_USER,adm,dialout,cdrom,floppy,audio,dip,video,plugdev,lxd,netdev $VM_JUP_USER

-## Change ownership for the new user
-
-sudo chgrp $VM_JUP_USER /anaconda/bin/*
-
-sudo chown $VM_JUP_USER /anaconda/bin/*
-
-sudo chgrp $VM_JUP_USER /anaconda/envs/py38_default/bin/*
-
-sudo chown $VM_JUP_USER /anaconda/envs/py38_default/bin/*
-
-sudo systemctl disable --now jupyterhub.service
-
-
-# Formatting and mounting persistent disk
-WORK_DIRECTORY="/home/$VM_JUP_USER/persistent_disk"
-## Create the PD working directory
-mkdir -p ${WORK_DIRECTORY}
-
-## The PD should be the only `sd` disk that is not mounted yet
-AllsdDisks=($(lsblk --nodeps --noheadings --output NAME --paths | grep -i "sd"))
-FreesdDisks=()
-for Disk in "${AllsdDisks[@]}"; do
-    Mounts="$(lsblk -no MOUNTPOINT "${Disk}")"
-    if [ -z "$Mounts" ]; then
-        echo "Found our unmounted persistent disk!"
-        FreesdDisks="${Disk}"
-    else
-        echo "Not our persistent disk!"
-    fi
-done
-DISK_DEVICE_PATH=${FreesdDisks}
-
-## Only format disk is it hasn't already been formatted
-## It the disk has previously been in use, then it should have a partition that we can mount
-EXIT_CODE=0
-lsblk -no NAME --paths "${DISK_DEVICE_PATH}1" || EXIT_CODE=$?
-if [ $EXIT_CODE -eq 0 ]; then
-    ## From https://learn.microsoft.com/en-us/azure/virtual-machines/linux/attach-disk-portal?tabs=ubuntu
-    ## Use the partprobe utility to make sure the kernel is aware of the new partition and filesystem.
-    ## Failure to use partprobe can cause the blkid or lslbk commands to not return the UUID for the new filesystem immediately.
-    sudo partprobe "${DISK_DEVICE_PATH}1"
-    # There is a pre-existing partition that we should try to directly mount
-    sudo mount -t ext4 "${DISK_DEVICE_PATH}1" ${WORK_DIRECTORY}
-    echo "Existing PD successfully remounted"
-else
-    ## Create one partition on the PD
-    (
-    echo o #create a new empty DOS partition table
-    echo n #add a new partition
-    echo p #print the partition table
-    echo
-    echo
-    echo
-    echo w #write table to disk and exit
-    ) | sudo fdisk ${DISK_DEVICE_PATH}
-    echo "successful partitioning"
-    ## Format the partition
-    # It's likely that the persistent disk was previously mounted on another VM and wasn't properly unmounted
-    # Passing -F -F to mkfs ext4 forces the tool to ignore the state of the partition.
-    # Note that there should be two instances command-line switch (-F -F) to override this check
-    echo y | sudo mkfs.ext4 "${DISK_DEVICE_PATH}1" -F -F
-    echo "successful formatting"
-    ## From https://learn.microsoft.com/en-us/azure/virtual-machines/linux/attach-disk-portal?tabs=ubuntu
-    ## Use the partprobe utility to make sure the kernel is aware of the new partition and filesystem.
-    ## Failure to use partprobe can cause the blkid or lslbk commands to not return the UUID for the new filesystem immediately.
-    sudo partprobe "${DISK_DEVICE_PATH}1"
-    ## Mount the PD partition to the working directory
-    sudo mount -t ext4 "${DISK_DEVICE_PATH}1" ${WORK_DIRECTORY}
-    echo "successful mount"
-fi
-
-## Add the PD UUID to fstab to ensure that the drive is remounted automatically after a reboot
-OUTPUT="$(lsblk -no UUID --paths "${DISK_DEVICE_PATH}1")"
-echo "UUID="$OUTPUT" ${WORK_DIRECTORY} ext4 defaults 0 1" | sudo tee -a /etc/fstab
-echo "successful write of PD UUID to fstab"
-
-## Change ownership of the mounted drive to the user
-sudo chown -R $VM_JUP_USER:$VM_JUP_USER ${WORK_DIRECTORY}
-
-
-# Read script arguments
+##### READ SCRIPT ARGUMENTS #####
+# These are passed in setupCreateVmCreateMessage in the AzurePubsub Handler
 echo $# arguments
 if [ $# -ne 13 ]; then
     echo "illegal number of parameters"
@@ -119,14 +42,23 @@ WELDER_STAGING_BUCKET="${14:-dummy}"
 WELDER_STAGING_STORAGE_CONTAINER_RESOURCE_ID="${15:-dummy}"

 # Envs for Jupyter
+JUPYTER_DOCKER_IMAGE="terradevacrpublic.azurecr.io/jupyter-server:test"
+# NOTEBOOKS_DIR corresponds to the location INSIDE the jupyter docker container,
+# and is not to be used within the context of the DSVM itself
+NOTEBOOKS_DIR="/home/$VM_JUP_USER/persistent_disk"
 WORKSPACE_NAME="${16:-dummy}"
 WORKSPACE_STORAGE_CONTAINER_URL="${17:-dummy}"

 # Jupyter variables for listener
 SERVER_APP_BASE_URL="/${RELAY_CONNECTION_NAME}/"
 SERVER_APP_ALLOW_ORIGIN="*"
-HCVAR='\$hc'
+# We need to escape this $ character twice, once for the docker exec arg, and another time for passing it to run-jupyter.sh
+HCVAR='\\\$hc'
 SERVER_APP_WEBSOCKET_URL="wss://${RELAY_NAME}.servicebus.windows.net/${HCVAR}/${RELAY_CONNECTION_NAME}"
+# We need to escape this $ character one extra time to pass it to the crontab for rebooting. The use of $hc in the websocket URL is
+# something that we should rethink as it creates a lot of complexity downstream
+REBOOT_HCVAR='\\\\\\\$hc'
+REBOOT_SERVER_APP_WEBSOCKET_URL="wss://${RELAY_NAME}.servicebus.windows.net/${REBOOT_HCVAR}/${RELAY_CONNECTION_NAME}"
 SERVER_APP_WEBSOCKET_HOST="${RELAY_NAME}.servicebus.windows.net"

 # Relay listener configuration
@@ -168,57 +100,115 @@ echo "RUNTIME_NAME = ${RUNTIME_NAME}"
 echo "VALID_HOSTS = ${VALID_HOSTS}"
 echo "R-VERSION = ${R_VERSION}"

-# Wait for lock to resolve before any installs, to resolve this error: https://broadworkbench.atlassian.net/browse/IA-4645
-
-while sudo fuser /var/lib/dpkg/lock-frontend > /dev/null 2>&1
-  do
-    echo "Waiting to get lock /var/lib/dpkg/lock-frontend..."
-    sleep 5
-  done
-
-# Install updated R version
-echo "Installing R version ${R_VERSION}"
-# Add the CRAN repository to the sources list
-echo "deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/" | sudo tee /etc/apt/sources.list -a
-# Update package list
-sudo apt-get update
-# Install new R version
-sudo apt-get install --no-install-recommends -y r-base=${R_VERSION}
-
-#Update kernel list
-
-echo "Y"| /anaconda/bin/jupyter kernelspec remove sparkkernel
-
-echo "Y"| /anaconda/bin/jupyter kernelspec remove sparkrkernel
-
-echo "Y"| /anaconda/bin/jupyter kernelspec remove pysparkkernel
-
-echo "Y"| /anaconda/bin/jupyter kernelspec remove spark-3-python
+##### Persistent Disk (PD) MOUNTING #####
+# Formatting and mounting persistent disk
+# Note that we cannot mount in /mnt/disks/work as it is a temporary disk on the DSVM!
+PD_DIRECTORY="/home/$VM_JUP_USER/persistent_disk"
+## Create the working and persistent disk directories
+mkdir -p ${PD_DIRECTORY}

-#echo "Y"| /anaconda/bin/jupyter kernelspec remove julia-1.6
+## The PD should be the only `sd` disk that is not mounted yet
+AllsdDisks=($(lsblk --nodeps --noheadings --output NAME --paths | grep -i "sd"))
+FreesdDisks=()
+for Disk in "${AllsdDisks[@]}"; do
+    Mounts="$(lsblk -no MOUNTPOINT "${Disk}")"
+    if [ -z "$Mounts" ]; then
+        echo "Found our unmounted persistent disk!"
+        FreesdDisks="${Disk}"
+    else
+        echo "Not our persistent disk!"
+    fi
+done
+DISK_DEVICE_PATH=${FreesdDisks}

-echo "Y"| /anaconda/envs/py38_default/bin/pip3 install ipykernel pydevd
+## Only format disk if it hasn't already been formatted
+## If the disk has previously been in use, then it should have a partition that we can mount
+EXIT_CODE=0
+lsblk -no NAME --paths "${DISK_DEVICE_PATH}1" || EXIT_CODE=$?
+if [ $EXIT_CODE -eq 0 ]; then
+    ## From https://learn.microsoft.com/en-us/azure/virtual-machines/linux/attach-disk-portal?tabs=ubuntu
+    ## Use the partprobe utility to make sure the kernel is aware of the new partition and filesystem.
+    ## Failure to use partprobe can cause the blkid or lsblk commands to not return the UUID for the new filesystem immediately.
+    sudo partprobe "${DISK_DEVICE_PATH}1"
+    # There is a pre-existing partition that we should try to directly mount
+    sudo mount -t ext4 "${DISK_DEVICE_PATH}1" ${PD_DIRECTORY}
+    echo "Existing PD successfully remounted"
+else
+    ## Create one partition on the PD
+    (
+    echo o #create a new empty DOS partition table
+    echo n #add a new partition
+    echo p #print the partition table
+    echo
+    echo
+    echo
+    echo w #write table to disk and exit
+    ) | sudo fdisk ${DISK_DEVICE_PATH}
+    echo "successful partitioning"
+    ## Format the partition
+    echo y | sudo mkfs -t ext4 "${DISK_DEVICE_PATH}1"
+    echo "successful formatting"
+    ## From https://learn.microsoft.com/en-us/azure/virtual-machines/linux/attach-disk-portal?tabs=ubuntu
+    ## Use the partprobe utility to make sure the kernel is aware of the new partition and filesystem.
+    ## Failure to use partprobe can cause the blkid or lsblk commands to not return the UUID for the new filesystem immediately.
+    sudo partprobe "${DISK_DEVICE_PATH}1"
+    ## Mount the PD partition to the working directory
+    sudo mount -t ext4 "${DISK_DEVICE_PATH}1" ${PD_DIRECTORY}
+    echo "successful mount"
+fi

-echo "Y"| /anaconda/envs/py38_default/bin/python3 -m ipykernel install
+## Add the PD UUID to fstab to ensure that the drive is remounted automatically after a reboot
+OUTPUT="$(lsblk -no UUID --paths "${DISK_DEVICE_PATH}1")"
+echo "UUID="$OUTPUT" ${PD_DIRECTORY} ext4 defaults 0 1" | sudo tee -a /etc/fstab
+echo "successful write of PD UUID to fstab"

-# Start Jupyter server with custom parameters
-sudo runuser -l $VM_JUP_USER -c "mkdir -p /home/$VM_JUP_USER/.jupyter"
-sudo runuser -l $VM_JUP_USER -c "wget -qP /home/$VM_JUP_USER/.jupyter https://raw.githubusercontent.com/DataBiosphere/leonardo/ea519ef899de28e27e2a37ba368433da9fd03b7f/http/src/main/resources/init-resources/jupyter_server_config.py"
-# We pull the jupyter_delocalize.py file from the base terra-docker python image, but it was designed for notebooks and we need to make a couple of changes to make it work with server instead
-sudo runuser -l $VM_JUP_USER -c "wget -qP /anaconda/lib/python3.10/site-packages https://raw.githubusercontent.com/DataBiosphere/terra-docker/0ea6d2ebd7fcae7072e01e1c2f2d178390a276b0/terra-jupyter-base/custom/jupyter_delocalize.py"
-sudo runuser -l $VM_JUP_USER -c "sed -i 's/notebook.services/jupyter_server.services/g' /anaconda/lib/python3.10/site-packages/jupyter_delocalize.py"
-sudo runuser -l $VM_JUP_USER -c "sed -i 's/http:\/\/welder:8080/http:\/\/127.0.0.1:8081/g' /anaconda/lib/python3.10/site-packages/jupyter_delocalize.py"
+## Make sure that both the jupyter and welder users have access to the persistent disk on the VM
+## This needs to happen before we start up containers
+sudo chmod a+rwx ${PD_DIRECTORY}

-echo "------ Jupyter ------"
+##### JUPYTER SERVER #####
+echo "------ Jupyter version: ${JUPYTER_DOCKER_IMAGE} ------"
 echo "Starting Jupyter with command..."
-echo "sudo runuser -l $VM_JUP_USER -c \"/anaconda/bin/jupyter server --ServerApp.base_url=$SERVER_APP_BASE_URL --ServerApp.websocket_url=$SERVER_APP_WEBSOCKET_URL --ServerApp.contents_manager_class=jupyter_delocalize.WelderContentsManager --autoreload &> /home/$VM_JUP_USER/jupyter.log\"" >/dev/null 2>&1&
+echo "docker run -d --gpus all --restart always --network host --name jupyter \
+--entrypoint tail \
+--volume ${PD_DIRECTORY}:${NOTEBOOKS_DIR} \
+-e CLOUD_PROVIDER=Azure \
+-e WORKSPACE_ID=$WORKSPACE_ID \
+-e WORKSPACE_NAME=$WORKSPACE_NAME \
+-e WORKSPACE_STORAGE_CONTAINER_URL=$WORKSPACE_STORAGE_CONTAINER_URL \
+-e STORAGE_CONTAINER_RESOURCE_ID=$WORKSPACE_STORAGE_CONTAINER_ID \
+$JUPYTER_DOCKER_IMAGE \
+-f /dev/null"
+
+# Run docker container with Jupyter Server
+# Override entrypoint with a placeholder (tail -f /dev/null) to keep the container running indefinitely.
+# The jupyter server itself will be started via docker exec afterwards.
+# Mount the persistent disk directory to the jupyter notebook home directory
+docker run -d --gpus all --restart always --network host --name jupyter \
+--entrypoint tail \
+--volume ${PD_DIRECTORY}:${NOTEBOOKS_DIR} \
+--env CLOUD_PROVIDER=Azure \
+--env WORKSPACE_ID=$WORKSPACE_ID \
+--env WORKSPACE_NAME=$WORKSPACE_NAME \
+--env WORKSPACE_STORAGE_CONTAINER_URL=$WORKSPACE_STORAGE_CONTAINER_URL \
+--env STORAGE_CONTAINER_RESOURCE_ID=$WORKSPACE_STORAGE_CONTAINER_ID \
+$JUPYTER_DOCKER_IMAGE \
+-f /dev/null
+
+echo 'Starting Jupyter Notebook...'
+echo "docker exec -d jupyter /bin/bash -c '/usr/jupytervenv/run-jupyter.sh ${SERVER_APP_BASE_URL} ${SERVER_APP_WEBSOCKET_URL} ${NOTEBOOKS_DIR}'" +docker exec -d jupyter /bin/bash -c "/usr/jupytervenv/run-jupyter.sh ${SERVER_APP_BASE_URL} ${SERVER_APP_WEBSOCKET_URL} ${NOTEBOOKS_DIR}" -sudo runuser -l $VM_JUP_USER -c "/anaconda/bin/jupyter server --ServerApp.base_url=$SERVER_APP_BASE_URL --ServerApp.websocket_url=$SERVER_APP_WEBSOCKET_URL --ServerApp.contents_manager_class=jupyter_delocalize.WelderContentsManager --autoreload &> /home/$VM_JUP_USER/jupyter.log" >/dev/null 2>&1& +# Store Jupyter Server Docker exec command for reboot processes +# Cron does not play well with escaping backlashes so it is safer to run a script instead of the docker command directly +echo "docker exec -d jupyter /bin/bash -c '/usr/jupytervenv/run-jupyter.sh ${SERVER_APP_BASE_URL} ${REBOOT_SERVER_APP_WEBSOCKET_URL} ${NOTEBOOKS_DIR}'" | sudo tee /home/reboot_script.sh +sudo chmod +x /home/reboot_script.sh +sudo crontab -l 2>/dev/null| cat - <(echo "@reboot /home/reboot_script.sh") | crontab - -# Store Jupyter Server parameters for reboot processes -sudo crontab -l 2>/dev/null| cat - <(echo "@reboot sudo runuser -l $VM_JUP_USER -c '/anaconda/bin/jupyter server --ServerApp.base_url=$SERVER_APP_BASE_URL --ServerApp.websocket_url=$SERVER_APP_WEBSOCKET_URL --ServerApp.contents_manager_class=jupyter_delocalize.WelderContentsManager --autoreload &> /home/$VM_JUP_USER/jupyter.log' >/dev/null 2>&1&") | crontab - +echo "------ Jupyter done ------" +##### LISTENER ##### echo "------ Listener version: ${LISTENER_DOCKER_IMAGE} ------" echo " Starting listener with command..." @@ -265,11 +255,12 @@ $LISTENER_DOCKER_IMAGE echo "------ Listener done ------" +##### WELDER ##### echo "------ Welder version: ${WELDER_WELDER_DOCKER_IMAGE} ------" echo " Starting Welder with command...." echo "docker run -d --restart always --network host --name welder \ - --volume \"/home/${VM_JUP_USER}\":\"/work\" \ + --volume "${PD_DIRECTORY}:/work" \ -e WSM_URL=$WELDER_WSM_URL \ -e PORT=8081 \ -e WORKSPACE_ID=$WORKSPACE_ID \ @@ -283,7 +274,7 @@ echo "docker run -d --restart always --network host --name welder \ $WELDER_WELDER_DOCKER_IMAGE" docker run -d --restart always --network host --name welder \ ---volume "/home/${VM_JUP_USER}":"/work" \ +--volume "${PD_DIRECTORY}:/work" \ --env WSM_URL=$WELDER_WSM_URL \ --env PORT=8081 \ --env WORKSPACE_ID=$WORKSPACE_ID \ @@ -296,19 +287,4 @@ docker run -d --restart always --network host --name welder \ --env SHOULD_BACKGROUND_SYNC="false" \ $WELDER_WELDER_DOCKER_IMAGE -echo "------ Welder done ------" - -# This next command creates a json file which contains the "env" variables to be added to the kernel.json files. 
-jq --null-input \
---arg workspace_id "${WORKSPACE_ID}" \
---arg workspace_storage_container_id "${WORKSPACE_STORAGE_CONTAINER_ID}" \
---arg workspace_name "${WORKSPACE_NAME}" \
---arg workspace_storage_container_url "${WORKSPACE_STORAGE_CONTAINER_URL}" \
-'{ "env": { "WORKSPACE_ID": $workspace_id, "WORKSPACE_STORAGE_CONTAINER_ID": $workspace_storage_container_id, "WORKSPACE_NAME": $workspace_name, "WORKSPACE_STORAGE_CONTAINER_URL": $workspace_storage_container_url }}' \
-> wsenv.json
-
-# This next commands iterate through the available kernels, and uses jq to include the env variables from the previous step
-/anaconda/bin/jupyter kernelspec list | awk 'NR>1 {print $2}' | while read line; do jq -s add $line"/kernel.json" wsenv.json > tmpkernel.json && mv tmpkernel.json $line"/kernel.json"; done
-/anaconda/envs/py38_default/bin/jupyter kernelspec list | awk 'NR>1 {print $2}' | while read line; do jq -s add $line"/kernel.json" wsenv.json > tmpkernel.json && mv tmpkernel.json $line"/kernel.json"; done
-/anaconda/envs/azureml_py38/bin/jupyter kernelspec list | awk 'NR>1 {print $2}' | while read line; do jq -s add $line"/kernel.json" wsenv.json > tmpkernel.json && mv tmpkernel.json $line"/kernel.json"; done
-/anaconda/envs/azureml_py38_PT_and_TF/bin/jupyter kernelspec list | awk 'NR>1 {print $2}' | while read line; do jq -s add $line"/kernel.json" wsenv.json > tmpkernel.json && mv tmpkernel.json $line"/kernel.json"; done
+echo "------ Welder done ------"
\ No newline at end of file
diff --git a/http/src/main/resources/reference.conf b/http/src/main/resources/reference.conf
index 3df2ecdebdc..839b1b3f582 100644
--- a/http/src/main/resources/reference.conf
+++ b/http/src/main/resources/reference.conf
@@ -251,7 +251,7 @@ azure {
     type = "CustomScript",
     version = "2.1",
     minor-version-auto-upgrade = true,
-    file-uris = ["https://raw.githubusercontent.com/DataBiosphere/leonardo/788e53e22dab4f0cee6e7b7cdbfd271a0b43450d/http/src/main/resources/init-resources/azure_vm_init_script.sh"]
+    file-uris = ["https://raw.githubusercontent.com/DataBiosphere/leonardo/45e370d6475106eb63242f556ab4310a78d03653/http/src/main/resources/init-resources/azure_vm_init_script.sh"]
   }
   listener-image = "terradevacrpublic.azurecr.io/terra-azure-relay-listeners:76d982c"
 }
diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/dao/WsmCodecSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/dao/WsmCodecSpec.scala
index 591b61ddb58..3b5a07a93dd 100644
--- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/dao/WsmCodecSpec.scala
+++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/dao/WsmCodecSpec.scala
@@ -108,7 +108,7 @@ class WsmCodecSpec extends AnyFlatSpec with Matchers {
         | "minorVersionAutoUpgrade": true,
         | "protectedSettings": [{
         | "key": "fileUris",
-        | "value": ["https://raw.githubusercontent.com/DataBiosphere/leonardo/788e53e22dab4f0cee6e7b7cdbfd271a0b43450d/http/src/main/resources/init-resources/azure_vm_init_script.sh"]
+        | "value": ["https://raw.githubusercontent.com/DataBiosphere/leonardo/45e370d6475106eb63242f556ab4310a78d03653/http/src/main/resources/init-resources/azure_vm_init_script.sh"]
         | },
         | {
         | "key": "commandToExecute",
diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/http/ConfigReaderSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/http/ConfigReaderSpec.scala
index 1218baa91f0..023b74072ba 100644
--- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/http/ConfigReaderSpec.scala
+++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/http/ConfigReaderSpec.scala
@@ -73,7 +73,7 @@ class ConfigReaderSpec extends AnyFlatSpec with Matchers {
           "2.1",
           true,
           List(
-            "https://raw.githubusercontent.com/DataBiosphere/leonardo/788e53e22dab4f0cee6e7b7cdbfd271a0b43450d/http/src/main/resources/init-resources/azure_vm_init_script.sh"
+            "https://raw.githubusercontent.com/DataBiosphere/leonardo/45e370d6475106eb63242f556ab4310a78d03653/http/src/main/resources/init-resources/azure_vm_init_script.sh"
           )
         ),
         "terradevacrpublic.azurecr.io/terra-azure-relay-listeners:76d982c",