-
Notifications
You must be signed in to change notification settings - Fork 19
Closed
Labels
documentation: Improvements or additions to documentation
Description
Introduction
The following steps are needed to configure the cluster:
- Python package setup - requirements.txt to be provided to build image
- Register runner service files
- Add application
- Start up Scaler worker
Python package setup
This should be done by the user; it is assumed that the user knows how to build Docker/Podman images and install all required Python packages.
Register runner service files
A service is a package of code that needs to be sent to IBM Spectrum Symphony.
Below is the content of pickle_runner.py:
import array

import cloudpickle
import soamapi
class Message(soamapi.Message):
    """Symphony message that carries a single opaque binary payload."""

    def __init__(self, payload: bytes = b""):
        """Create a message holding ``payload`` (empty by default)."""
        self.__payload = payload

    def set_payload(self, payload: bytes):
        """Replace the stored payload with ``payload``."""
        self.__payload = payload

    def get_payload(self) -> bytes:
        """Return the currently stored payload."""
        return self.__payload

    def on_serialize(self, stream):
        """Write the payload to ``stream`` as an array of signed chars."""
        # The stream API is handed an ``array.array`` of type code "b"
        # together with an explicit start offset and length.
        signed_chars = array.array("b", self.get_payload())
        stream.write_byte_array(signed_chars, 0, len(signed_chars))

    def on_deserialize(self, stream):
        """Read a byte array from ``stream`` and store it as the payload."""
        # NOTE(review): whatever ``read_byte_array`` returns is stored as-is;
        # presumably an ``array.array`` rather than ``bytes`` - confirm.
        received = stream.read_byte_array("b")
        self.set_payload(received)
class ServiceContainer(soamapi.ServiceContainer):
    """Service container that runs cloudpickle-encoded function calls.

    Every task input is expected to be a cloudpickle payload that unpickles
    to an iterable whose first item is a callable and whose remaining items
    are its positional arguments. The callable's return value is pickled with
    cloudpickle and set as the task output.

    Requires ``cloudpickle`` to be imported at module level and installed in
    the interpreter that runs this script.
    """

    def on_create_service(self, service_context):
        """Lifecycle hook; this service needs no service-level setup."""

    def on_session_enter(self, session_context):
        """Lifecycle hook; this service keeps no per-session state."""

    def on_invoke(self, task_context):
        """Run one task: unpickle ``(fn, *args)``, call it, return the result.

        Args:
            task_context: Symphony task context used to read the task input
                and to publish the task output.

        Any exception raised while unpickling, calling ``fn`` or pickling the
        result is not caught here and propagates to the caller.
        """
        input_message = Message()
        task_context.populate_task_input(input_message)

        # SECURITY: unpickling executes arbitrary code chosen by whoever
        # produced the payload. Only deploy this service where every client
        # allowed to submit tasks to the application is trusted.
        fn, *args = cloudpickle.loads(input_message.get_payload())

        output_payload = cloudpickle.dumps(fn(*args))
        task_context.set_task_output(Message(output_payload))

    def on_session_leave(self):
        """Lifecycle hook; there is no per-session state to release."""

    def on_destroy_service(self):
        """Lifecycle hook; there is no service-level state to release."""
container = ServiceContainer()
container.run()
Package the code and deploy it as a service:
tar -zcvf PickleRunnerService.tar.gz pickle_runner.py
soamdeploy add PickleRunnerService -p PickleRunnerService.tar.gz -c /PickleRunner -f
You should be able to see the service now:
soamdeploy view -c /PickleRunner
Add the application
Create the application definition, changing the startCmd attribute to point to the Python interpreter of a venv of your choice.
PickleRunner.xml
<?xml version="1.0" encoding="UTF-8" standalone="no" ?><Profile xmlns="http://www.platform.com/Symphony/Profile/Application" version="7.3.2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<Consumer applicationName="PickleRunner" consumerId="/PickleRunner" numOfSlotsForPreloadedServices="1" preStartApplication="false" resReq="" resourceGroupName="ComputeHosts" taskHighWaterMark="1.0" taskLowWaterMark="1.0"/>
<SOAM version="7.3.2">
<SSM resReq="" shutDownTimeout="300" startUpTimeout="60" workDir="${EGO_SHARED_TOP}/soam/work">
<boundaryManagerConfig>
<boundaries>
<boundary elementName="AvailableMemory">
<event name="BEV_PROACTIVE" value="50"/>
<event name="BEV_SEVERE" value="40"/>
<event name="BEV_CRITICAL" value="0"/>
<event name="BEV_HALT" value="0"/>
</boundary>
<boundary elementName="AvailableVirtualAddressSpace">
<event name="BEV_PROACTIVE" value="50"/>
<event name="BEV_SEVERE" value="40"/>
<event name="BEV_CRITICAL" value="25"/>
<event name="BEV_HALT" value="15"/>
</boundary>
</boundaries>
</boundaryManagerConfig>
</SSM>
<SIM blockHostOnTimeout="true" blockHostOnVersionMismatch="true" startUpTimeout="120"/>
<DataHistory fileSwitchSize="100" lastingPeriod="96"/>
<PagingTasksInput blockSize="4096" diskSpace="4294967296"/>
<PagingTasksOutput blockSize="4096" diskSpace="4294967296"/>
<PagingCommonData blockSize="102400" diskSpace="8589934592"/>
<PagingCommonDataUpdates blockSize="102400" diskSpace="8589934592"/>
</SOAM>
<SessionTypes>
<Type abortSessionIfClientDisconnect="true" abortSessionIfTaskFail="false" name="RecoverableAllHistoricalData" persistTaskHistory="all" priority="1" recoverable="true" sessionRetryLimit="3" suspendGracePeriod="100" taskCleanupPeriod="100" taskRetryLimit="1"/>
<Type abortSessionIfClientDisconnect="true" abortSessionIfTaskFail="false" name="RecoverableNoHistoricalData" persistTaskHistory="none" priority="1" recoverable="true" sessionRetryLimit="3" suspendGracePeriod="100" taskCleanupPeriod="100" taskRetryLimit="1"/>
<Type abortSessionIfClientDisconnect="true" abortSessionIfTaskFail="false" name="UnrecoverableAllHistoricalData" persistTaskHistory="all" priority="1" recoverable="false" sessionRetryLimit="3" suspendGracePeriod="100" taskCleanupPeriod="100" taskRetryLimit="1"/>
<Type abortSessionIfClientDisconnect="true" abortSessionIfTaskFail="false" name="UnrecoverableNoHistoricalData" persistTaskHistory="none" priority="1" recoverable="false" sessionRetryLimit="3" suspendGracePeriod="100" taskCleanupPeriod="100" taskRetryLimit="1"/>
</SessionTypes>
<Service description="Pickle Runner Service" name="PickleRunnerService" packageName="PickleRunnerService">
<osTypes>
<osType name="all" startCmd="/opt/ibm/spectrumcomputing/symphonyde/de732/7.3.2/samples/Python/PickleRunner/venv/bin/python3.8 ${SOAM_DEPLOY_DIR}/pickle_runner.py" workDir="${SOAM_HOME}/work">
<env name="LD_LIBRARY_PATH">${SOAM_HOME}/${VERSION_NUM}/${EGO_MACHINE_TYPE}/lib64</env>
<env name="PYTHONPATH">${SOAM_DEPLOY_DIR}:${SOAM_HOME}/${VERSION_NUM}/${EGO_MACHINE_TYPE}/lib64/pythonapi_3.8.0:${SOAM_HOME}/${VERSION_NUM}/${EGO_MACHINE_TYPE}/lib64</env>
</osType>
</osTypes>
<Control>
<Method name="Register">
<Timeout actionOnSI="blockHost" duration="60"/>
<Exit actionOnSI="blockHost"/>
</Method>
<Method name="CreateService">
<Timeout actionOnSI="blockHost" duration="0"/>
<Exit actionOnSI="blockHost"/>
<Return actionOnSI="keepAlive" controlCode="0"/>
<Exception actionOnSI="blockHost" controlCode="0" type="failure"/>
<Exception actionOnSI="blockHost" controlCode="0" type="fatal"/>
</Method>
<Method name="SessionEnter">
<Timeout actionOnSI="blockHost" actionOnWorkload="retry" duration="0"/>
<Exit actionOnSI="blockHost" actionOnWorkload="retry"/>
<Return actionOnSI="keepAlive" actionOnWorkload="succeed" controlCode="0"/>
<Exception actionOnSI="keepAlive" actionOnWorkload="retry" controlCode="0" type="failure"/>
<Exception actionOnSI="keepAlive" actionOnWorkload="fail" controlCode="0" type="fatal"/>
</Method>
<Method name="SessionUpdate">
<Timeout actionOnSI="blockHost" actionOnWorkload="retry" duration="0"/>
<Exit actionOnSI="blockHost" actionOnWorkload="retry"/>
<Return actionOnSI="keepAlive" actionOnWorkload="succeed" controlCode="0"/>
<Exception actionOnSI="keepAlive" actionOnWorkload="retry" controlCode="0" type="failure"/>
<Exception actionOnSI="keepAlive" actionOnWorkload="fail" controlCode="0" type="fatal"/>
</Method>
<Method name="Invoke">
<Timeout actionOnSI="restartService" actionOnWorkload="retry" duration="0"/>
<Exit actionOnSI="restartService" actionOnWorkload="retry"/>
<Return actionOnSI="keepAlive" actionOnWorkload="succeed" controlCode="0"/>
<Return actionOnSI="keepAlive" actionOnWorkload="fail" controlCode="5"/>
<Exception actionOnSI="keepAlive" actionOnWorkload="retry" controlCode="0" type="failure"/>
<Exception actionOnSI="keepAlive" actionOnWorkload="fail" controlCode="0" type="fatal"/>
</Method>
<Method name="SessionLeave">
<Timeout actionOnSI="restartService" duration="0"/>
<Exit actionOnSI="restartService"/>
<Return actionOnSI="keepAlive" controlCode="0"/>
<Exception actionOnSI="keepAlive" controlCode="0" type="failure"/>
<Exception actionOnSI="keepAlive" controlCode="0" type="fatal"/>
</Method>
<Method name="DestroyService">
<Timeout duration="15"/>
</Method>
</Control>
</Service>
</Profile>
Register the application
soamreg PickleRunner.xml
Start up Scaler
First you should check how many nodes are in the Symphony cluster using symping
symping
pip install "scaler>=1.9.0"
scaler_symphony_cluster tcp://127.0.0.1:2345 PickleRunner --base-concurrency 4
Now you should be able to submit workloads.
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
documentation: Improvements or additions to documentation