10 changes: 10 additions & 0 deletions bashsetup.sh
@@ -0,0 +1,10 @@
module load cuda/10.1.168
module load python/3.6.6-anaconda3-5.3.0
module load gcc/6.4.0
module load magma/2.4.0
module load netlib-lapack/3.8.0

export LD_LIBRARY_PATH=/ccs/proj/csc292/xsy/summit/pytorch-env/lib/:$LD_LIBRARY_PATH

#source /sw/summit/python/3.6/anaconda3/5.3.0/etc/profile.d/conda.sh
#source activate /autofs/nccs-svm1_proj/lrn002/xsy/pytorch-1.0.0-summit
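Note: bashsetup.sh is meant to be sourced in the job shell rather than executed; run1.lsf and rungpu.lsf below both do "source bashsetup.sh". A quick sanity check, assuming the standard modules environment on Summit:

source bashsetup.sh
module list    # should show cuda/10.1.168, gcc/6.4.0, magma/2.4.0, netlib-lapack/3.8.0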
29 changes: 29 additions & 0 deletions gan-script.sh
@@ -0,0 +1,29 @@
cd src/

PID_FILE="gan_pids"
rm -f ${PID_FILE}

for ((i=0; i<4; i++))
do
echo "Start client on GPU ${i}"
export CUDA_VISIBLE_DEVICES=${i};
python main.py train --distributed --client &
echo $! >> ${PID_FILE}
sleep 5;
done

echo "Client PIDS:"
cat ${PID_FILE}
sleep 5

# kill number of clients
bash client_failure_exp.sh 4

echo "Start master on GPU 4"
# export CUDA_VISIBLE_DEVICES=4;
python main.py train --distributed --master -f configuration/quickstart/mnist.yml

echo "Begin kill clients"
cat ${PID_FILE} | xargs -I {} kill -9 {}

echo "Done killing clients"
4 changes: 4 additions & 0 deletions get-clients.sh
@@ -0,0 +1,4 @@

h=$( hostname )

printf " - address: $h\n port: 5000-5024\n"
26 changes: 26 additions & 0 deletions gpu-script.sh
@@ -0,0 +1,26 @@
cd src/
echo $PORT_UPPER_LIMIT
echo $NUM_CLIENTS_PER_NODE
h=$( hostname )

#sleep $JSM_NAMESPACE_RANK
printf " - address: $h<> port: 5000-$PORT_UPPER_LIMIT<>" >> configuration/quickstart/general.yml # port: 5000-$PORT_UPPER_LIMIT\n" >> configuration/quickstart/general.yml


for((i=1; i<=$NUM_CLIENTS_PER_NODE; i++))
do
    GPU_ID=$(($i % 6))
    export CUDA_VISIBLE_DEVICES=$GPU_ID; python main.py train --distributed --client &
    sleep 5;
done


if (( $JSM_NAMESPACE_RANK == 0 )); then

    sed -i 's/<>/\n/g' configuration/quickstart/general.yml

    printf "\n" >> configuration/quickstart/general.yml
    python main.py train --distributed --master -f configuration/quickstart/mnist-gpu.yml
fi

wait
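The '<>' placeholder trick above lets every jsrun rank append its client entry as a single line, presumably so concurrent appends do not interleave mid-entry; rank 0 then turns the placeholders into newlines before starting the master. A standalone sketch of the same idea on a throwaway file, with hypothetical hostnames nodeA and nodeB:

tmp=$(mktemp)
printf " - address: nodeA<> port: 5000-5007<>" >> "$tmp"
printf " - address: nodeB<> port: 5000-5007<>" >> "$tmp"
sed -i 's/<>/\n/g' "$tmp"     # same substitution as in gpu-script.sh
cat "$tmp"                    # two client entries, one per line pair
rm -f "$tmp"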
17 changes: 17 additions & 0 deletions lipi-mnist-satori.lsf
@@ -0,0 +1,17 @@
#BSUB -L /bin/bash
#BSUB -J "lipi-mnist-satori"
#BSUB -o "lipi-mnist-satori_o.%J"
#BSUB -e "lipi-mnist-satori_e.%J"
#BSUB -n 4
#BSUB -R "span[ptile=4]"
#BSUB -gpu "num=4"
#BSUB -q "normal"

HOME2=/nobackup/users/umustafi
PYTHON_VIRTUAL_ENVIRONMENT=lipi
CONDA_ROOT=$HOME2/anaconda3
source ${CONDA_ROOT}/etc/profile.d/conda.sh
conda activate $PYTHON_VIRTUAL_ENVIRONMENT

cd $HOME2/projects/lipizzaner-gan/
bash gan-script.sh
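For reference, standard LSF submission for this script; the output and error files follow the -o/-e patterns above, with %J replaced by the job ID:

bsub < lipi-mnist-satori.lsf
bjobs     # check queue status; stdout in lipi-mnist-satori_o.<jobid>, stderr in lipi-mnist-satori_e.<jobid>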
22 changes: 22 additions & 0 deletions run1.lsf
@@ -0,0 +1,22 @@
#!/bin/bash
# Begin BSUB Options
#BSUB -P csc292
#BSUB -W 01:00
#BSUB -nnodes 1
#BSUB -alloc_flags "gpumps"
#BSUB -J ganwork
#BSUB -o ganwork.%J
#BSUB -e ganwork.%J


cd $PROJWORK/csc292/ganwork/lipizzaner-gan
source bashsetup.sh
source deactivate
source activate /autofs/nccs-svm1_proj/csc292/xsy/summit/pytorch-env

RUNID=ganrun_000


jsrun -n 1 -g 6 -c 42 -r 1 -a 1 bash gan-script.sh
#jsrun -n 12 -a 1 -c 7 -g 1 -r 6 -l CPU-CPU -d packed -b packed:7 which python
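For readers unfamiliar with Summit's jsrun launcher, the resource-set flags on the active line above decode as follows (a reading aid, not part of the PR):

# -n 1   one resource set in total
# -r 1   one resource set per node
# -a 1   one task (bash gan-script.sh) per resource set
# -c 42  42 CPU cores per resource set
# -g 6   all 6 GPUs of a Summit node in the resource set
jsrun -n 1 -g 6 -c 42 -r 1 -a 1 bash gan-script.sh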

24 changes: 24 additions & 0 deletions rungpu.lsf
@@ -0,0 +1,24 @@
#!/bin/bash
# Begin BSUB Options
#BSUB -P csc292
#BSUB -W 02:00
#BSUB -nnodes 2
#BSUB -alloc_flags "gpumps"
#BSUB -J ganwork
#BSUB -o ganwork.%J
#BSUB -e ganwork.%J

NODES=2
export NUM_CLIENTS_PER_NODE=8
export PORT_UPPER_LIMIT=5007

cd $PROJWORK/csc292/ganwork/lipizzaner-gan
source bashsetup.sh
source deactivate
source activate /autofs/nccs-svm1_proj/csc292/xsy/summit/pytorch-env

RUNID=ganrun_000

jsrun -n $NODES -g 6 -c 42 -r 1 -a 1 bash gpu-script.sh
#jsrun -n 12 -a 1 -c 7 -g 1 -r 6 -l CPU-CPU -d packed -b packed:7 which python
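The port range and client count have to stay in sync: gpu-script.sh advertises ports 5000 through $PORT_UPPER_LIMIT, so eight clients per node correspond to 5007. A sketch that derives one value from the other instead of hard-coding both:

export NUM_CLIENTS_PER_NODE=8
export PORT_UPPER_LIMIT=$((5000 + NUM_CLIENTS_PER_NODE - 1))   # 5007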

8 changes: 5 additions & 3 deletions src/configuration/quickstart/general.yml
@@ -8,12 +8,14 @@ losswise:
  enabled: False
  api_key: # Fill in API key
output_dir: ./output
seed: 1
num_workers: 0 # how many subprocesses to use for data loading
distribution:
  auto_discover: False
  master_node:
    exit_clients_on_disconnect: True
  client_nodes:
    - address: 127.0.0.1 # Fill in IP address here
    - address: 127.0.0.1
      port: 5000-5003
seed: 1
num_workers: 0 # how many subprocesses to use for data loading
# Non-distributed
#port: 0
17 changes: 17 additions & 0 deletions src/configuration/quickstart/general.yml.bak
@@ -0,0 +1,17 @@
logging:
  enabled: True
  log_level: INFO
  log_server: # Fill in connection string with read/write access here
  image_format: jpg
  print_discriminator: False
losswise:
  enabled: False
  api_key: # Fill in API key
output_dir: ./output
seed: 1
num_workers: 0 # how many subprocesses to use for data loading
distribution:
  auto_discover: False
  master_node:
    exit_clients_on_disconnect: True
  client_nodes:
39 changes: 39 additions & 0 deletions src/configuration/quickstart/mnist-gpu.yml
@@ -0,0 +1,39 @@
trainer:
  name: lipizzaner_gan
  n_iterations: 100
  calculate_net_weights_dist: True
  # independent_probability, exact_proportion
  mixture_generator_samples_mode: exact_proportion
  params:
    population_size: 1
    tournament_size: 2
    n_replacements: 1
    default_adam_learning_rate: 0.0002
    # Hyperparameter mutation
    alpha: 0.0001
    mutation_probability: 0.5
    discriminator_skip_each_nth_step: 1
    mixture_sigma: 0.01
    enable_selection: True
    score:
      enabled: True
      type: fid
      score_sample_size: 1000
      cuda: True
    fitness:
      fitness_sample_size: 1000
      fitness_mode: average # worse, best, average
dataloader:
  dataset_name: mnist
  use_batch: True
  batch_size: 400
  n_batches: 0
  shuffle: True
network:
  name: four_layer_perceptron
  loss: bceloss
master:
  calculate_score: True
  score_sample_size: 50000
  cuda: True
general: !include general.yml
2 changes: 1 addition & 1 deletion src/configuration/quickstart/mnist.yml
@@ -6,7 +6,7 @@ trainer:
  mixture_generator_samples_mode: exact_proportion
  params:
    population_size: 1
    tournament_size: 2
    tournament_size: 1
    n_replacements: 1
    default_adam_learning_rate: 0.0002
    # Hyperparameter mutation
7 changes: 7 additions & 0 deletions src/data/mnist_data_loader.py
@@ -1,4 +1,5 @@
from torchvision import datasets
from torchvision.transforms import transforms
from data.data_loader import DataLoader


@@ -10,3 +11,9 @@ def __init__(self, use_batch=True, batch_size=100, n_batches=0, shuffle=False):
    @property
    def n_input_neurons(self):
        return 784

    def transform(self):
        return transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])
4 changes: 2 additions & 2 deletions src/data/network_data_loader.py
@@ -20,7 +20,7 @@
N_VALUES_PER_RECORD = 4


flow_data = np.load("./data/network_data/network_capture.npy")
#flow_data = np.load("./data/network_data/network_capture.npy")

def generate_random_sequences(num_sequences):
    sequences = []
@@ -62,7 +62,7 @@ class NetworkDataSet(Dataset):
    def __init__(self, **kwargs):
        self.data = flow_data
        print("Packets Array Size: ", self.data.shape)

    def __getitem__(self, index):
        return self.data[index]

1 change: 1 addition & 0 deletions src/distribution/client_api.py
@@ -52,6 +52,7 @@ def terminate_experiment():
        ClientAPI._lock.acquire()

        if ClientAPI.is_busy:
            # TODO: add more log messages to verify that clients are killed successfully
            ClientAPI._logger.warning('Received stop signal from master, experiment will be quit.')
            ClientAPI._stop_event.set()
        else:
5 changes: 4 additions & 1 deletion src/distribution/concurrent_populations.py
@@ -15,7 +15,10 @@ def lock(self):
        self._lock.acquire()

    def unlock(self):
        self._lock.release()
        try:
            self._lock.release()
        except RuntimeError:
            pass

    @property
    def generator(self):
24 changes: 24 additions & 0 deletions src/distribution/node_client.py
@@ -128,6 +128,30 @@ def stop_running_experiments(self, except_for_clients=None):
            address = 'http://{}:{}/experiments'.format(client['address'], client['port'])
            requests.delete(address)

    # Kill up to num_clients_to_kill clients, if that many exist
    def kill_clients(self, num_clients_to_kill, except_for_clients=None):
        except_for_clients = except_for_clients or []
        clients = self.cc.settings['general']['distribution']['client_nodes']
        active_clients = [c for c in clients if not any(d for d in except_for_clients if d['address'] == c['address']
                                                        and d['port'] == c['port'])]
        killed = 0
        NodeClient._logger.info("clients are: {} \nactive clients are: {}".format(clients, active_clients))

        for client in active_clients:
            if killed >= num_clients_to_kill:
                break
            address = 'http://{}:{}/experiments'.format(client['address'], client['port'])
            requests.delete(address)
            # TODO: request this address again to confirm the client is no longer alive
            NodeClient._logger.info("deleted {}\n".format(address))
            killed += 1

        active_clients_after = [c for c in clients if not any(d for d in except_for_clients if d['address'] == c['address']
                                                              and d['port'] == c['port'])]

        assert len(active_clients_after) == len(active_clients) - killed
        NodeClient._logger.info("actually killed {} of desired {}, {} clients remain active".format(
            killed, num_clients_to_kill, len(active_clients_after)))


    @staticmethod
    def _load_parameters_async(node, path, timeout_sec):
        address = 'http://{}:{}{}'.format(node['address'], node['port'], path)
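kill_clients() works purely over the client HTTP API: each selected client receives a DELETE on its /experiments endpoint, the same call stop_running_experiments() makes. The equivalent manual check from a shell, using the illustrative 127.0.0.1:5000 address from general.yml:

# Stop the experiment on one client (what requests.delete(address) does above)
curl -X DELETE http://127.0.0.1:5000/experiments
# Liveness/busy check used by the master (_accessible_clients in lipizzaner_master.py)
curl http://127.0.0.1:5000/status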
9 changes: 9 additions & 0 deletions src/helper_files/requirements-ornl.txt
@@ -0,0 +1,9 @@
torchvision==0.2.0
losswise==2.0
PyYAML>=4.1b
matplotlib==2.2.2
scipy==1.0.1
Flask==0.12.2
netaddr==0.7.19
pymongo==3.6.1
netifaces==0.10.9
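These pins are presumably intended for the ORNL environments set up above; after activating the conda environment, the usual install is:

pip install -r src/helper_files/requirements-ornl.txt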
2 changes: 1 addition & 1 deletion src/helpers/pytorch_helpers.py
@@ -70,4 +70,4 @@ def calculate_net_weights_dist(net1, net2):
    for net1_layer_weights, net2_layer_weights in zip(net1.parameters(), net2.parameters()):
        l2_dist += torch.sum((net1_layer_weights - net2_layer_weights)**2)

    return torch.sqrt(l2_dist).data.cpu().numpy()[0]
    return torch.sqrt(l2_dist).data.cpu().numpy()#[0]
6 changes: 4 additions & 2 deletions src/helpers/singleton.py
@@ -6,12 +6,14 @@ def __init__(self, decorated):
    def instance(self):
        try:
            return self._instance
        except AttributeError:
        except AttributeError as e:
            print(e)
            print("Instantiate")
            self._instance = self._decorated()
            return self._instance

    def __call__(self):
        raise TypeError('Singletons must be accessed through `instance()`.')

    def __instancecheck__(self, inst):
        return isinstance(inst, self._decorated)
        return isinstance(inst, self._decorated)
13 changes: 12 additions & 1 deletion src/lipizzaner_master.py
@@ -79,6 +79,11 @@ def run(self):

        self.heartbeat_thread.join()

        # TODO: set a timer here that terminates X clients after a certain interval
        # self._logger.info("about to call node function to kill clients")
        # self._kill_clients(1)
        # time.sleep(25)

        # When this is reached, the heartbeat thread has stopped.
        # This either happens when the experiments are done, or if they were terminated
        if self.heartbeat_thread.success:
@@ -96,7 +101,9 @@ def _accessible_clients(self, clients):
            assert client['address'] is not None
            address = 'http://{}:{}/status'.format(client['address'], client['port'])
            try:
                print(f'{address}')
                resp = requests.get(address)
                print(f'{resp}')
                assert resp.status_code == 200
                assert not resp.json()['busy']
                accessible_clients.append(client)
@@ -155,6 +162,10 @@ def _terminate(self, stop_clients=True, return_code=-1):

        exit(return_code)

    def _kill_clients(self, num_to_kill=0):
        node_client = NodeClient(None)
        node_client.kill_clients(num_to_kill)

    def _gather_results(self):
        self._logger.info('Collecting results from clients...')

@@ -165,7 +176,7 @@ def _gather_results(self):
        db_logger = DbLogger()

        results = node_client.gather_results(self.cc.settings['general']['distribution']['client_nodes'], 120)

        # TODO check what the length of results is here when you kill clients
        scores = []
        for (node, generator_pop, discriminator_pop, weights_generator, weights_discriminator) in results:
            node_name = '{}:{}'.format(node['address'], node['port'])
1 change: 1 addition & 0 deletions src/training/ea/lipizzaner_gan_trainer.py
@@ -357,6 +357,7 @@ def generate_random_fitness_samples(self, fitness_sample_size):
        def get_next_batch(iterator, loaded):
            # Handle if the end of iterator is reached
            try:
                print(f'Iterator: {iterator}')
                return next(iterator)[0], iterator
            except StopIteration:
                # Use a new iterator