10 changes: 10 additions & 0 deletions bashsetup.sh
@@ -0,0 +1,10 @@
module load cuda/10.1.168
module load python/3.6.6-anaconda3-5.3.0
module load gcc/6.4.0
module load magma/2.4.0
module load netlib-lapack/3.8.0

export LD_LIBRARY_PATH=/ccs/proj/csc292/xsy/summit/pytorch-env/lib/:$LD_LIBRARY_PATH

#source /sw/summit/python/3.6/anaconda3/5.3.0/etc/profile.d/conda.sh
#source activate /autofs/nccs-svm1_proj/lrn002/xsy/pytorch-1.0.0-summit
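Note: bashsetup.sh is meant to be sourced in the job shell rather than executed; run1.lsf and rungpu.lsf below both do "source bashsetup.sh". A quick sanity check, assuming the standard modules environment on Summit:

source bashsetup.sh
module list    # should show cuda/10.1.168, gcc/6.4.0, magma/2.4.0, netlib-lapack/3.8.0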
29 changes: 29 additions & 0 deletions gan-script.sh
@@ -0,0 +1,29 @@
cd src/

PID_FILE="gan_pids"
rm -f ${PID_FILE}

for ((i=0; i<4; i++))
do
echo "Start client on GPU ${i}"
export CUDA_VISIBLE_DEVICES=${i};
python main.py train --distributed --client &
echo $! >> ${PID_FILE}
sleep 5;
done

echo "Client PIDS:"
cat ${PID_FILE}
sleep 5

# kill number of clients
bash client_failure_exp.sh 4

echo "Start master on GPU 4"
# export CUDA_VISIBLE_DEVICES=4;
python main.py train --distributed --master -f configuration/quickstart/mnist.yml

echo "Begin kill clients"
cat ${PID_FILE} | xargs -I {} kill -9 {}

echo "Done killing clients"
4 changes: 4 additions & 0 deletions get-clients.sh
@@ -0,0 +1,4 @@

h=$( hostname )

printf " - address: $h\n port: 5000-5024\n"
26 changes: 26 additions & 0 deletions gpu-script.sh
@@ -0,0 +1,26 @@
cd src/
echo $PORT_UPPER_LIMIT
echo $NUM_CLIENTS_PER_NODE
h=$( hostname )

#sleep $JSM_NAMESPACE_RANK
printf " - address: $h<> port: 5000-$PORT_UPPER_LIMIT<>" >> configuration/quickstart/general.yml # port: 5000-$PORT_UPPER_LIMIT\n" >> configuration/quickstart/general.yml


for((i=1; i<=$NUM_CLIENTS_PER_NODE; i++))
do
    GPU_ID=$(($i % 6))
    export CUDA_VISIBLE_DEVICES=$GPU_ID; python main.py train --distributed --client &
    sleep 5;
done


if (( $JSM_NAMESPACE_RANK == 0 )); then

    sed -i 's/<>/\n/g' configuration/quickstart/general.yml

    printf "\n" >> configuration/quickstart/general.yml
    python main.py train --distributed --master -f configuration/quickstart/mnist-gpu.yml
fi

wait
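The '<>' placeholder trick above lets every jsrun rank append its client entry as a single line, presumably so concurrent appends do not interleave mid-entry; rank 0 then turns the placeholders into newlines before starting the master. A standalone sketch of the same idea on a throwaway file, with hypothetical hostnames nodeA and nodeB:

tmp=$(mktemp)
printf " - address: nodeA<> port: 5000-5007<>" >> "$tmp"
printf " - address: nodeB<> port: 5000-5007<>" >> "$tmp"
sed -i 's/<>/\n/g' "$tmp"     # same substitution as in gpu-script.sh
cat "$tmp"                    # two client entries, one per line pair
rm -f "$tmp"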
17 changes: 17 additions & 0 deletions lipi-mnist-satori.lsf
@@ -0,0 +1,17 @@
#BSUB -L /bin/bash
#BSUB -J "lipi-mnist-satori"
#BSUB -o "lipi-mnist-satori_o.%J"
#BSUB -e "lipi-mnist-satori_e.%J"
#BSUB -n 4
#BSUB -R "span[ptile=4]"
#BSUB -gpu "num=4"
#BSUB -q "normal"

HOME2=/nobackup/users/umustafi
PYTHON_VIRTUAL_ENVIRONMENT=lipi
CONDA_ROOT=$HOME2/anaconda3
source ${CONDA_ROOT}/etc/profile.d/conda.sh
conda activate $PYTHON_VIRTUAL_ENVIRONMENT

cd $HOME2/projects/lipizzaner-gan/
bash gan-script.sh
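For reference, standard LSF submission for this script; the output and error files follow the -o/-e patterns above, with %J replaced by the job ID:

bsub < lipi-mnist-satori.lsf
bjobs     # check queue status; stdout in lipi-mnist-satori_o.<jobid>, stderr in lipi-mnist-satori_e.<jobid>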
22 changes: 22 additions & 0 deletions run1.lsf
@@ -0,0 +1,22 @@
#!/bin/bash
# Begin BSUB Options
#BSUB -P csc292
#BSUB -W 01:00
#BSUB -nnodes 1
#BSUB -alloc_flags "gpumps"
#BSUB -J ganwork
#BSUB -o ganwork.%J
#BSUB -e ganwork.%J


cd $PROJWORK/csc292/ganwork/lipizzaner-gan
source bashsetup.sh
source deactivate
source activate /autofs/nccs-svm1_proj/csc292/xsy/summit/pytorch-env

RUNID=ganrun_000


jsrun -n 1 -g 6 -c 42 -r 1 -a 1 bash gan-script.sh
#jsrun -n 12 -a 1 -c 7 -g 1 -r 6 -l CPU-CPU -d packed -b packed:7 which python
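For readers unfamiliar with Summit's jsrun launcher, the resource-set flags on the active line above decode as follows (a reading aid, not part of the PR):

# -n 1   one resource set in total
# -r 1   one resource set per node
# -a 1   one task (bash gan-script.sh) per resource set
# -c 42  42 CPU cores per resource set
# -g 6   all 6 GPUs of a Summit node in the resource set
jsrun -n 1 -g 6 -c 42 -r 1 -a 1 bash gan-script.sh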

24 changes: 24 additions & 0 deletions rungpu.lsf
@@ -0,0 +1,24 @@
#!/bin/bash
# Begin BSUB Options
#BSUB -P csc292
#BSUB -W 02:00
#BSUB -nnodes 2
#BSUB -alloc_flags "gpumps"
#BSUB -J ganwork
#BSUB -o ganwork.%J
#BSUB -e ganwork.%J

NODES=2
export NUM_CLIENTS_PER_NODE=8
export PORT_UPPER_LIMIT=5007

cd $PROJWORK/csc292/ganwork/lipizzaner-gan
source bashsetup.sh
source deactivate
source activate /autofs/nccs-svm1_proj/csc292/xsy/summit/pytorch-env

RUNID=ganrun_000

jsrun -n $NODES -g 6 -c 42 -r 1 -a 1 bash gpu-script.sh
#jsrun -n 12 -a 1 -c 7 -g 1 -r 6 -l CPU-CPU -d packed -b packed:7 which python
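The port range and client count have to stay in sync: gpu-script.sh advertises ports 5000 through $PORT_UPPER_LIMIT, so eight clients per node correspond to 5007. A sketch that derives one value from the other instead of hard-coding both:

export NUM_CLIENTS_PER_NODE=8
export PORT_UPPER_LIMIT=$((5000 + NUM_CLIENTS_PER_NODE - 1))   # 5007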

8 changes: 5 additions & 3 deletions src/configuration/quickstart/general.yml
@@ -8,12 +8,14 @@ losswise:
  enabled: False
  api_key: # Fill in API key
output_dir: ./output
seed: 1
num_workers: 0 # how many subprocesses to use for data loading
distribution:
  auto_discover: False
  master_node:
    exit_clients_on_disconnect: True
  client_nodes:
    - address: 127.0.0.1 # Fill in IP address here
    - address: 127.0.0.1
      port: 5000-5003
seed: 1
num_workers: 0 # how many subprocesses to use for data loading
# Non-distributed
#port: 0
17 changes: 17 additions & 0 deletions src/configuration/quickstart/general.yml.bak
@@ -0,0 +1,17 @@
logging:
  enabled: True
  log_level: INFO
  log_server: # Fill in connection string with read/write access here
  image_format: jpg
  print_discriminator: False
losswise:
  enabled: False
  api_key: # Fill in API key
output_dir: ./output
seed: 1
num_workers: 0 # how many subprocesses to use for data loading
distribution:
  auto_discover: False
  master_node:
    exit_clients_on_disconnect: True
  client_nodes:
39 changes: 39 additions & 0 deletions src/configuration/quickstart/mnist-gpu.yml
@@ -0,0 +1,39 @@
trainer:
  name: lipizzaner_gan
  n_iterations: 100
  calculate_net_weights_dist: True
  # independent_probability, exact_proportion
  mixture_generator_samples_mode: exact_proportion
  params:
    population_size: 1
    tournament_size: 2
    n_replacements: 1
    default_adam_learning_rate: 0.0002
    # Hyperparameter mutation
    alpha: 0.0001
    mutation_probability: 0.5
    discriminator_skip_each_nth_step: 1
    mixture_sigma: 0.01
    enable_selection: True
    score:
      enabled: True
      type: fid
      score_sample_size: 1000
      cuda: True
    fitness:
      fitness_sample_size: 1000
      fitness_mode: average # worse, best, average
dataloader:
  dataset_name: mnist
  use_batch: True
  batch_size: 400
  n_batches: 0
  shuffle: True
network:
  name: four_layer_perceptron
  loss: bceloss
master:
  calculate_score: True
  score_sample_size: 50000
  cuda: True
general: !include general.yml
2 changes: 1 addition & 1 deletion src/configuration/quickstart/mnist.yml
@@ -6,7 +6,7 @@ trainer:
  mixture_generator_samples_mode: exact_proportion
  params:
    population_size: 1
    tournament_size: 2
    tournament_size: 1
    n_replacements: 1
    default_adam_learning_rate: 0.0002
    # Hyperparameter mutation
7 changes: 7 additions & 0 deletions src/data/mnist_data_loader.py
@@ -1,4 +1,5 @@
from torchvision import datasets
from torchvision.transforms import transforms
from data.data_loader import DataLoader


@@ -10,3 +11,9 @@ def __init__(self, use_batch=True, batch_size=100, n_batches=0, shuffle=False):
    @property
    def n_input_neurons(self):
        return 784

    def transform(self):
        return transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])
4 changes: 2 additions & 2 deletions src/data/network_data_loader.py
@@ -20,7 +20,7 @@
N_VALUES_PER_RECORD = 4


flow_data = np.load("./data/network_data/network_capture.npy")
#flow_data = np.load("./data/network_data/network_capture.npy")

def generate_random_sequences(num_sequences):
    sequences = []
@@ -62,7 +62,7 @@ class NetworkDataSet(Dataset):
    def __init__(self, **kwargs):
        self.data = flow_data
        print("Packets Array Size: ", self.data.shape)

    def __getitem__(self, index):
        return self.data[index]

1 change: 1 addition & 0 deletions src/distribution/client_api.py
@@ -52,6 +52,7 @@ def terminate_experiment():
        ClientAPI._lock.acquire()

        if ClientAPI.is_busy:
            # TODO: add more log messages to verify that clients are killed successfully
            ClientAPI._logger.warning('Received stop signal from master, experiment will be quit.')
            ClientAPI._stop_event.set()
        else:
5 changes: 4 additions & 1 deletion src/distribution/concurrent_populations.py
@@ -15,7 +15,10 @@ def lock(self):
        self._lock.acquire()

    def unlock(self):
        self._lock.release()
        try:
            self._lock.release()
        except RuntimeError:
            pass

    @property
    def generator(self):
24 changes: 24 additions & 0 deletions src/distribution/node_client.py
@@ -128,6 +128,30 @@ def stop_running_experiments(self, except_for_clients=None):
            address = 'http://{}:{}/experiments'.format(client['address'], client['port'])
            requests.delete(address)

    # Kill up to num_clients_to_kill clients, if that many exist
    def kill_clients(self, num_clients_to_kill, except_for_clients=None):
        except_for_clients = except_for_clients or []
        clients = self.cc.settings['general']['distribution']['client_nodes']
        active_clients = [c for c in clients if not any(d for d in except_for_clients if d['address'] == c['address']
                                                        and d['port'] == c['port'])]
        killed = 0
        NodeClient._logger.info("clients are: {} \nactive clients are: {}".format(clients, active_clients))

        for client in active_clients:
            if killed >= num_clients_to_kill:
                break
            address = 'http://{}:{}/experiments'.format(client['address'], client['port'])
            requests.delete(address)
            # TODO: request this address again to confirm the client is no longer alive
            NodeClient._logger.info("deleted {}\n".format(address))
            killed += 1

        active_clients_after = [c for c in clients if not any(d for d in except_for_clients if d['address'] == c['address']
                                                              and d['port'] == c['port'])]

        assert len(active_clients_after) == len(active_clients) - killed
        NodeClient._logger.info("actually killed {} of desired {}, {} clients remain active".format(
            killed, num_clients_to_kill, len(active_clients_after)))


    @staticmethod
    def _load_parameters_async(node, path, timeout_sec):
        address = 'http://{}:{}{}'.format(node['address'], node['port'], path)
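kill_clients() works purely over the client HTTP API: each selected client receives a DELETE on its /experiments endpoint, the same call stop_running_experiments() makes. The equivalent manual check from a shell, using the illustrative 127.0.0.1:5000 address from general.yml:

# Stop the experiment on one client (what requests.delete(address) does above)
curl -X DELETE http://127.0.0.1:5000/experiments
# Liveness/busy check used by the master (_accessible_clients in lipizzaner_master.py)
curl http://127.0.0.1:5000/status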
9 changes: 9 additions & 0 deletions src/helper_files/requirements-ornl.txt
@@ -0,0 +1,9 @@
torchvision==0.2.0
losswise==2.0
PyYAML>=4.1b
matplotlib==2.2.2
scipy==1.0.1
Flask==0.12.2
netaddr==0.7.19
pymongo==3.6.1
netifaces==0.10.9
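These pins are presumably intended for the ORNL environments set up above; after activating the conda environment, the usual install is:

pip install -r src/helper_files/requirements-ornl.txt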
2 changes: 1 addition & 1 deletion src/helpers/pytorch_helpers.py
@@ -70,4 +70,4 @@ def calculate_net_weights_dist(net1, net2):
    for net1_layer_weights, net2_layer_weights in zip(net1.parameters(), net2.parameters()):
        l2_dist += torch.sum((net1_layer_weights - net2_layer_weights)**2)

    return torch.sqrt(l2_dist).data.cpu().numpy()[0]
    return torch.sqrt(l2_dist).data.cpu().numpy()#[0]
6 changes: 4 additions & 2 deletions src/helpers/singleton.py
@@ -6,12 +6,14 @@ def __init__(self, decorated):
    def instance(self):
        try:
            return self._instance
        except AttributeError:
        except AttributeError as e:
            print(e)
            print("Instantiate")
            self._instance = self._decorated()
            return self._instance

    def __call__(self):
        raise TypeError('Singletons must be accessed through `instance()`.')

    def __instancecheck__(self, inst):
        return isinstance(inst, self._decorated)
        return isinstance(inst, self._decorated)
13 changes: 12 additions & 1 deletion src/lipizzaner_master.py
@@ -79,6 +79,11 @@ def run(self):

        self.heartbeat_thread.join()

        # TODO: set a timer here that terminates X clients after a certain interval
        # self._logger.info("about to call node function to kill clients")
        # self._kill_clients(1)
        # time.sleep(25)

        # When this is reached, the heartbeat thread has stopped.
        # This either happens when the experiments are done, or if they were terminated
        if self.heartbeat_thread.success:
@@ -96,7 +101,9 @@ def _accessible_clients(self, clients):
            assert client['address'] is not None
            address = 'http://{}:{}/status'.format(client['address'], client['port'])
            try:
                print(f'{address}')
                resp = requests.get(address)
                print(f'{resp}')
                assert resp.status_code == 200
                assert not resp.json()['busy']
                accessible_clients.append(client)
@@ -155,6 +162,10 @@ def _terminate(self, stop_clients=True, return_code=-1):

        exit(return_code)

    def _kill_clients(self, num_to_kill=0):
        node_client = NodeClient(None)
        node_client.kill_clients(num_to_kill)

    def _gather_results(self):
        self._logger.info('Collecting results from clients...')

@@ -165,7 +176,7 @@ def _gather_results(self):
        db_logger = DbLogger()

        results = node_client.gather_results(self.cc.settings['general']['distribution']['client_nodes'], 120)

        # TODO check what the length of results is here when you kill clients
        scores = []
        for (node, generator_pop, discriminator_pop, weights_generator, weights_discriminator) in results:
            node_name = '{}:{}'.format(node['address'], node['port'])
1 change: 1 addition & 0 deletions src/training/ea/lipizzaner_gan_trainer.py
@@ -357,6 +357,7 @@ def generate_random_fitness_samples(self, fitness_sample_size):
        def get_next_batch(iterator, loaded):
            # Handle if the end of iterator is reached
            try:
                print(f'Iterator: {iterator}')
                return next(iterator)[0], iterator
            except StopIteration:
                # Use a new iterator