 
 def fit(obj, tf_iter, newton_iter, batch_sz=None, newton_eager=True):
     obj.u_model = neural_net(obj.layer_sizes)
-    #obj.build_loss()
+    # obj.build_loss()
     # Can adjust batch size for collocation points, here we set it to N_f
     if batch_sz is not None:
         obj.batch_sz = batch_sz
     else:
         obj.batch_sz = obj.X_f_len
-        #obj.batch_sz = len(obj.x_f)
+        # obj.batch_sz = len(obj.x_f)

     N_f = obj.X_f_len
-    #N_f = len(obj.x_f)
+    # N_f = len(obj.x_f)
     n_batches = int(N_f // obj.batch_sz)
     start_time = time.time()
     obj.tf_optimizer = tf.keras.optimizers.Adam(lr=0.005, beta_1=.99)
@@ -42,7 +42,7 @@ def fit(obj, tf_iter, newton_iter, batch_sz=None, newton_eager=True):
         if epoch % 100 == 0:
             elapsed = time.time() - start_time
             print('It: %d, Time: %.2f' % (epoch, elapsed))
-            #tf.print(f"mse_0: {mse_0} mse_b {mse_b} mse_f: {mse_f} total loss: {loss_value}")
+            # tf.print(f"mse_0: {mse_0} mse_b {mse_b} mse_f: {mse_f} total loss: {loss_value}")
             tf.print(f"total loss: {loss_value}")
             start_time = time.time()
     # tf.profiler.experimental.stop()
@@ -98,12 +98,21 @@ def train_op(obj, n_batches):
     return loss_value


+# TODO Distributed training re-integration
+
 def fit_dist(obj, tf_iter, newton_iter, batch_sz=None, newton_eager=True):
     BUFFER_SIZE = len(obj.x_f)
     EPOCHS = tf_iter
     # devices = ['/gpu:0', '/gpu:1','/gpu:2', '/gpu:3'],
-    obj.strategy = tf.distribute.MirroredStrategy(devices=['/gpu:0', '/gpu:1','/gpu:2', '/gpu:3'])
-    print("number of devices: {}".format(obj.strategy.num_replicas_in_sync))
+    try:
+        obj.strategy = tf.distribute.MirroredStrategy()
+    except:
+        print("Looks like we can't find any GPUs available, or your GPUs aren't responding to TensorFlow's API. "
+              "If you're receiving this in error, check that your CUDA, "
+              "cuDNN, and other GPU dependencies are installed correctly, with "
+              "versions that match your version of TensorFlow.")
+
+    print("Number of GPU devices: {}".format(obj.strategy.num_replicas_in_sync))
 
     if batch_sz is not None:
         obj.batch_sz = batch_sz
@@ -160,15 +169,16 @@ def train_step(obj, inputs):
         obj.dist_col_weights = tf.gather(obj.col_weights, col_idx)
         print(obj.dist_col_weights)
         obj.variables.extend([obj.u_weights, obj.dist_col_weights])
-        loss_value, mse_0, mse_b, mse_f, grads = obj.grad()
+        loss_value, grads = obj.grad()
         obj.tf_optimizer.apply_gradients(zip(grads[:-2], obj.u_model.trainable_variables))
         print([grads[-2], grads[-1]])
         obj.tf_optimizer_weights.apply_gradients(
             zip([-grads[-2], -grads[-1]], [obj.u_weights, obj.dist_col_weights]))
-        tf.scatter_nd_add(obj.col_weights, col_idx, obj.dist_col_weights)
+        # TODO collocation weight splitting across replicas
+        # tf.scatter_nd_add(obj.col_weights, col_idx, obj.dist_col_weights)
     else:
         obj.variables = obj.u_model.trainable_variables
-        loss_value, mse_0, mse_b, mse_f, grads = obj.grad()
+        loss_value, grads = obj.grad()
         obj.tf_optimizer.apply_gradients(zip(grads, obj.u_model.trainable_variables))
     return loss_value
 
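For reference, a minimal standalone sketch of the strategy-selection pattern the new fit_dist relies on: try tf.distribute.MirroredStrategy, fall back to TensorFlow's default single-device strategy if no GPUs respond, then create the model and optimizer under strategy.scope() so their variables are mirrored across replicas. The build_model helper and its layer sizes below are hypothetical stand-ins, not part of this codebase.

import tensorflow as tf

# Pick a distribution strategy: MirroredStrategy when GPUs respond,
# otherwise TensorFlow's default (single-device) strategy.
try:
    strategy = tf.distribute.MirroredStrategy()
except Exception:
    strategy = tf.distribute.get_strategy()
print("Number of replicas in sync:", strategy.num_replicas_in_sync)

def build_model(layer_sizes=(2, 20, 20, 1)):
    # Hypothetical fully-connected network standing in for neural_net().
    layers = [tf.keras.layers.Dense(n, activation="tanh") for n in layer_sizes[1:-1]]
    layers.append(tf.keras.layers.Dense(layer_sizes[-1]))
    return tf.keras.Sequential(layers)

with strategy.scope():
    # Variables created here are replicated to every device in the strategy.
    model = build_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.005, beta_1=0.99)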