
Commit 5e050cf

Author: Jordan Stomps
changing fresh_start methods of models to use class train method instead
1 parent 5457ae1 commit 5e050cf

6 files changed: +46 additions, -216 deletions

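Background for the diffs below: each model class exposes a hyperopt objective, fresh_start(params, data_dict), that must return a dict containing a 'loss' entry and STATUS_OK. The sketch below shows how such an objective is typically consumed; it is an illustration only, assuming the standard hyperopt API (fmin, tpe.suggest, Trials), not the repository's actual optimize implementation.

from hyperopt import fmin, tpe, Trials

# Hypothetical driver: `model` is an instance of any class below (LogReg,
# CoTraining, LabelProp, ShadowNN, ShadowCNN) and `space` is a hyperopt
# search space whose keys match the params each fresh_start expects.
trials = Trials()
best = fmin(fn=lambda params: model.fresh_start(params, data_dict),
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)
# every trial now trains a fresh instance through the class's own
# train()/predict() methods instead of duplicating that logic inline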

models/LogReg.py

Lines changed: 5 additions & 12 deletions
@@ -61,23 +61,16 @@ def fresh_start(self, params, data_dict):
         testy = data_dict['testy']
 
         # supervised logistic regression
-        clf = linear_model.LogisticRegression(
-            random_state=self.random_state,
-            max_iter=params['max_iter'],
-            tol=params['tol'],
-            C=params['C']
-        )
+        clf = LogReg(params=params, random_state=self.random_state)
         # train and test model
-        clf.fit(trainx, trainy)
-        clf_pred = clf.predict(testx)
-        # balanced_accuracy accounts for class imbalanced data
-        # could alternatively use pure accuracy for a more traditional hyperopt
-        acc = balanced_accuracy_score(testy, clf_pred)
+        clf.train(trainx, trainy)
+        # uses balanced_accuracy accounts for class imbalanced data
+        clf_pred, acc = clf.predict(testx, testy)
 
         # loss function minimizes misclassification
         return {'loss': 1-acc,
                 'status': STATUS_OK,
-                'model': clf,
+                'model': clf.model,
                 'params': params,
                 'accuracy': acc}
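
The removed inline code above implies what the LogReg wrapper methods called by fresh_start must do. A minimal sketch reconstructed from those deleted lines (attribute and argument names are assumptions where not shown in the diff):

from sklearn import linear_model
from sklearn.metrics import balanced_accuracy_score

class LogReg:
    def __init__(self, params=None, random_state=0):
        self.random_state = random_state
        self.model = linear_model.LogisticRegression(
            random_state=random_state,
            max_iter=params['max_iter'],
            tol=params['tol'],
            C=params['C'])

    def train(self, trainx, trainy):
        # fit the underlying scikit-learn estimator
        self.model.fit(trainx, trainy)

    def predict(self, testx, testy=None):
        pred = self.model.predict(testx)
        acc = None
        if testy is not None:
            # balanced accuracy accounts for class-imbalanced data
            acc = balanced_accuracy_score(testy, pred)
        return pred, acc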

models/SSML/CoTraining.py

Lines changed: 13 additions & 57 deletions
@@ -35,6 +35,8 @@ def __init__(self, params=None, random_state=0):
                 random_state=self.random_state)
             self.model2 = linear_model.LogisticRegression(
                 random_state=self.random_state)
+            # default needed for training
+            self.params = {'n_samples': 1}
         else:
             self.model1 = linear_model.LogisticRegression(
                 random_state=self.random_state,
@@ -152,60 +154,17 @@ def fresh_start(self, params, data_dict):
         testy = data_dict['testy']
         # unlabeled co-training data
         Ux = data_dict['Ux']
-        # avoid overwriting when deleting in co-training loop
-        U_lr = Ux.copy()
-
-        # set the random seed of training splits for reproducibility
-        # This can be ignored by excluding params['seed']
-        # in the hyperopt space dictionary
-        if 'seed' in params.keys():
-            np.random.seed(params['seed'])
-
-        # TODO: allow a user to specify uneven splits between the two models
-        split_frac = 0.5
-        # labeled training data
-        idx = np.random.choice(range(trainy.shape[0]),
-                               size=int(split_frac * trainy.shape[0]),
-                               replace=False)
 
-        # avoid overwriting when deleting in co-training loop
-        L_lr1 = trainx[idx].copy()
-        L_lr2 = trainx[~idx].copy()
-        Ly_lr1 = trainy[idx].copy()
-        Ly_lr2 = trainy[~idx].copy()
+        clf = CoTraining(params=params, random_state=self.random_state)
+        # training and testing
+        model1_accs, model2_accs = clf.train(trainx, trainy, Ux, testx, testy)
+        # uses balanced_accuracy accounts for class imbalanced data
+        pred1, acc, pred2, model1_acc, model2_acc = clf.predict(testx, testy)
 
-        # initialized logistic regression models for a fresh-start
-        slr1 = linear_model.LogisticRegression(
-            random_state=self.random_state,
-            max_iter=params['max_iter'],
-            tol=params['tol'],
-            C=params['C']
-        )
-        slr2 = linear_model.LogisticRegression(
-            random_state=self.random_state,
-            max_iter=params['max_iter'],
-            tol=params['tol'],
-            C=params['C']
-        )
-
-        slr1, slr2, model1_accs, model2_accs = self.training_loop(
-            slr1, slr2,
-            L_lr1, L_lr2,
-            Ly_lr1, Ly_lr2,
-            U_lr, params['n_samples'],
-            testx, testy,
-        )
-
-        # balanced_accuracy accounts for class imbalanced data
-        # could alternatively use pure accuracy for a more traditional hyperopt
-        model1_acc = balanced_accuracy_score(testy, slr1.predict(testx))
-        model2_acc = balanced_accuracy_score(testy, slr2.predict(testx))
-        # select best accuracy for hyperparameter optimization
-        acc = max(model1_acc, model2_acc)
         return {'loss': 1-acc,
                 'status': STATUS_OK,
-                'model': slr1,
-                'model2': slr2,
+                'model': clf.model1,
+                'model2': clf.model2,
                 'model1_acc_history': model1_accs,
                 'model2_acc_history': model2_accs,
                 'params': params,
@@ -262,7 +221,7 @@ def optimize(self, space, data_dict, max_evals=50, verbose=True):
         self.worst = worst
 
     def train(self, trainx, trainy, Ux,
-              testx=None, testy=None, n_samples=1, seed=None):
+              testx=None, testy=None):
         '''
         Wrapper method for a basic co-training with logistic regression
         implementation training method.
@@ -274,9 +233,6 @@ def train(self, trainx, trainy, Ux,
             of each model at every iteration.
         testy: label vector used for testing the performance
             of each model at every iteration.
-        n_samples: the number of instances to sample and
-            predict from Ux at one time
-        seed: set the random seed of training splits for reproducibility
         '''
 
         # avoid overwriting when deleting in co-training loop
@@ -285,8 +241,8 @@
         # set the random seed of training splits for reproducibility
         # This can be ignored by excluding params['seed']
         # in the hyperopt space dictionary
-        if seed is not None:
-            np.random.seed(seed)
+        if 'seed' in self.params.keys():
+            np.random.seed(self.params['seed'])
 
         # TODO: allow a user to specify uneven splits between the two models
         split_frac = 0.5
@@ -306,7 +262,7 @@
             self.model1, self.model2,
             L_lr1, L_lr2,
             Ly_lr1, Ly_lr2,
-            U_lr, n_samples,
+            U_lr, self.params['n_samples'],
             testx, testy,
         )
 
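
With this change, n_samples and seed are read from the params dictionary supplied at construction rather than from train() keyword arguments. An illustrative call sequence (parameter values are placeholders, not values from the repository):

params = {'max_iter': 1000, 'tol': 1e-4, 'C': 1.0,
          'n_samples': 1, 'seed': 0}
model = CoTraining(params=params, random_state=0)
# test data is optional; passing it records per-iteration accuracies
model1_accs, model2_accs = model.train(trainx, trainy, Ux, testx, testy)
pred1, acc, pred2, model1_acc, model2_acc = model.predict(testx, testy)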

models/SSML/LabelProp.py

Lines changed: 6 additions & 22 deletions
@@ -72,32 +72,16 @@ def fresh_start(self, params, data_dict):
         testy = data_dict['testy']
         Ux = data_dict['Ux']
 
-        # combine labeled and unlabeled instances for training
-        lp_trainx = np.append(trainx, Ux, axis=0)
-        lp_trainy = np.append(trainy,
-                              np.full(shape=(Ux.shape[0],), fill_value=-1),
-                              axis=0)
-
-        # semi-supervised label propagation
-        clf = semi_supervised.LabelPropagation(
-            kernel='knn',
-            gamma=params['gamma'],
-            n_neighbors=params['n_neighbors'],
-            max_iter=params['max_iter'],
-            tol=params['tol'],
-            n_jobs=-1
-        )
-        # train and test model
-        clf.fit(lp_trainx, lp_trainy)
-        clf_pred = clf.predict(testx)
-        # balanced_accuracy accounts for class imbalanced data
-        # could alternatively use pure accuracy for a more traditional hyperopt
-        acc = balanced_accuracy_score(testy, clf_pred)
+        clf = LabelProp(params, random_state=self.random_state)
+        # training and testing
+        clf.train(trainx, trainy, Ux)
+        # uses balanced_accuracy accounts for class imbalanced data
+        pred, acc = clf.predict(testx, testy)
 
         # loss function minimizes misclassification
         return {'loss': 1-acc,
                 'status': STATUS_OK,
-                'model': clf,
+                'model': clf.model,
                 'params': params,
                 'accuracy': acc}
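
As with LogReg, the deleted lines indicate what the LabelProp wrapper now encapsulates: labeled and unlabeled spectra are concatenated, unlabeled instances get the sentinel label -1, and a scikit-learn LabelPropagation estimator is fit. A minimal sketch reconstructed from the removed code (the self.model attribute being created in __init__ is an assumption):

import numpy as np
from sklearn import semi_supervised
from sklearn.metrics import balanced_accuracy_score

class LabelProp:
    def __init__(self, params, random_state=0):
        self.random_state = random_state
        self.model = semi_supervised.LabelPropagation(
            kernel='knn',
            gamma=params['gamma'],
            n_neighbors=params['n_neighbors'],
            max_iter=params['max_iter'],
            tol=params['tol'],
            n_jobs=-1)

    def train(self, trainx, trainy, Ux):
        # scikit-learn treats the label -1 as "unlabeled" in semi-supervised fits
        lp_trainx = np.append(trainx, Ux, axis=0)
        lp_trainy = np.append(trainy,
                              np.full(shape=(Ux.shape[0],), fill_value=-1),
                              axis=0)
        self.model.fit(lp_trainx, lp_trainy)

    def predict(self, testx, testy=None):
        pred = self.model.predict(testx)
        acc = balanced_accuracy_score(testy, pred) if testy is not None else None
        return pred, acc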

models/SSML/ShadowCNN.py

Lines changed: 13 additions & 75 deletions
@@ -207,72 +207,18 @@ def fresh_start(self, params, data_dict):
         # unlabeled co-training data
         Ux = data_dict['Ux']
 
-        # avoid float round-off by using DoubleTensor
-        xtens = torch.FloatTensor(np.append(trainx,
-                                            Ux,
-                                            axis=0))[:, ::params['binning']]
-        # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10
-        ytens = torch.LongTensor(np.append(trainy,
-                                           np.full(shape=(Ux.shape[0],),
-                                                   fill_value=-1),
-                                           axis=0))
-
-        model = Net(layer1=params['layer1'],
-                    layer2=2*params['layer1'],
-                    layer3=3*params['layer1'],
-                    kernel=params['kernel'],
-                    drop_rate=params['drop_rate'],
-                    length=np.ceil(trainx.shape[1]/params['binning']))
-        eaat = shadow.eaat.EAAT(model=model,
-                                alpha=params['alpha'],
-                                xi=params['xi'],
-                                eps=params['eps'])
-        optimizer = optim.SGD(eaat.parameters(),
-                              lr=params['lr'],
-                              momentum=params['momentum'])
-
-        # define data set object
-        dataset = SpectralDataset(xtens, ytens)
-
-        # create DataLoader object of DataSet object
-        DL_DS = torch.utils.data.DataLoader(dataset,
-                                            batch_size=params['batch_size'],
-                                            shuffle=True)
-
-        # labels for unlabeled data are always "-1"
-        xEnt = torch.nn.CrossEntropyLoss(ignore_index=-1)
-
-        n_epochs = 100
-        eaat.to(self.device)
-        losscurve = []
-        evalcurve = []
-        for epoch in range(n_epochs):
-            eaat.train()
-            lossavg = []
-            for i, (data, targets) in enumerate(DL_DS):
-                x = data.reshape((data.shape[0],
-                                  1,
-                                  data.shape[1])).to(self.device)
-                y = targets.to(self.device)
-                optimizer.zero_grad()
-                out = eaat(x)
-                loss = xEnt(out, y) + eaat.get_technique_cost(x)
-                loss.backward()
-                optimizer.step()
-                lossavg.append(loss.item())
-            losscurve.append(np.nanmedian(lossavg))
-            if testx is not None and testy is not None:
-                pred, acc = self.predict(testx,
-                                         testy,
-                                         eaat)
-                evalcurve.append(acc)
-
-        if testx is not None and testy is not None:
-            max_acc = np.max(evalcurve[-25:])
+        clf = ShadowCNN(params=params,
+                        random_state=self.random_state,
+                        length=trainx.shape[1])
+        # training and testing
+        losscurve, evalcurve = clf.train(trainx, trainy, Ux, testx, testy)
+        # not used; max acc in past few epochs used instead
+        y_pred, acc = clf.predict(testx, testy)
+        max_acc = np.max(evalcurve[-25:])
 
         return {'loss': 1-(max_acc/100.0),
                 'status': STATUS_OK,
-                'model': eaat,
+                'model': clf.eaat,
                 'params': params,
                 'losscurve': losscurve,
                 'evalcurve': evalcurve,
@@ -396,15 +342,13 @@ def train(self, trainx, trainy, Ux, testx=None, testy=None):
                 lossavg.append(loss.item())
             losscurve.append(np.nanmedian(lossavg))
             if testx is not None and testy is not None:
-                pred, acc = self.predict(testx,
-                                         testy,
-                                         self.eaat)
+                pred, acc = self.predict(testx, testy)
                 evalcurve.append(acc)
 
         # optionally return the training accuracy if test data was provided
         return losscurve, evalcurve
 
-    def predict(self, testx, testy=None, eaat=None):
+    def predict(self, testx, testy=None):
         '''
         Wrapper method for Shadow NN predict method.
         Inputs:
@@ -413,21 +357,15 @@ def predict(self, testx, testy=None, eaat=None):
             optional: if included, the predicted classes -and-
                 the resulting classification accuracy will be returned.
         binning: int number of bins sampled in feature vector
-        model: optional input for testing a given model in hyperparameter
-            optimization rather than the class saved model.
         '''
 
-        if eaat is not None:
-            eval_model = eaat
-        else:
-            eval_model = self.eaat
-        eval_model.eval()
+        self.eaat.eval()
         y_pred, y_true = [], []
         for i, data in enumerate(torch.FloatTensor(
                 testx.copy()[:, ::self.params['binning']])
                 ):
             x = data.reshape((1, 1, data.shape[0])).to(self.device)
-            out = eval_model(x)
+            out = self.eaat(x)
             y_pred.extend(torch.argmax(out, 1).detach().cpu().tolist())
         acc = None
         if testy is not None:
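
Note that predict() no longer accepts an eaat argument; it always evaluates the EAAT model stored on the instance, which is why fresh_start now reports clf.eaat. An illustrative call sequence (array names assumed):

clf = ShadowCNN(params=params, random_state=0, length=trainx.shape[1])
losscurve, evalcurve = clf.train(trainx, trainy, Ux, testx, testy)
# evaluates the saved self.eaat model; returns predictions and accuracy
y_pred, acc = clf.predict(testx, testy)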

models/SSML/ShadowNN.py

Lines changed: 8 additions & 49 deletions
@@ -104,59 +104,18 @@ def fresh_start(self, params, data_dict):
         # unlabeled co-training data
         Ux = data_dict['Ux']
 
-        eaat = shadow.eaat.EAAT(model=self.model_factory(
-                                    testx[:, ::params['binning']].shape[1],
-                                    params['hidden_layer']),
-                                alpha=params['alpha'],
-                                xi=params['xi'],
-                                eps=params['eps']).to(self.device)
-        eaat_opt = torch.optim.SGD(eaat.parameters(),
-                                   lr=params['lr'],
-                                   momentum=params['momentum'])
-        xEnt = torch.nn.CrossEntropyLoss(ignore_index=-1).to(self.device)
-
-        # avoid float round-off by using DoubleTensor
-        xtens = torch.FloatTensor(np.append(trainx,
-                                            Ux,
-                                            axis=0)[:, ::params['binning']])
-        # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10
-        ytens = torch.LongTensor(np.append(trainy,
-                                           np.full(shape=(Ux.shape[0],),
-                                                   fill_value=-1),
-                                           axis=0))
-
-        n_epochs = 100
-        xt = torch.Tensor(xtens).to(self.device)
-        yt = torch.LongTensor(ytens).to(self.device)
-        # saves history for max accuracy
-        acc_history = []
-        # set the model into training mode
-        # NOTE: change this to .eval() mode for testing and back again
-        eaat.train()
-        for epoch in range(n_epochs):
-            # Forward/backward pass for training semi-supervised model
-            out = eaat(xt)
-            # supervised + unsupervised loss
-            loss = xEnt(out, yt) + eaat.get_technique_cost(xt)
-            eaat_opt.zero_grad()
-            loss.backward()
-            eaat_opt.step()
-
-            eaat.eval()
-            eaat_pred = torch.max(eaat(
-                                    torch.FloatTensor(
-                                        testx.copy()[:, ::params['binning']]
-                                    )
-                                  ), 1)[-1]
-            acc = shadow.losses.accuracy(eaat_pred,
-                                         torch.LongTensor(testy.copy())
-                                         ).data.item()
-            acc_history.append(acc)
+        clf = ShadowNN(params=params,
+                       random_state=self.random_state,
+                       input_length=testx.shape[1])
+        # training and testing
+        acc_history = clf.train(trainx, trainy, Ux, testx, testy)
+        # not used; max acc in past few epochs used instead
+        eaat_pred, acc = clf.predict(testx, testy)
         max_acc = np.max(acc_history[-20:])
 
         return {'loss': 1-(max_acc/100.0),
                 'status': STATUS_OK,
-                'model': eaat,
+                'model': clf.eaat,
                 'params': params,
                 'accuracy': (max_acc/100.0)}
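
For hyperparameter optimization, ShadowNN's objective scores a trial by the best accuracy over the last 20 epochs of the returned history (accuracies are reported in percent), converted to a loss. A short illustration of that bookkeeping (clf and the arrays are assumed to exist as above):

import numpy as np

acc_history = clf.train(trainx, trainy, Ux, testx, testy)
max_acc = np.max(acc_history[-20:])   # best accuracy (percent) near the end
loss = 1 - (max_acc / 100.0)          # quantity hyperopt minimizes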

tests/test_models.py

Lines changed: 1 addition & 1 deletion
@@ -169,7 +169,7 @@ def test_CoTraining():
 
     # default behavior
     model = CoTraining(params=None, random_state=0)
-    model.train(X_train, y_train, Ux, seed=0)
+    model.train(X_train, y_train, Ux)
 
     # testing train and predict methods
    pred, acc, *_ = model.predict(X_test, y_test)
