#This module is standalone and not imported by any other module.
#It uses two datasets: LFW and GTDB.
import datetime
import os
import shutil
#from sklearn.neighbors import KNeighborsClassifier
from argparse import ArgumentParser

import cv2
#from queue import Queue #FIFO
#from threading import Thread
import numpy as np
#from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

#from utils import progress_bar
from biocapsule import BioCapsuleGenerator
from face import ArcFace, extract_dataset

np.random.seed(42)

def filter_lfw(features): #only used in this module; second input features_flip removed by Kai
    #drop every subject that has fewer than 5 images
    y = np.unique(features[:, -1])
    mask = np.ones(features[:, -1].shape, dtype=bool)
    for y_i in y:
        if features[features[:, -1] == y_i].shape[0] < 5:
            idxes = np.where(features[:, -1] == y_i)
            mask[idxes] = False
    features = features[mask]
    #features_flip = features_flip[mask]

    #relabel the surviving subjects with consecutive 1-based ids
    y_map = {}
    y = np.unique(features[:, -1])
    for i, y_i in enumerate(y):
        y_map[y_i] = i + 1

    for i in range(features[:, -1].shape[0]):
        features[i, -1] = y_map[features[i, -1]]
        #features_flip[i, -1] = y_map[features_flip[i, -1]]

    #return features, features_flip #second return value features_flip removed by Kai
    return features

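#A minimal sketch (added for illustration; _demo_filter_lfw is a hypothetical
#helper, never called) of what filter_lfw does on made-up toy data:
def _demo_filter_lfw():
    #subject 7.0 has five rows (kept); subject 9.0 has only two rows (dropped)
    toy = np.array([[0.1, 0.2, 7.0]] * 5 + [[0.3, 0.4, 9.0]] * 2)
    out = filter_lfw(toy)
    assert out.shape == (5, 3) #only subject 7.0's rows survive
    assert np.all(out[:, -1] == 1) #surviving subject is relabeled to 1-based id 1
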
#Returns a 2D array of shape (6, 512): one feature vector per reference subject.
def get_rs_features(): #only used in this module
    arcface = ArcFace() #an ArcFace object; this can be swapped for a different feature extraction method

    # if os.path.isdir(os.path.join(os.path.abspath(""), "images", "rs_aligned")):
    #     shutil.rmtree(os.path.join(os.path.abspath(""), "images", "rs_aligned"))

    rs_features = np.zeros((6, 512))
    #os.mkdir(os.path.join(os.path.abspath(""), "images", "rs_aligned"))
    #listdir should return 10 directory names rs_00 to rs_09, so [4:] keeps rs_04 to rs_09;
    #sorted() is needed because os.listdir returns names in arbitrary order
    for s_id, subject in enumerate(sorted(os.listdir(os.path.join(os.path.abspath(""), "images", "rs")))[4:]):
        for image in os.listdir(os.path.join(os.path.abspath(""), "images", "rs", subject)): #image will be sth like rs_04.jpg ... rs_09.jpg; subject is rs_04 ... rs_09
            img = cv2.imread(os.path.join(os.path.abspath(""), "images", "rs", subject, image)) #img is a numpy.ndarray
            img_aligned = arcface.preprocess(img) #aligned image containing just the facial region (five facial landmarks)
            feature = arcface.extract(img_aligned, align=False) #extract returns a row vector of 512 elements
            rs_features[s_id] = feature

            if img_aligned.shape != (3, 112, 112): #possibly redundant, since extract has already done this?
                img_aligned = cv2.resize(img_aligned, (112, 112))
                img_aligned = np.rollaxis(cv2.cvtColor(img_aligned, cv2.COLOR_RGB2BGR), 2, 0)

            #cv2.imwrite(os.path.join(os.path.abspath(""), "images", "rs_aligned", image), cv2.cvtColor(np.rollaxis(img_aligned, 0, 3), cv2.COLOR_RGB2BGR))

    return rs_features

#yLen is the number of subjects (LFW: 423; GTDB: 50).
#Returns a vector of length yLen with random role values in 0~5.
def rs_rbac(yLen, dist): #only used in this module; REVISED BY KAI
    if dist == "unbal":
        rs_map = np.random.choice(6, yLen, p=[0.05, 0.1, 0.15, 0.2, 0.25, 0.25])
    else:
        rs_map = np.random.choice(6, yLen)
    return rs_map

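#Hypothetical sanity check (never called) for rs_rbac; the subject count 423
#matches the filtered-LFW figure noted above:
def _demo_rs_rbac():
    rs_map = rs_rbac(423, "unbal")
    assert rs_map.shape == (423,)
    assert rs_map.min() >= 0 and rs_map.max() <= 5
    #np.bincount(rs_map, minlength=6) gives the subjects-per-role histogram
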
#Returns biocapsules.
#Input features is a 2D array: rows are images; 513 columns (512 features plus subject_id).
#Input rs_features is of shape (6, 512).
#Original input rs_map was also removed by Kai.
def get_bcs(features, rs_features): #only used in this module; second input features_flip and second return value bcs_flip removed by Kai
    bcs = np.zeros((rs_features.shape[0], features.shape[0], 513)) #3D array of shape (6, image_count, 513)
    #bcs_flip = np.zeros((rs_features.shape[0], features_flip.shape[0], 513))
    bc_gen = BioCapsuleGenerator()
    for i in range(rs_features.shape[0]): #i: 0~5
        #features[:, :-1] is of shape (image_count, 512); the appended last column features[:, -1][:, np.newaxis] is subject_id
        bcs[i, :, :] = np.hstack([bc_gen.biocapsule_batch(features[:, :-1], rs_features[i]), features[:, -1][:, np.newaxis]]) #note: the features 2D array gets updated here (but its last column stays the same)!
        #bcs_flip[i, :, :] = np.hstack([bc_gen.biocapsule_batch(features_flip[:, :-1], rs_features[i]), features_flip[:, -1][:, np.newaxis]])
    #return bcs, bcs_flip #second return value bcs_flip removed by Kai
    return bcs

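#Hypothetical shape check (never called) for get_bcs, assuming
#BioCapsuleGenerator.biocapsule_batch maps an (n, 512) batch to an (n, 512)
#array, as the hstack above implies; random values stand in for real features:
def _demo_get_bcs():
    feats = np.hstack([np.random.rand(10, 512), np.ones((10, 1))]) #10 images, all of subject 1
    rs_feats = np.random.rand(6, 512)
    bcs = get_bcs(feats, rs_feats)
    assert bcs.shape == (6, 10, 513)
    assert np.all(bcs[:, :, -1] == 1) #last column still carries the subject_id
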
if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("-d", "--dataset", required=True, choices=["lfw", "gtdb"], help="dataset to use in experiment")
    parser.add_argument("-m", "--mode", required=True, choices=["under", "bc"], help="feature mode to use in experiment")
    parser.add_argument("-r", "--role_dist", required=False, choices=["bal", "unbal"], default="unbal", help="role distribution to use in experiment")
    parser.add_argument("-t", "--thread_cnt", required=False, type=int, default=1, help="thread count to use in classifier training")
    parser.add_argument("-gpu", "--gpu", required=False, type=int, default=-1, help="gpu to use in feature extraction")
    args = vars(parser.parse_args())

    if args["mode"] == "under":
        fi = open(os.path.join(os.path.abspath(""), "results", "tps2020_{}_under.txt".format(args["dataset"])), "w")
    else:
        fi = open(os.path.join(os.path.abspath(""), "results", "tps2020_{}_bc_{}.txt".format(args["dataset"], args["role_dist"])), "w")
    print("computing features:", datetime.datetime.now())
    # extract features for experiment: extract_dataset is in face.py
    if args["dataset"] == "lfw":
        features = np.load(os.path.join(os.path.abspath(""), "data", "lfw_arcface_feat.npz"))["arr_0"]
    elif args["dataset"] == "gtdb":
        features = np.load(os.path.join(os.path.abspath(""), "data", "gtdb_arcface_feat.npz"))["arr_0"]
    else: #unreachable given the argparse choices; kept as a fallback
        features = extract_dataset(args["dataset"], "arcface", args["gpu"]) #second return value features_flip removed by Kai
    # features is a 2D array: rows are images; 513 columns (a 512-dim feature vector plus a 1-based subject_id in the last column)
    print("done computing features", datetime.datetime.now())
    # remove all subjects with fewer than 5 images from the LFW dataset
    if args["dataset"] == "lfw": #filter_lfw is in this module
        print("filtering lfw features.")
        features = filter_lfw(features) #second input and return value features_flip removed by Kai

    # if biocapsules are used, we can perform the authn-authz operation using reference subjects
    if args["mode"] == "bc":
        # get reference subjects for roles; get_rs_features is in this module
        print("computing bcs:", datetime.datetime.now())
        rs_features = get_rs_features() #a 2D array of shape (6, 512)

        # assign subjects their reference subjects/roles; rs_rbac is in this module
        rs_map = rs_rbac(len(np.unique(features[:, -1])), args["role_dist"]) #each element (0~5) of rs_map (length: number of subjects) is the reference subject/role assigned to one subject
        cnts = np.unique(rs_map, return_counts=True)[1]
        for i, cnt in enumerate(cnts): #histogram: how many subjects hold each role 0~5
            fi.write("Role {} -- {} Subjects\n".format(i + 1, cnt))

        # create all possible biocapsules: get_bcs is in this module; note: features gets updated by the get_bcs call
        bcs = get_bcs(features, rs_features) #second input features_flip, fourth input rs_map, and second return value bcs_flip removed by Kai

    # tn, fp, fn, tp
    #conf_mat = np.zeros((4,))
    ctp = 0
    ctn = 0
    cfp = 0
    cfn = 0
    cfp1 = 0
    print("begin skf...", datetime.datetime.now())
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for k, (train_index, test_index) in enumerate(skf.split(features[:, :-1], features[:, -1])): #k runs 0 to 4; test_index has length image_count/5; train_index has length image_count*4/5
        print("fold", k)
        if args["mode"] == "under":
            X_train = features[:, :-1][train_index] #2D array of shape (train_image_count, 512)
            y_train = features[:, -1][train_index] #vector of subject_ids of length train_image_count
            X_test = features[:, :-1][test_index] #2D array of shape (test_image_count, 512)
            y_test = features[:, -1][test_index] #vector of subject_ids of length test_image_count
            # labels = np.unique(y_train) #a vector of unique subject_ids
            # labels_test = np.unique(y_test)
            # assert labels.size == labels_test.size
            # knn = KNeighborsClassifier() #typically no better than LR?
            # print("fold", k, "KNN score:", knn.fit(X_train, y_train).score(X_test, y_test))
            clf = LogisticRegression(class_weight="balanced", random_state=42).fit(X_train, y_train)
            #print("fold", k, "LR score:", clf.score(X_test, y_test))
            y_pred = clf.predict(X_test)
            for j in range(len(test_index)):
                if y_pred[j] == y_test[j]:
                    ctp += 1
                else: #in closed-set identification, each misclassification counts as both a false negative for the true subject and a false positive for the predicted subject
                    cfn += 1
                    cfp += 1
        else: #args["mode"] == "bc"
            for i in range(len(rs_features)): #i: 0~5
                X_train = bcs[i, :, :-1][train_index]
                y_train = bcs[i, :, -1][train_index] #by construction of bcs, equivalent to features[:, -1][train_index]
                X_test = bcs[i, :, :-1][test_index]
                y_test = bcs[i, :, -1][test_index] #by construction of bcs, equivalent to features[:, -1][test_index]
                # knn = KNeighborsClassifier() #typically no better than LR?
                # print("fold", k, "rs", i, "KNN score:", knn.fit(X_train, y_train).score(X_test, y_test))
                clf = LogisticRegression(class_weight="balanced", random_state=42).fit(X_train, y_train)
                #print("fold", k, "rs", i, "LR score:", clf.score(X_test, y_test))
                y_pred = clf.predict(X_test)
                #indices = [idx + 1 for idx, el in enumerate(rs_map) if el == i] #subject ids assigned rs role i
                for j in range(len(test_index)):
                    if rs_map[int(y_test[j] - 1)] == i: #subject y_test[j] is known to be in role i
                        if y_pred[j] == y_test[j]:
                            ctp += 1
                        else:
                            cfn += 1
                    else: #subject y_test[j] is known to not be in role i
                        if y_pred[j] != y_test[j]:
                            cfp1 += 1
                            if rs_map[int(y_pred[j] - 1)] == i:
                                cfp += 1
                #labels = np.unique(y_train[0]) #y_train[0] is the first row of y_train: unique 1-based subject ids; equivalently, we could use y_train[1~5]

    print("ctp =", ctp)
    print("ctn =", ctn)
    print("cfp =", cfp)
    print("cfn =", cfn)
    print("cfp1 =", cfp1)
    # (tn + tp) / (tn + fp + fn + tp)
    # acc = (conf_mat[0] + conf_mat[3]) / np.sum(conf_mat)
    # # fp / (tn + fp)
    # far = conf_mat[1] / (conf_mat[0] + conf_mat[1])
    # # fn / (fn + tp)
    # frr = conf_mat[2] / (conf_mat[2] + conf_mat[3])

    fi.write("Dataset -- {}\n".format(args["dataset"]))
    fi.write("BC -- {}\n".format(args["mode"]))
    fi.write("RS -- {}\n".format(args["role_dist"]))
    # fi.write("TN -- {:.6f}\n".format(conf_mat[0]))
    # fi.write("TP -- {:.6f}\n".format(conf_mat[3]))
    # fi.write("FP -- {:.6f}\n".format(conf_mat[1]))
    # fi.write("FN -- {:.6f}\n".format(conf_mat[2]))
    # fi.write("ACC -- {:.6f}\n".format(acc))
    # fi.write("FAR -- {:.6f}\n".format(far))
    # fi.write("FRR -- {:.6f}\n".format(frr))
    fi.close()

#On LFW, I got fp,fn = 5,5 (under); 27,29 (bc bal); 31,50 (bc unbal).
#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
#confusion_matrix is a function that computes a confusion matrix to evaluate classification accuracy.
#By definition, a confusion matrix C is such that C_i,j equals the number of observations known to be in group i and predicted to be in group j.
#Thus in binary classification, the count of true negatives is C_0,0, false negatives is C_1,0, true positives is C_1,1, and false positives is C_0,1.
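#Hypothetical worked example of that layout (never called); confusion_matrix
#and .ravel() are real sklearn/numpy APIs, but the label vectors are made up:
def _demo_confusion_matrix():
    from sklearn.metrics import confusion_matrix
    y_true = [0, 1, 0, 1]
    y_pred = [1, 1, 0, 1]
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    assert (tn, fp, fn, tp) == (1, 1, 0, 2)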