
Commit 4fb37ee: Added PCA option to DAC model.
Parent: 10464fe

15 files changed, +795 -204 lines

docker/compose/slycat-compose/requirements.txt

+1 -0

@@ -9,6 +9,7 @@ configparser
 future
 routes
 scipy
+sklearn
 npTDMS
 pandas
 pysmb
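
The new sklearn dependency supports the PCA option added by this commit. As a rough sketch only (names are hypothetical, and the commit's actual PCA computation is in files not shown in this diff), the kind of projection it enables:

# Illustrative sketch, not the commit's code: sklearn-based PCA
# projection of time-series data down to a few coordinates.
import numpy
from sklearn.decomposition import PCA

def pca_coords(variable, num_pca_comps=10):
    # variable: (num_points, num_time_steps) matrix of time series
    # returns: (num_points, num_pca_comps) reduced coordinates
    return PCA(n_components=num_pca_comps).fit_transform(variable)

coords = pca_coords(numpy.random.rand(100, 500), num_pca_comps=10)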

web-client/slycat/web/client/dac_tdms.py

+36 -18

@@ -88,21 +88,27 @@ def check_parser_parms (parms):
             "for min-num-shots and try again.")

     # fourth parameter is number of landmarks
-    if not (parms[3] == 0 or parms[3] >= 3):
-        check_parser_msg.append("Number of landmarks must be zero or >= 3. Please " + \
-            "provide a valid number of landmarks and try again.")
-
-    # fifth parameter is expected type
-    if parms[4] != "General" and \
-       parms[4] != "Overvoltage" and \
-       parms[4] != "Sprytron":
+    if parms[3] is not None:
+        if not (parms[3] == 0 or parms[3] >= 3):
+            check_parser_msg.append("Number of landmarks must be zero or >= 3. Please " + \
+                "provide a valid number of landmarks and try again.")
+
+    # fifth parameter is number of PCA components
+    if parms[4] < 2:
+        check_parser_msg.append("Number of PCA components must be >= 2. Please provide " + \
+            "a valid number of PCA components and try again.")
+
+    # sixth parameter is expected type
+    if parms[5] != "General" and \
+       parms[5] != "Overvoltage" and \
+       parms[5] != "Sprytron":
         check_parser_msg.append ('Expected data type must be one of "General", ' + \
             '"Overvoltage" or "Sprytron". Please use one of those options ' + \
             'and try again.')

-    # sixth parameter is union or intersection (combination of time series)
-    if parms[5] != "Union" and \
-       parms[5] != "Intersection":
+    # seventh parameter is union or intersection (combination of time series)
+    if parms[6] != "Union" and \
+       parms[6] != "Intersection":
         check_parser_msg.append ('Available methods for combining mismatched, ' + \
             'time points are "Union" and "Intersection". Please use one of those options ' + \
             'and try again.')

@@ -270,6 +276,7 @@ def create_model (arguments, log):
     # populate parameters
     parser_parms = [arguments.min_time_points, arguments.min_channels,
                     arguments.min_num_shots, arguments.num_landmarks,
+                    arguments.num_PCA_comps,
                     shot_type, union_type,
                     not arguments.do_not_infer_channel_units,
                     not arguments.do_not_infer_time_units]

@@ -279,6 +286,13 @@ def create_model (arguments, log):
     if check_parser_error != "":
         raise TDMSUploadError(check_parser_error)

+    # landmarks overrides PCA comps
+    if arguments.num_landmarks is not None:
+        parser_parms[4] = False
+    else:
+        parser_parms[4] = True
+        parser_parms[3] = arguments.num_PCA_comps
+
     # compile suffixes to include if .zip file
     dac_parser = "dac-tdms-file-parser"
     if file_type == "zip":
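
The override above, isolated as a runnable sketch (function name and sample values invented for illustration): slot 4 of parser_parms is turned into a use-PCA flag, and slot 3 carries the PCA component count whenever no landmarks were requested.

# Sketch of the landmarks/PCA override; the input list mirrors
# parser_parms slots 0-8 with made-up values.
def apply_landmark_pca_override(parms, num_landmarks, num_PCA_comps):
    if num_landmarks is not None:
        parms[4] = False          # landmarks given: switch PCA off
    else:
        parms[4] = True           # no landmarks: use PCA ...
        parms[3] = num_PCA_comps  # ... and carry the component count
    return parms

print(apply_landmark_pca_override(
    [10, 2, 1, None, 10, "General", "Union", True, True], None, 10))
# -> [10, 2, 1, 10, True, 'General', 'Union', True, True]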
@@ -328,18 +342,19 @@ def create_model (arguments, log):
         log("\t%s" % suffix)

     log("Including TDMS file suffixes:")
-    for suffix in parser_parms[8]:
+    for suffix in parser_parms[9]:
         log("\t%s" % suffix)

     # next list common parameters
     log("Minimum number of time steps per channel: %s" % parser_parms[0])
     log("Minimum number of channels: %s" % parser_parms[1])
     log("Minimum number of shots: %s" % parser_parms[2])
     log("Number of landmarks: %s" % parser_parms[3])
-    log("Expecting TDMS data type: %s" % parser_parms[4])
-    log("Combining mismatched time steps using: %s" % parser_parms[5])
-    log("Infer channel units: %s" % parser_parms[6])
-    log("Infer time units: %s" % parser_parms[7])
+    log("Number of PCA components: %s" % parser_parms[4])
+    log("Expecting TDMS data type: %s" % parser_parms[5])
+    log("Combining mismatched time steps using: %s" % parser_parms[6])
+    log("Infer channel units: %s" % parser_parms[7])
+    log("Infer time units: %s" % parser_parms[8])

     # upload model file(s)
     mid = upload_model (arguments, dac_parser, parser_parms, file_list, progress=True)

@@ -398,9 +413,12 @@ def parser ():
                         help="Channels must occur in at least this many shots, integer >= 0. " +
                              "Use zero to indicate that channel must occur in every shot. " +
                              "Default: %(default)s.")
-    parser.add_argument("--num-landmarks", default=200, type=int,
+    parser.add_argument("--num-landmarks", default=None, type=int,
                         help="Number of landmarks to use, integer >= 3. Can also use zero " +
-                             "to indicate use of full dataset (no landmarks).")
+                             "to indicate use of full dataset (no landmarks). Default: %(default)s.")
+    parser.add_argument("--num-PCA-comps", default=10, type=int,
+                        help="Number of PCA components to use, integer >= 2. Note --num-landmarks " +
+                             "overrides --num-PCA-comps. Default: %(default)s.")
     parser.add_argument("--overvoltage", action="store_true",
                         help="Expecting overvoltage data.")
     parser.add_argument("--sprytron", action="store_true",
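
A standalone sketch of how the two flags above behave (only these two options are declared here; the real parser() defines many more):

# Standalone argparse sketch of the new/changed flags.
import argparse

p = argparse.ArgumentParser()
p.add_argument("--num-landmarks", default=None, type=int)
p.add_argument("--num-PCA-comps", default=10, type=int)

# argparse exposes "--num-PCA-comps" as the attribute num_PCA_comps
args = p.parse_args(["--num-PCA-comps", "5"])
print(args.num_landmarks, args.num_PCA_comps)   # None 5
# With --num-landmarks left at its None default, create_model() takes
# the PCA branch; supplying --num-landmarks switches PCA off.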

web-server/plugins/slycat-dac/dac-generic-file-parser.py

+51 -9

@@ -21,6 +21,9 @@
 # for dac_compute_coords.py and dac_upload_model.py
 import imp

+# for error logging
+import cherrypy
+
 # note this version assumes the first row is a header row, and keeps only the header
 # and data (called by the generic zip parser)
 def parse_table_file(file):

def parse_table_file(file):
@@ -121,7 +124,7 @@ def parse_mat_file(file):
121124
# parse file using comma delimiter
122125
rows = [row for row in csv.reader(file.decode().splitlines(), delimiter=",", doublequote=True,
123126
escapechar=None, quotechar='"', quoting=csv.QUOTE_MINIMAL, skipinitialspace=True)]
124-
127+
125128
# check that we have a matrix
126129
num_rows = len(rows)
127130
num_cols = len(rows[0])
@@ -287,6 +290,7 @@ def parse_gen_zip(database, model, input, files, aids, **kwargs):
     # look for one occurrence (only) of .dac file and var, dist, and time directories
     dac_file = ""
     landmarks_file = ""
+    pca_file = ""
     var_meta_file = ""
     var_files = []
     dist_files = []

@@ -308,6 +312,10 @@ def parse_gen_zip(database, model, input, files, aids, **kwargs):
             if zip_file == "landmarks.csv":
                 landmarks_file = zip_file

+            # is it "pca.csv"?
+            if zip_file == "pca.csv":
+                pca_file = zip_file
+
         # found a directory -- is it "var/"?
         elif head == "var":

@@ -395,23 +403,23 @@ def parse_gen_zip(database, model, input, files, aids, **kwargs):

     # check var file names
     num_vars = len(meta_vars)
-    check_file_names(database, model, parse_error_log, dac_error,
+    check_file_names(database, model, dac_error, parse_error_log,
                      "var/variable_", ".var", num_vars, var_files,
                      "missing variable_*.var file(s).")

     parse_error_log = dac_error.update_parse_log (database, model, parse_error_log, "Progress",
                                                   "Checked DAC variable file names.")

     # check time file names
-    check_file_names(database, model, parse_error_log, dac_error,
+    check_file_names(database, model, dac_error, parse_error_log,
                      "time/variable_", ".time", num_vars, time_files,
                      "missing variable_*.time file(s).")

     parse_error_log = dac_error.update_parse_log (database, model, parse_error_log, "Progress",
                                                   "Checked DAC time file names.")

     # check dist file names
-    check_file_names(database, model, parse_error_log, dac_error,
+    check_file_names(database, model, dac_error, parse_error_log,
                      "dist/variable_", ".dist", num_vars, dist_files,
                      "missing variable_*.dist file(s).")

@@ -428,19 +436,32 @@ def parse_gen_zip(database, model, input, files, aids, **kwargs):
     landmarks = None
     if landmarks_file != "":

-        # parse variables.meta file
+        # parse landmarks.csv file
         attr, dim, landmarks = parse_mat_file(zip_ref.read(landmarks_file))

     else:

         parse_error_log = dac_error.update_parse_log (database, model, parse_error_log, "Progress",
                                                       "No landmarks.csv file found, using all data points.")

+    # load pca-comps file
+    pca_comps = None
+    if pca_file != "":
+
+        # parse pca.csv file
+        attr, dim, pca_comps = parse_mat_file(zip_ref.read(pca_file))
+
+    else:
+
+        parse_error_log = dac_error.update_parse_log (database, model, parse_error_log, "Progress",
+                                                      "No pca.csv file found, using MDS algorithm.")
+
     # now start thread to prevent timing out on large files
     stop_event = threading.Event()
     thread = threading.Thread(target=parse_gen_zip_thread,
                               args=(database, model, zip_ref, dac_error, parse_error_log,
-                                    meta_var_col_names, meta_vars, landmarks, dac_file, stop_event))
+                                    meta_var_col_names, meta_vars, landmarks, pca_comps,
+                                    dac_file, stop_event))
     thread.start()
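
The thread-plus-event pattern used above, reduced to its essentials (worker body omitted):

# Minimal sketch of the stop_event pattern: the caller returns
# immediately so the browser request does not time out, and the
# worker signals its own completion.
import threading

def worker(stop_event):
    # ... long-running zip parsing would go here ...
    stop_event.set()   # worker marks itself done

stop_event = threading.Event()
threading.Thread(target=worker, args=(stop_event,)).start()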

@@ -463,7 +484,8 @@ def check_file_names (database, model, dac_error, parse_error_log,

 # gen zip parsing thread to prevent time outs by browser
 def parse_gen_zip_thread(database, model, zip_ref, dac_error, parse_error_log,
-                         meta_var_col_names, meta_vars, landmarks, dac_file, stop_event):
+                         meta_var_col_names, meta_vars, landmarks, pca_comps,
+                         dac_file, stop_event):

     # put entire thread into a try-except block in order to report errors
     try:

480502
# number of data points
481503
num_datapoints = len(meta_rows)
482504

505+
# do pca check (pca over-rides landmarks)
506+
use_coordinates=False
507+
if pca_comps is not None:
508+
509+
num_pca_comps = int(numpy.round(pca_comps[0]))
510+
511+
# check that pca comps is at least two
512+
if num_pca_comps < 2:
513+
514+
dac_error.quit_raise_exception(database, model, parse_error_log,
515+
'Number of PCA components must be at least two.')
516+
517+
# set as number of landmarks
518+
num_landmarks = num_pca_comps
519+
use_coordinates = True
520+
521+
parse_error_log = dac_error.update_parse_log (database, model, parse_error_log, "Progress",
522+
"Using " + str(num_pca_comps) + " PCA components.")
523+
483524
# do landmark checks
484-
if landmarks is not None:
525+
elif landmarks is not None:
485526

486527
num_landmarks = len(landmarks)
487528
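
Reading the check above back into a file format: pca_comps comes from parse_mat_file on pca.csv, and only pca_comps[0] is consumed, so a single-value file appears sufficient. A hedged sketch, inferred from the parsing code (the commit does not document the format):

# Inferred sketch of a pca.csv the thread above would accept; the
# format is not documented in this commit.
with open("pca.csv", "w") as f:
    f.write("10\n")   # request 10 PCA components; must be >= 2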

@@ -589,7 +630,8 @@ def parse_gen_zip_thread(database, model, zip_ref, dac_error, parse_error_log,
         push.init_upload_model(database, model, dac_error, parse_error_log,
                                meta_column_names, meta_rows,
                                meta_var_col_names, meta_vars,
-                               variable, time_steps, var_dist, landmarks=landmarks)
+                               variable, time_steps, var_dist,
+                               landmarks=landmarks, use_coordinates=use_coordinates)

         # done -- destroy the thread
         stop_event.set()
