
Commit d4b9024

Refactor of ML toolkit (#16)
* update to new ml/nlp version
* update notebooks for ml refactor
* update to new format of ml
* fit->transform
1 parent 2ca1268 commit d4b9024

14 files changed: +1053 −1158 lines changed
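In short, the refactor moves the toolkit's preprocessing utilities from single-shot lowercase functions to camelCase fit/transform interfaces. A minimal before/after sketch in q, using only calls that appear in the notebook diff below (the `table` and `targets` variables are those defined in the notebooks):

/ one-hot encoding: .ml.onehot becomes .ml.oneHot.fitTransform
show targets:exec diagnosis_M from .ml.oneHot.fitTransform[targets;cols targets]
/ min-max scaling: .ml.minmaxscaler becomes .ml.minMaxScaler.fitTransform
5#table:.ml.minMaxScaler.fitTransform table
/ train-test split: .ml.traintestsplit becomes .ml.trainTestSplit
show tts:.ml.trainTestSplit[table;targets;.2]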

notebooks/01 Decision Trees.ipynb  +11 −239  (large diff not rendered by default)

notebooks/02 Random Forests.ipynb  +37 −37  (large diff not rendered by default)

notebooks/03 Neural Networks.ipynb  +40 −64  (large diff not rendered by default)

notebooks/04 Dimensionality Reduction.ipynb  +51 −96  (large diff not rendered by default)

notebooks/05 Feature Engineering.ipynb  +10 −10  (large diff not rendered by default)

notebooks/06 Feature Extraction and Selection.ipynb  +172 −111  (large diff not rendered by default)

notebooks/07 Cross Validation.ipynb  +48 −48
@@ -148,7 +148,7 @@
 }
 ],
 "source": [
-"show targets:exec diagnosis_M from .ml.onehot[targets;cols targets]"
+"show targets:exec diagnosis_M from .ml.oneHot.fitTransform[targets;cols targets]"
 ]
 },
 {
@@ -218,7 +218,7 @@
 ],
 "source": [
 "// add second order polynomial features to the table \n",
-"5#table:table^.ml.polytab[table;2]"
+"5#table:table^.ml.polyTab[table;2]"
 ]
 },
 {
@@ -245,7 +245,7 @@
 ],
 "source": [
 "/ complete standard scaling of the dataset to avoid biases due to orders of magnitude in the data\n",
-"5#table:.ml.minmaxscaler table"
+"5#table:.ml.minMaxScaler.fitTransform table"
 ]
 },
 {
@@ -266,7 +266,7 @@
 ],
 "source": [
 "/ complete a train-test-split on the data - below 20% of data is used in the test set\n",
-"show tts:.ml.traintestsplit[table;targets;.2]"
+"show tts:.ml.trainTestSplit[table;targets;.2]"
 ]
 },
 {
@@ -301,7 +301,7 @@
 "a:{.p.import[`sklearn.ensemble][`:RandomForestClassifier]}\n",
 "\n",
 "/ scoring function which takes a function, parameters to apply to that function and data as arguments\n",
-"score_func:.ml.xv.fitscore[a][`n_estimators pykw 500]"
+"score_func:.ml.xv.fitScore[a][`n_estimators pykw 500]"
 ]
 },
 {
@@ -316,16 +316,16 @@
 "Average Model Scores:\n",
 "----------------------------------------------------------------------------\n",
 "Sequential split indices with basic k-fold cross validation: 0.9736264\n",
-"Random split indices with basic k-fold cross validation: 0.9714286\n",
-"Stratified split indices with basic k-fold cross validation: 0.9736736\n"
+"Random split indices with basic k-fold cross validation: 0.9736264\n",
+"Stratified split indices with basic k-fold cross validation: 0.9758714\n"
 ]
 }
 ],
 "source": [
 "/ split data into k-folds and train/validate the model\n",
-"s1:.ml.xv.kfsplit[k;n;xtrain;ytrain;score_func] / sequentially split\n",
-"s2:.ml.xv.kfshuff[k;n;xtrain;ytrain;score_func] / randomized split\n",
-"s3:.ml.xv.kfstrat[k;n;xtrain;ytrain;score_func] / stratified split\n",
+"s1:.ml.xv.kfSplit[k;n;xtrain;ytrain;score_func] / sequentially split\n",
+"s2:.ml.xv.kfShuff[k;n;xtrain;ytrain;score_func] / randomized split\n",
+"s3:.ml.xv.kfStrat[k;n;xtrain;ytrain;score_func] / stratified split\n",
 "\n",
 "-1\"Average Model Scores:\";\n",
 "-1\"----------------------------------------------------------------------------\";\n",
@@ -352,19 +352,19 @@
 "text": [
 "Average Model Scores:\n",
 "----------------------------------------------------------------------------\n",
-"Monte-Carlo cross validation with 5 repetitions and training size of 80%: 0.9714286\n",
-"Repeated stratified cross validation, 5 fold, 5 repetitions: 0.9736264\n",
-"Repeated sequential cross validation, 5 fold, 5 repetitions: 0.9727473\n"
+"Monte-Carlo cross validation with 5 repetitions and training size of 80%: 0.967033\n",
+"Repeated stratified cross validation, 5 fold, 5 repetitions: 0.9727473\n",
+"Repeated sequential cross validation, 5 fold, 5 repetitions: 0.9740659\n"
 ]
 }
 ],
 "source": [
 "p:.2 / percentage of data in validation set\n",
 "n: 5 / number of repetitions\n",
 "\n",
-"r1:.ml.xv.mcsplit[p;n;xtrain;ytrain;score_func]\n",
-"r2:.ml.xv.kfshuff[k;n;xtrain;ytrain;score_func]\n",
-"r3:.ml.xv.kfsplit[k;n;xtrain;ytrain;score_func]\n",
+"r1:.ml.xv.mcSplit[p;n;xtrain;ytrain;score_func]\n",
+"r2:.ml.xv.kfShuff[k;n;xtrain;ytrain;score_func]\n",
+"r3:.ml.xv.kfSplit[k;n;xtrain;ytrain;score_func]\n",
 "\n",
 "-1\"Average Model Scores:\";\n",
 "-1\"----------------------------------------------------------------------------\";\n",
@@ -407,7 +407,7 @@
 "outputs": [],
 "source": [
 "/ new scoring function\n",
-"sf:.ml.xv.fitscore[a]\n",
+"sf:.ml.xv.fitScore[a]\n",
 "\n",
 "/ dictionary of parameters\n",
 "gs_hp:`n_estimators`criterion`max_depth!(10 50 100 500;`gini`entropy;2 5 10 20 30)"
@@ -435,35 +435,35 @@
 "\n",
 "n_estimators criterion max_depth| ..\n",
 "--------------------------------| -------------------------------------------..\n",
-"10 gini 2 | 0.956044 0.967033 0.956044 0.956044 0.9..\n",
-"10 gini 5 | 0.9230769 0.967033 0.956044 0.978022 0.9..\n",
-"10 gini 10 | 0.956044 0.967033 0.9230769 1 0.9..\n",
-"10 gini 20 | 0.967033 0.9450549 0.978022 0.956044 0.9..\n",
-"10 gini 30 | 0.9450549 0.956044 0.9450549 0.978022 0.9..\n",
-"10 entropy 2 | 0.9450549 0.9340659 0.9450549 0.9450549 0.9..\n",
-"10 entropy 5 | 0.956044 0.978022 0.967033 0.9230769 0.9..\n",
-"10 entropy 10 | 0.9450549 0.956044 0.956044 0.978022 0.9..\n",
-"10 entropy 20 | 0.9450549 0.9450549 0.978022 0.9450549 0.9..\n",
-"10 entropy 30 | 0.956044 0.967033 0.967033 0.978022 0.9..\n",
-"50 gini 2 | 0.9340659 0.956044 0.956044 0.967033 0.9..\n",
-"50 gini 5 | 0.956044 0.978022 0.956044 0.989011 0.9..\n",
-"50 gini 10 | 0.956044 0.978022 0.978022 0.967033 0.9..\n",
-"50 gini 20 | 0.956044 0.978022 0.967033 1 0.9..\n",
-"50 gini 30 | 0.9450549 0.967033 0.967033 0.978022 0.9..\n",
-"50 entropy 2 | 0.978022 0.967033 0.956044 0.9340659 0.9..\n",
-"50 entropy 5 | 0.9450549 0.967033 0.967033 0.989011 0.9..\n",
-"50 entropy 10 | 0.956044 0.978022 0.989011 0.967033 0.9..\n",
-"50 entropy 20 | 0.956044 0.978022 1 0.978022 0.9..\n",
-"50 entropy 30 | 0.956044 0.978022 0.978022 0.967033 0.9..\n",
-"100 gini 2 | 0.9450549 0.967033 0.967033 0.9450549 0.9..\n",
-"100 gini 5 | 0.956044 0.967033 0.989011 1 0.9..\n",
+"10 gini 2 | 0.956044 0.956044 0.956044 0.956044 0.9..\n",
+"10 gini 5 | 0.9450549 0.956044 0.956044 0.978022 0.9..\n",
+"10 gini 10 | 0.956044 0.9450549 0.978022 0.9450549 0.9..\n",
+"10 gini 20 | 0.956044 0.967033 0.9340659 0.978022 0.9..\n",
+"10 gini 30 | 0.9340659 0.978022 0.967033 0.9340659 0.9..\n",
+"10 entropy 2 | 0.9450549 0.9450549 0.967033 0.9450549 0.9..\n",
+"10 entropy 5 | 0.967033 0.978022 0.956044 0.9450549 0.9..\n",
+"10 entropy 10 | 0.956044 0.967033 0.967033 0.978022 0.9..\n",
+"10 entropy 20 | 0.9340659 0.967033 0.967033 0.978022 0.9..\n",
+"10 entropy 30 | 0.956044 0.978022 0.967033 0.956044 0.9..\n",
+"50 gini 2 | 0.956044 0.967033 0.967033 0.9340659 0.9..\n",
+"50 gini 5 | 0.956044 0.989011 0.967033 0.978022 0.9..\n",
+"50 gini 10 | 0.956044 0.967033 0.967033 1 0.9..\n",
+"50 gini 20 | 0.956044 0.978022 0.989011 0.967033 0.9..\n",
+"50 gini 30 | 0.956044 0.967033 0.956044 0.978022 0.9..\n",
+"50 entropy 2 | 0.967033 0.967033 0.956044 0.956044 0.9..\n",
+"50 entropy 5 | 0.967033 0.978022 0.967033 0.967033 0.9..\n",
+"50 entropy 10 | 0.978022 0.978022 0.967033 0.967033 0.9..\n",
+"50 entropy 20 | 0.956044 0.967033 0.956044 0.989011 0.9..\n",
+"50 entropy 30 | 0.978022 0.978022 0.9450549 0.989011 0.9..\n",
+"100 gini 2 | 0.967033 0.967033 0.967033 0.9340659 0.9..\n",
+"100 gini 5 | 0.967033 0.967033 0.978022 0.978022 0.9..\n",
 "..\n"
 ]
 }
 ],
 "source": [
 "-1\"Grid search: hyperparameters and resulting score from each fold:\\n\";\n",
-"show gr:.ml.gs.kfsplit[k;n;xtrain;ytrain;sf;gs_hp;0]"
+"show gr:.ml.gs.kfSplit[k;n;xtrain;ytrain;sf;gs_hp;0]"
 ]
 },
 {
@@ -508,7 +508,7 @@
 {
 "data": {
 "text/plain": [
-"`n_estimators`criterion`max_depth!(500;`entropy;5)\n",
+"`n_estimators`criterion`max_depth!(500;`entropy;10)\n",
 "0.9824561\n"
 ]
 },
@@ -518,7 +518,7 @@
 }
 ],
 "source": [
-"-2#.ml.gs.kfsplit[k;n;flip value flip table;targets;sf;gs_hp;.2]"
+"-2#.ml.gs.kfSplit[k;n;flip value flip table;targets;sf;gs_hp;.2]"
 ]
 },
 {
@@ -537,7 +537,7 @@
 "data": {
 "text/plain": [
 "`n_estimators`criterion`max_depth!(500;`gini;10)\n",
-"0.9561404\n"
+"0.9473684\n"
 ]
 },
 "execution_count": 15,
@@ -546,7 +546,7 @@
 }
 ],
 "source": [
-"-2#.ml.gs.kfsplit[k;n;flip value flip table;targets;sf;gs_hp;-.2]"
+"-2#.ml.gs.kfSplit[k;n;flip value flip table;targets;sf;gs_hp;-.2]"
 ]
 },
 {
@@ -612,7 +612,7 @@
 {
 "data": {
 "text/plain": [
-"`n_estimators`criterion`max_depth!(410;`entropy;4)\n",
+"`n_estimators`criterion`max_depth!(130;`entropy;20)\n",
 "0.9912281\n"
 ]
 },
@@ -622,7 +622,7 @@
 }
 ],
 "source": [
-"-2#.ml.rs.kfsplit[k;n;flip value flip table;targets;sf;rdm_hp;.2]"
+"-2#.ml.rs.kfSplit[k;n;flip value flip table;targets;sf;rdm_hp;.2]"
 ]
 },
 {
@@ -653,7 +653,7 @@
 {
 "data": {
 "text/plain": [
-"`n_estimators`criterion`max_depth!(378;`entropy;9)\n",
+"`n_estimators`criterion`max_depth!(316;`entropy;6)\n",
 "0.9824561\n"
 ]
 },
@@ -663,7 +663,7 @@
 }
 ],
 "source": [
-"-2#.ml.rs.kfsplit[k;n;flip value flip table;targets;sf;sbl_hp;.2]"
+"-2#.ml.rs.kfSplit[k;n;flip value flip table;targets;sf;sbl_hp;.2]"
 ]
 },
 {
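Taken together, the renamed cross-validation, grid-search and random-search entry points used in this notebook read as below; a summary sketch assuming the `k`, `n`, `p`, `xtrain`, `ytrain`, `score_func`, `sf` and hyperparameter dictionaries defined earlier in the notebook:

s1:.ml.xv.kfSplit[k;n;xtrain;ytrain;score_func]  / sequential k-fold (was .ml.xv.kfsplit)
s2:.ml.xv.kfShuff[k;n;xtrain;ytrain;score_func]  / shuffled k-fold (was .ml.xv.kfshuff)
s3:.ml.xv.kfStrat[k;n;xtrain;ytrain;score_func]  / stratified k-fold (was .ml.xv.kfstrat)
r1:.ml.xv.mcSplit[p;n;xtrain;ytrain;score_func]  / Monte-Carlo split (was .ml.xv.mcsplit)
gr:.ml.gs.kfSplit[k;n;xtrain;ytrain;sf;gs_hp;0]  / grid search (was .ml.gs.kfsplit)
-2#.ml.rs.kfSplit[k;n;flip value flip table;targets;sf;rdm_hp;.2]  / random search (was .ml.rs.kfsplit)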

notebooks/08 Natural Language Processing.ipynb  +114 −112  (large diff not rendered by default)

notebooks/09 K Nearest Neighbours.ipynb  +6 −6  (large diff not rendered by default)

0 commit comments
