Skip to content

Commit 2aeac30

Browse files
committed
Bug fix and more realistic test
1 parent bf43afb commit 2aeac30

File tree

4 files changed

+48
-17
lines changed

4 files changed

+48
-17
lines changed

dsbox/overfitdetector/detector.py

Lines changed: 22 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@ class Detector(SupervisedLearnerPrimitiveBase[Input, Output, Params]):
4242
def __init__(self, *, n_sample_instances: int = 500, n_sample_iterations: int = 100, columns: list = list(),
4343
model: SupervisedLearnerPrimitiveBase = None) -> None:
4444
super().__init__()
45-
logging.basicConfig(level=logging.INFO)
45+
logging.basicConfig(level=logging.DEBUG)
4646
self.__logger = logging.getLogger(__name__)
4747
self.n_sample_iterations = n_sample_iterations
4848
self.n_sample_instances = n_sample_instances
@@ -373,19 +373,33 @@ def find_matching_rows(self, row_values, columns, training_data, num_neighbors=1
373373

374374
matches = training_data
375375
for (col, val) in zip(col_set, vals_for_col_set):
376+
tmp_matches = []
376377
if self.is_number(val):
377-
lower_bound = float(val) * (1.0 - real_value_extend)
378-
upper_bound = float(val)*(1.0 + real_value_extend)
379-
matches = matches[matches[col] >= lower_bound]
380-
matches = matches[matches[col] <= upper_bound]
378+
if isinstance(val, float):
379+
lower_bound = float(val) * (1.0 - real_value_extend)
380+
upper_bound = float(val) * (1.0 + real_value_extend)
381+
tmp_matches_lwr = matches[matches[col] >= lower_bound]
382+
tmp_matches_upr = matches[matches[col] <= upper_bound]
383+
tmp_matches = pandas.concat([tmp_matches_lwr, tmp_matches_upr])
384+
else:
385+
tmp_matches = matches[matches[col] == val]
381386
else:
382387
if val.isalnum():
383-
matches = matches[matches[col] == '%s' % val]
388+
tmp_matches = matches[matches[col] == '%s' % val]
384389
else:
385-
matches = matches[matches[col] == val]
390+
tmp_matches = matches[matches[col] == val]
391+
392+
if len(tmp_matches) > 0: # do the conjunction, but only as long as we have some data...
393+
matches = tmp_matches
394+
else:
395+
break
396+
397+
self.__logger.debug("Dataframe query: (%s, %s) found %d matches." % (col, str(val), len(matches)))
398+
#print("Dataframe query: (%s, %s) found %d matches." % (col, str(val), len(matches)))
386399

387-
self.__logger.debug("Dataframe query: %s found %d matches." % (qry_string, len(matches)))
388400
if len(matches) > 0:
401+
self.__logger.debug("Col set yielded %d matches total" % (len(matches)))
402+
#print("Col set yielded %d matches total" % (len(matches)))
389403
all_matches = all_matches.append(matches)
390404

391405
if len(all_matches) > 0:
41.3 KB
Binary file not shown.
2.35 KB
Binary file not shown.

tests/dsbox/overfit-detector/test_detector.py

Lines changed: 26 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import numpy as np
66
from dsbox.overfitdetector.detector import Detector
77
from sklearn.linear_model import LogisticRegression
8+
from sklearn import preprocessing
89
import logging
910
import math
1011

@@ -13,7 +14,10 @@ class Detectorests(unittest.TestCase):
1314
def setUp(self):
1415
self.__detector = Detector()
1516
self.__dir_path = os.getcwd()
16-
self.__detector.set_logger(logging.ERROR)
17+
logging.basicConfig(level=logging.DEBUG)
18+
19+
self.__train_data_file = self.__dir_path+"/tests/dsbox/overfit-detector/test_data/trainData.csv.gz"
20+
self.__train_labels_file = self.__dir_path+"/tests/dsbox/overfit-detector/test_data/trainTargets.csv.gz"
1721

1822
datas = {
1923
"indep1": [],
@@ -48,15 +52,28 @@ def setUp(self):
4852
self.__test_df = pd.DataFrame(datas)
4953

5054
def test_detector(self):
51-
data = np.array([1., 2., 3., 4.])
52-
labels = np.array(1)
53-
for i in range(10):
54-
data = np.vstack([data, [1., 2., 3., 4.]])
55-
labels = np.append(labels, 1)
56-
for i in range(10):
57-
data = np.vstack([data, [2., 3., 4., 5.]])
58-
labels = np.append(labels, 0)
5955

56+
#data = np.array([1., 2., 3., 4.])
57+
#labels = np.array(1)
58+
#for i in range(10):
59+
# data = np.vstack([data, [1., 2., 3., 4.]])
60+
# labels = np.append(labels, 1)
61+
#for i in range(10):
62+
# data = np.vstack([data, [2., 3., 4., 5.]])
63+
# labels = np.append(labels, 0)
64+
65+
data = pd.read_csv(self.__train_data_file, header=0).fillna(0.0).replace('', '0')
66+
del data['d3mIndex']
67+
labels = pd.read_csv(self.__train_labels_file, header=0).fillna(0.0).replace('', '0')['Hall_of_Fame']
68+
69+
# Encode the categorical data in training data
70+
# Encode the categorical data in the test targets, uses the first target of the dataset as a target
71+
trainDataLabelEncoders = dict()
72+
for col in ['Player', 'Position']:
73+
trainDataLabelEncoders[col] = preprocessing.LabelEncoder().fit(data[col])
74+
data[col] = trainDataLabelEncoders[col].transform(data[col])
75+
76+
# Train the model
6077
mdl = LogisticRegression().fit(data, labels)
6178
dd = Detector(n_sample_instances=10, n_sample_iterations=20, columns=['0', '1', '2', '3'], model=mdl)
6279
dd.set_training_data(inputs=data, outputs=labels)

0 commit comments

Comments
 (0)