Skip to content

Commit 0611f1c

Browse files
committed
Add support for cross-validation of ML models
1 parent 9cb1a87 commit 0611f1c

File tree

6 files changed

+302
-48
lines changed

6 files changed

+302
-48
lines changed

README.md

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ ExeKGLib is a Python library that simplifies the construction and execution of M
1111

1212
## 🌟 Features
1313

14-
1. **🔨 Construct** data analytics pipelines that take tabular files (e.g. CSV) as input and process the data using a variety of [available tasks and methods](https://boschresearch.github.io/ExeKGLib/supported-methods/).
14+
1. **🔨 Construct** data analytics pipelines that take tabular files (e.g. CSV) as input and process the data using a variety of [available tasks and methods](https://boschresearch.github.io/ExeKGLib/supported-tasks-and-methods/).
1515
2. **💾 Save** the constructed pipelines as ExeKGs in RDF Turtle format.
1616
3. **▶️ Execute** the generated ExeKGs.
1717

@@ -45,11 +45,17 @@ For detailed installation instructions, refer to the [installation page](https:/
4545
## 🚀 Getting started
4646

4747
[//]: # (--8<-- [start:gettingstarted])
48-
We provide [example Python files and a JSON file](https://github.com/boschresearch/ExeKGLib/tree/main/examples) that can be used to create the following pipelines:
48+
We provide [example Python and JSON files](https://github.com/boschresearch/ExeKGLib/tree/main/examples) that can be used to create the following pipelines:
4949

50-
1. **🧠 ML pipeline**: Loads a CSV dataset, concatenates selected features, splits the data into training and testing sets, trains a Support Vector Classifier model, tests the model, calculates performance metrics (accuracy, F1 score, precision, and recall), and visualizes the results in bar plots.
51-
2. **📊 Statistics pipeline**: Loads a specific feature from a CSV dataset, calculates its mean and standard deviation, and visualizes the feature's values using a line plot and the calculated statistics using a bar plot.
52-
3. **📈 Visualization pipeline**: The pipeline loads two numerical features from a CSV dataset and visualizes each feature's values using separate line plots.
50+
1. **🧠 ML pipeline**:
51+
1. `ml_pipeline_creation[from_json].py` and `MLPipeline.json`: Loads a CSV dataset, concatenates selected features, splits the data into training and testing sets, trains a Support Vector Classifier model, tests the model, calculates performance metrics (accuracy, F1 score, precision, and recall), and visualizes the results in bar plots.
52+
2. `MLPipelineExtended.json`: An extended version of the above ML pipeline that adds a data splitting step for Stratified K-Fold Cross-Validation. Then, it trains and tests the model using the cross-validation technique and visualizes the validation and test F1 scores in bar plots.
53+
2. **📊 Statistics pipeline**:
54+
- `stats_pipeline_creation.py`: Loads a specific feature from a CSV dataset, calculates its mean and standard deviation, and visualizes the feature's values using a line plot and the calculated statistics using a bar plot.
55+
3. **📈 Visualization pipeline**:
56+
- `visu_pipeline_creation.py`: Loads two numerical features from a CSV dataset and visualizes each feature's values using separate line plots.
57+
58+
> 🗒️ **Note**: The naming convention for output names (used as inputs for subsequent tasks) in `.json` files can be found in `exe_kg_lib/utils/string_utils.py`. Look for `TASK_OUTPUT_NAME_REGEX`.
5359
5460
[//]: # (--8<-- [end:gettingstarted])
5561

examples/MLPipeline.json

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "MLPipeline",
3-
"input_data_path": "/PATH/TO/THIS/DIR/data/dummy_data.csv",
4-
"output_plots_dir": "/PATH/TO/THIS/DIR/plots/MLPipeline",
3+
"input_data_path": "/PATH/TO/THIS/DIR/data/dummy_data.csv",
4+
"output_plots_dir": "/PATH/TO/THIS/DIR/plots/MLPipeline",
55
"data_entities": [
66
{
77
"name": "feature_1",
@@ -69,7 +69,7 @@
6969
},
7070
"input_data_entity_dict": {
7171
"DataInDataSplittingX": [
72-
"DataOutConcatenatedData_Concatenation1_ConcatenationMethod"
72+
"DataOutConcatenatedData_Concatenation1_MLPipeline_ConcatenationMethod"
7373
],
7474
"DataInDataSplittingY": [
7575
"label"
@@ -91,10 +91,10 @@
9191
},
9292
"input_data_entity_dict": {
9393
"DataInTrainX": [
94-
"DataOutSplittedTrainDataX_DataSplitting1_TrainTestSplitMethod"
94+
"DataOutSplittedTrainDataX_DataSplitting1_MLPipeline_TrainTestSplitMethod"
9595
],
9696
"DataInTrainY": [
97-
"DataOutSplittedTrainDataY_DataSplitting1_TrainTestSplitMethod"
97+
"DataOutSplittedTrainDataY_DataSplitting1_MLPipeline_TrainTestSplitMethod"
9898
]
9999
},
100100
"output_names": [
@@ -108,10 +108,10 @@
108108
"method_params_dict": {},
109109
"input_data_entity_dict": {
110110
"DataInTestModel": [
111-
"DataOutTrainModel_BinaryClassification1_SVCMethod"
111+
"DataOutTrainModel_BinaryClassification1_MLPipeline_SVCMethod"
112112
],
113113
"DataInTestX": [
114-
"DataOutSplittedTestDataX_DataSplitting1_TrainTestSplitMethod"
114+
"DataOutSplittedTestDataX_DataSplitting1_MLPipeline_TrainTestSplitMethod"
115115
]
116116
},
117117
"output_names": [
@@ -125,10 +125,10 @@
125125
"method_params_dict": {},
126126
"input_data_entity_dict": {
127127
"DataInRealY": [
128-
"DataOutSplittedTestDataY_DataSplitting1_TrainTestSplitMethod"
128+
"DataOutSplittedTestDataY_DataSplitting1_MLPipeline_TrainTestSplitMethod"
129129
],
130130
"DataInPredictedY": [
131-
"DataOutPredictedValueTest_Test1_TestMethod"
131+
"DataOutPredictedValueTest_Test1_MLPipeline_TestMethod"
132132
]
133133
},
134134
"output_names": [
@@ -142,10 +142,10 @@
142142
"method_params_dict": {},
143143
"input_data_entity_dict": {
144144
"DataInRealY": [
145-
"DataOutSplittedTestDataY_DataSplitting1_TrainTestSplitMethod"
145+
"DataOutSplittedTestDataY_DataSplitting1_MLPipeline_TrainTestSplitMethod"
146146
],
147147
"DataInPredictedY": [
148-
"DataOutPredictedValueTest_Test1_TestMethod"
148+
"DataOutPredictedValueTest_Test1_MLPipeline_TestMethod"
149149
]
150150
},
151151
"output_names": [
@@ -159,10 +159,10 @@
159159
"method_params_dict": {},
160160
"input_data_entity_dict": {
161161
"DataInRealY": [
162-
"DataOutSplittedTestDataY_DataSplitting1_TrainTestSplitMethod"
162+
"DataOutSplittedTestDataY_DataSplitting1_MLPipeline_TrainTestSplitMethod"
163163
],
164164
"DataInPredictedY": [
165-
"DataOutPredictedValueTest_Test1_TestMethod"
165+
"DataOutPredictedValueTest_Test1_MLPipeline_TestMethod"
166166
]
167167
},
168168
"output_names": [
@@ -176,10 +176,10 @@
176176
"method_params_dict": {},
177177
"input_data_entity_dict": {
178178
"DataInRealY": [
179-
"DataOutSplittedTestDataY_DataSplitting1_TrainTestSplitMethod"
179+
"DataOutSplittedTestDataY_DataSplitting1_MLPipeline_TrainTestSplitMethod"
180180
],
181181
"DataInPredictedY": [
182-
"DataOutPredictedValueTest_Test1_TestMethod"
182+
"DataOutPredictedValueTest_Test1_MLPipeline_TestMethod"
183183
]
184184
},
185185
"output_names": [
@@ -206,8 +206,8 @@
206206
},
207207
"input_data_entity_dict": {
208208
"DataInToPlot": [
209-
"DataOutScore_PerformanceCalculation1_AccuracyScoreMethod",
210-
"DataOutScore_PerformanceCalculation2_F1ScoreMethod"
209+
"DataOutScore_PerformanceCalculation1_MLPipeline_AccuracyScoreMethod",
210+
"DataOutScore_PerformanceCalculation2_MLPipeline_F1ScoreMethod"
211211
]
212212
},
213213
"output_names": []
@@ -221,8 +221,8 @@
221221
},
222222
"input_data_entity_dict": {
223223
"DataInToPlot": [
224-
"DataOutScore_PerformanceCalculation3_PrecisionScoreMethod",
225-
"DataOutScore_PerformanceCalculation4_RecallScoreMethod"
224+
"DataOutScore_PerformanceCalculation3_MLPipeline_PrecisionScoreMethod",
225+
"DataOutScore_PerformanceCalculation4_MLPipeline_RecallScoreMethod"
226226
]
227227
},
228228
"output_names": []

examples/MLPipelineExtended.json

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
{
2+
"name": "MLPipelineExtended",
3+
"input_data_path": "/PATH/TO/THIS/DIR/data/dummy_data.csv",
4+
"output_plots_dir": "/PATH/TO/THIS/DIR/plots/MLPipelineExtended",
5+
"data_entities": [
6+
{
7+
"name": "feature_1",
8+
"source": "feature_1",
9+
"data_semantics": "Numerical",
10+
"data_structure": "Vector"
11+
},
12+
{
13+
"name": "feature_2",
14+
"source": "feature_2",
15+
"data_semantics": "Numerical",
16+
"data_structure": "Vector"
17+
},
18+
{
19+
"name": "feature_3",
20+
"source": "feature_3",
21+
"data_semantics": "Numerical",
22+
"data_structure": "Vector"
23+
},
24+
{
25+
"name": "feature_4",
26+
"source": "feature_4",
27+
"data_semantics": "Numerical",
28+
"data_structure": "Vector"
29+
},
30+
{
31+
"name": "feature_5",
32+
"source": "feature_5",
33+
"data_semantics": "Numerical",
34+
"data_structure": "Vector"
35+
},
36+
{
37+
"name": "label",
38+
"source": "label",
39+
"data_semantics": "Categorical",
40+
"data_structure": "Vector"
41+
}
42+
],
43+
"tasks": [
44+
{
45+
"kg_schema_short": "ml",
46+
"task_type": "Concatenation",
47+
"method_type": "ConcatenationMethod",
48+
"method_params_dict": {},
49+
"input_data_entity_dict": {
50+
"DataInConcatenation": [
51+
"feature_1",
52+
"feature_2",
53+
"feature_3",
54+
"feature_4",
55+
"feature_5"
56+
]
57+
},
58+
"output_names": [
59+
"DataOutConcatenatedData"
60+
]
61+
},
62+
{
63+
"kg_schema_short": "ml",
64+
"task_type": "DataSplitting",
65+
"method_type": "TrainTestSplitMethod",
66+
"method_params_dict": {
67+
"hasParamTestSize": 0.2,
68+
"hasParamRandomState": 0
69+
},
70+
"input_data_entity_dict": {
71+
"DataInDataSplittingX": [
72+
"DataOutConcatenatedData_Concatenation1_MLPipeline_ConcatenationMethod"
73+
],
74+
"DataInDataSplittingY": [
75+
"label"
76+
]
77+
},
78+
"output_names": [
79+
"DataOutSplittedTestDataX",
80+
"DataOutSplittedTestDataY",
81+
"DataOutSplittedTrainDataX",
82+
"DataOutSplittedTrainDataY"
83+
]
84+
},
85+
{
86+
"kg_schema_short": "ml",
87+
"task_type": "DataSplitting",
88+
"method_type": "StratifiedKFoldMethod",
89+
"method_params_dict": {
90+
"hasParamNSplits": 3
91+
},
92+
"input_data_entity_dict": {
93+
"DataInDataSplittingX": [
94+
"DataOutSplittedTrainDataX_DataSplitting1_MLPipeline_TrainTestSplitMethod"
95+
],
96+
"DataInDataSplittingY": [
97+
"DataOutSplittedTrainDataY_DataSplitting1_MLPipeline_TrainTestSplitMethod"
98+
]
99+
},
100+
"output_names": [
101+
"DataOutSplittedTestDataX",
102+
"DataOutSplittedTestDataY",
103+
"DataOutSplittedTrainDataX",
104+
"DataOutSplittedTrainDataY"
105+
]
106+
},
107+
{
108+
"kg_schema_short": "ml",
109+
"task_type": "BinaryClassification",
110+
"method_type": "SVCMethod",
111+
"method_params_dict": {
112+
"hasParamRandomState": 0
113+
},
114+
"input_data_entity_dict": {
115+
"DataInTrainX": [
116+
"DataOutSplittedTrainDataX_DataSplitting2_MLPipeline_StratifiedKFoldMethod"
117+
],
118+
"DataInTrainY": [
119+
"DataOutSplittedTrainDataY_DataSplitting2_MLPipeline_StratifiedKFoldMethod"
120+
]
121+
},
122+
"output_names": [
123+
"DataOutTrainModel"
124+
]
125+
},
126+
{
127+
"kg_schema_short": "ml",
128+
"task_type": "Test",
129+
"method_type": "TestMethod",
130+
"method_params_dict": {},
131+
"input_data_entity_dict": {
132+
"DataInTestModel": [
133+
"DataOutTrainModel_BinaryClassification1_MLPipeline_SVCMethod"
134+
],
135+
"DataInTestX": [
136+
"DataOutSplittedTestDataX_DataSplitting2_MLPipeline_StratifiedKFoldMethod"
137+
]
138+
},
139+
"output_names": [
140+
"DataOutPredictedValueTest"
141+
]
142+
},
143+
{
144+
"kg_schema_short": "ml",
145+
"task_type": "Test",
146+
"method_type": "TestMethod",
147+
"method_params_dict": {},
148+
"input_data_entity_dict": {
149+
"DataInTestModel": [
150+
"DataOutTrainModel_BinaryClassification1_MLPipeline_SVCMethod"
151+
],
152+
"DataInTestX": [
153+
"DataOutSplittedTestDataX_DataSplitting1_MLPipeline_TrainTestSplitMethod"
154+
]
155+
},
156+
"output_names": [
157+
"DataOutPredictedValueTest"
158+
]
159+
},
160+
{
161+
"kg_schema_short": "ml",
162+
"task_type": "PerformanceCalculation",
163+
"method_type": "F1ScoreMethod",
164+
"method_params_dict": {},
165+
"input_data_entity_dict": {
166+
"DataInRealY": [
167+
"DataOutSplittedTestDataY_DataSplitting2_MLPipeline_StratifiedKFoldMethod"
168+
],
169+
"DataInPredictedY": [
170+
"DataOutPredictedValueTest_Test1_MLPipeline_TestMethod"
171+
]
172+
},
173+
"output_names": [
174+
"DataOutScore"
175+
]
176+
},
177+
{
178+
"kg_schema_short": "ml",
179+
"task_type": "PerformanceCalculation",
180+
"method_type": "F1ScoreMethod",
181+
"method_params_dict": {},
182+
"input_data_entity_dict": {
183+
"DataInRealY": [
184+
"DataOutSplittedTestDataY_DataSplitting1_MLPipeline_TrainTestSplitMethod"
185+
],
186+
"DataInPredictedY": [
187+
"DataOutPredictedValueTest_Test2_MLPipeline_TestMethod"
188+
]
189+
},
190+
"output_names": [
191+
"DataOutScore"
192+
]
193+
},
194+
{
195+
"kg_schema_short": "visu",
196+
"task_type": "CanvasCreation",
197+
"method_type": "CanvasMethod",
198+
"method_params_dict": {
199+
"hasParamLayout": "2 1",
200+
"hasParamFigureSize": "10 10"
201+
},
202+
"input_data_entity_dict": {},
203+
"output_names": []
204+
},
205+
{
206+
"kg_schema_short": "visu",
207+
"task_type": "BarPlotting",
208+
"method_type": "BarMethod",
209+
"method_params_dict": {
210+
"hasParamTitle": "Validation F1-score and Test F1-score"
211+
},
212+
"input_data_entity_dict": {
213+
"DataInToPlot": [
214+
"DataOutScore_PerformanceCalculation1_MLPipeline_F1ScoreMethod",
215+
"DataOutScore_PerformanceCalculation2_MLPipeline_F1ScoreMethod"
216+
]
217+
},
218+
"output_names": []
219+
}
220+
]
221+
}

exe_kg_lib/classes/exe_kg_mixins/exe_kg_construction_mixin.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,7 @@ def add_task(
200200
method_parent.iri, self.top_level_schema.namespace_prefix, self.input_kg
201201
)
202202

203+
initial_method_params_dict = method_params_dict.copy()
203204
provided_params_num = len(method_params_dict)
204205
added_params_num = 0
205206
# add data properties to the task with given values
@@ -227,7 +228,7 @@ def add_task(
227228
kg_schema_short,
228229
task,
229230
method,
230-
method_params_dict,
231+
initial_method_params_dict,
231232
input_data_entity_dict,
232233
output_names,
233234
)

0 commit comments

Comments
 (0)