66
77#include < shogun/io/OpenMLFlow.h>
88#include < shogun/util/factory.h>
9+ #include < shogun/labels/Labels.h>
910
1011#include < rapidjson/document.h>
1112#ifdef HAVE_CURL
13+ #include " OpenMLFlow.h"
1214#include < curl/curl.h>
15+
1316#endif // HAVE_CURL
1417
1518using namespace shogun ;
@@ -39,6 +42,7 @@ size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in)
3942/* OpenML server format */
// Base URLs for the individual OpenML endpoints; these constants are the
// values stored in OpenMLReader::m_format_options.
4043const char * OpenMLReader::xml_server = " https://www.openml.org/api/v1/xml" ;
4144const char * OpenMLReader::json_server = " https://www.openml.org/api/v1/json" ;
// NOTE(review): the download server has no base URL of its own; presumably
// the caller supplies a complete URL for downloads -- confirm against
// OpenMLReader::get.
45+ const char * OpenMLReader::download_server = " " ;
4246const char * OpenMLReader::splits_server = " https://www.openml.org/api_splits" ;
4347
4448/* DATA API */
@@ -58,7 +62,8 @@ const char* OpenMLReader::get_split = "/get/{}";
// Lookup table from a format keyword ("xml", "json", "split", "download") to
// the matching server constant of OpenMLReader.
5862const std::unordered_map<std::string, std::string>
5963 OpenMLReader::m_format_options = {{" xml" , xml_server},
6064 {" json" , json_server},
61- {" split" , splits_server}};
65+ {" split" , splits_server},
66+ {" download" , download_server}};
6267const std::unordered_map<std::string, std::string>
6368 OpenMLReader::m_request_options = {
6469 {" dataset_description" , dataset_description},
@@ -298,7 +303,7 @@ std::shared_ptr<OpenMLFlow> OpenMLFlow::from_file()
298303}
299304
300305std::shared_ptr<OpenMLData>
301- OpenMLData::get_data (const std::string& id, const std::string& api_key)
306+ OpenMLData::get_dataset (const std::string& id, const std::string& api_key)
302307{
303308 // description
304309 Document document;
@@ -408,27 +413,189 @@ OpenMLData::get_data(const std::string& id, const std::string& api_key)
408413 default_target_attribute, row_id_attribute, ignore_attribute,
409414 version_label, citation, tags, visibility, original_data_url, paper_url,
410415 update_comment, md5_checksum, param_vector, qualities_vector);
416+ result->set_api_key (api_key);
417+ return result;
418+ }
411419
420+ std::shared_ptr<CCombinedFeatures> OpenMLData::get_features () noexcept
421+ {
422+ if (!m_cached_features)
423+ get_data ();
424+ return m_cached_features;
425+ }
426+
427+ std::shared_ptr<CCombinedFeatures> OpenMLData::get_features (const std::string& label)
428+ {
429+ auto find_label =
430+ std::find (m_feature_names.begin (), m_feature_names.end (), label);
431+ if (find_label == m_feature_names.end ())
432+ SG_SERROR (
433+ " Requested label \" %s\" not in the dataset!\n " , label.c_str ())
434+ if (!m_cached_features)
435+ get_data ();
436+ auto col_idx = std::distance (m_feature_names.begin (), find_label);
437+ auto result = std::shared_ptr<CCombinedFeatures>(m_cached_features->clone ()->as <CCombinedFeatures>());
438+ if (result->delete_feature_obj (col_idx))
439+ SG_SERROR (" Error deleting the label column in CombinedFeatures!\n " )
412440 return result;
413441}
414442
415- std::string OpenMLData::get_data_buffer ( const std::string& api_key )
443+ std::shared_ptr<CLabels> OpenMLData::get_labels ( )
416444{
417- SG_SNOTIMPLEMENTED;
445+ REQUIRE (
446+ !m_default_target_attribute.empty (),
447+ " A default target attribute is required if no label is given!\n " )
448+ return get_labels (m_default_target_attribute);
449+ }
450+
451+ std::shared_ptr<CLabels> OpenMLData::get_labels (const std::string& label_name)
452+ {
453+ auto find_label =
454+ std::find (m_feature_names.begin (), m_feature_names.end (), label_name);
455+ if (find_label == m_feature_names.end ())
456+ SG_SERROR (
457+ " Requested label \" %s\" not in the dataset!\n " , label_name.c_str ())
458+ auto col_idx = std::distance (m_feature_names.begin (), find_label);
459+
460+ if (!m_cached_features)
461+ get_data ();
462+
463+ auto target_label_as_feat =
464+ std::shared_ptr<CFeatures>(m_cached_features->get_feature_obj (col_idx));
465+
466+ // TODO: replace with actual enum values
467+ switch (m_feature_types[col_idx])
468+ {
469+ // real features
470+ case 0 :
471+ {
472+ auto casted_feat = std::dynamic_pointer_cast<CDenseFeatures<float64_t >>(target_label_as_feat);
473+ auto labels_vec = casted_feat->get_feature_vector (0 );
474+ auto labels = std::make_shared<CRegressionLabels>();
475+ labels->set_values (labels_vec);
476+ return labels;
477+ } break ;
478+ // nominal features
479+ case 1 :
480+ {
481+ auto casted_feat = std::dynamic_pointer_cast<CDenseFeatures<float64_t >>(target_label_as_feat);
482+ auto labels_vec = casted_feat->get_feature_vector (0 );
483+ auto labels = std::make_shared<CMulticlassLabels>();
484+ labels->set_values (labels_vec);
485+ return labels;
486+ } break ;
487+ default :
488+ SG_SERROR (" Unknown type for label \" %s\" !\n " , label_name.c_str ())
489+ }
490+
418491 return nullptr ;
419492}
420493
494+ void OpenMLData::get_data ()
495+ {
496+ auto reader = OpenMLReader (m_api_key);
497+ auto return_string = reader.get (m_url);
498+
499+ // TODO: add ARFF parsing and don't forget feature names and feature types
500+ m_cached_features = std::make_shared<CCombinedFeatures>();
501+ }
502+
421503std::shared_ptr<OpenMLSplit>
422504OpenMLSplit::get_split (const std::string& split_url, const std::string& api_key)
423505{
424- Document document;
425-
426506 auto reader = OpenMLReader (api_key);
427507 auto return_string = reader.get (" get_split" , " split" , split_url);
508+
509+ if (return_string == " Task not providing datasplits." )
510+ return std::make_shared<OpenMLSplit>();
511+
428512 auto return_stream = std::istringstream (return_string);
429- // add ARFF parsing here
430- SG_SNOTIMPLEMENTED
431- return nullptr ;
513+ // TODO: add ARFF parsing here
514+ // get train/test indices
515+ // TODO: replace line below with ARFFDeserialiser::get_features()
516+ auto arff_features = std::make_shared<CCombinedFeatures>();
517+ REQUIRE (
518+ arff_features->get_num_feature_obj () == 4 ,
519+ " Expected a ARFF file with 4 attributes: type, rowid, repeat and "
520+ " fold.\n " )
521+
522+ auto train_test_feat =
523+ std::shared_ptr<CFeatures>(arff_features->get_feature_obj (0 ));
524+ auto rowid_feat =
525+ std::shared_ptr<CFeatures>(arff_features->get_feature_obj (1 ));
526+ auto repeat_feat =
527+ std::shared_ptr<CFeatures>(arff_features->get_feature_obj (2 ));
528+ auto fold_feat =
529+ std::shared_ptr<CFeatures>(arff_features->get_feature_obj (3 ));
530+
531+ auto type_vector = string_feature_to_vector (train_test_feat);
532+ auto rowid_vector = dense_feature_to_vector (rowid_feat);
533+ auto repeat_vector = dense_feature_to_vector (repeat_feat);
534+ auto fold_vector = dense_feature_to_vector (fold_feat);
535+
536+ std::vector<std::vector<int64_t >> train_idx, test_idx;
537+ for (int i = 0 ; i < arff_features->get_num_vectors (); ++i)
538+ {
539+ if (type_vector[i] == LabelType::TRAIN)
540+ train_idx.emplace_back (std::initializer_list<int64_t >{
541+ static_cast <int64_t >(rowid_vector[i]),
542+ static_cast <int64_t >(repeat_vector[i]),
543+ static_cast <int64_t >(fold_vector[i])});
544+ else
545+ test_idx.emplace_back (std::initializer_list<int64_t >{
546+ static_cast <int64_t >(rowid_vector[i]),
547+ static_cast <int64_t >(repeat_vector[i]),
548+ static_cast <int64_t >(fold_vector[i])});
549+ }
550+
551+ return std::make_shared<OpenMLSplit>(train_idx, test_idx);
552+ }
553+
554+ SGVector<float64_t >
555+ OpenMLSplit::dense_feature_to_vector (const std::shared_ptr<CFeatures>& feat)
556+ {
557+ auto casted_feat =
558+ std::dynamic_pointer_cast<CDenseFeatures<float64_t >>(feat);
559+ // this should never happen
560+ if (!casted_feat)
561+ SG_SERROR (" Error casting a column in the split file from CFeatures to "
562+ " CDenseFeatures!\n >" );
563+ return casted_feat->get_feature_vector (0 );
564+ }
565+
566+ std::vector<OpenMLSplit::LabelType>
567+ OpenMLSplit::string_feature_to_vector (const std::shared_ptr<CFeatures>& feat)
568+ {
569+ auto casted_feat = std::dynamic_pointer_cast<CStringFeatures<char >>(feat);
570+ // this should never happen
571+ if (!casted_feat)
572+ SG_SERROR (" Error casting a column in the split file from CFeatures to "
573+ " CStringFeatures!\n " );
574+
575+ auto to_lower = [](const std::string& line) {
576+ std::string result;
577+ std::transform (
578+ line.begin (), line.end (), std::back_inserter (result),
579+ [](uint8_t val) { return std::tolower (val); });
580+ return result;
581+ };
582+
583+ std::vector<OpenMLSplit::LabelType> result;
584+
585+ for (int i = 0 ; i < casted_feat->get_num_vectors (); ++i)
586+ {
587+ auto row = casted_feat->get_feature_vector (i);
588+ std::string label (1 , row[0 ]);
589+ for (auto j = 1 ; j < casted_feat->get_max_vector_length (); ++j)
590+ label.append (1 , row[j]);
591+ if (to_lower (label) == " train" )
592+ result.push_back (LabelType::TRAIN);
593+ else if (to_lower (label) == " test" )
594+ result.push_back (LabelType::TEST);
595+ else
596+ SG_SERROR (" Unknown label type in split file %s!\n " , label.c_str ())
597+ }
598+ return result;
432599}
433600
434601std::shared_ptr<OpenMLTask>
@@ -473,7 +640,7 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key)
473640 std::string dataset_id = dataset_info[" data_set_id" ].GetString ();
474641 std::string target_feature =
475642 dataset_info[" target_feature" ].GetString ();
476- openml_dataset = OpenMLData::get_data (dataset_id, api_key);
643+ openml_dataset = OpenMLData::get_dataset (dataset_id, api_key);
477644 }
478645 else if (
479646 strcmp (task_settings[" name" ].GetString (), " estimation_procedure" ) ==
@@ -496,8 +663,11 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key)
496663 " Unexpected number of parameters in parameter array "
497664 " of estimation_procedure.\n " )
498665 }
499- openml_split = std::make_shared<OpenMLSplit>(
500- split_id, split_type, split_url, split_parameters);
666+ REQUIRE (
667+ split_type == " crossvalidation" ,
668+ " Currently only tasks with cross validation are enabled in "
669+ " shogun!\n " )
670+ openml_split = OpenMLSplit::get_split (split_url, api_key);
501671 }
502672 else if (
503673 strcmp (task_settings[" name" ].GetString (), " evaluation_measures" ) ==
@@ -877,7 +1047,16 @@ std::shared_ptr<OpenMLRun> OpenMLRun::run_flow_on_task(
8771047 std::shared_ptr<OpenMLFlow> flow, std::shared_ptr<OpenMLTask> task)
8781048{
8791049 auto data = task->get_dataset ();
880- SG_SNOTIMPLEMENTED
1050+ std::shared_ptr<CFeatures> train_features, test_features;
1051+ std::shared_ptr<CLabels> train_labels, test_labels;
1052+
1053+ if (task->get_split ()->contains_splits ())
1054+ SG_SNOTIMPLEMENTED
1055+ else
1056+ {
1057+ auto labels = data->get_labels ();
1058+ auto feat = data->get_features ();
1059+ }
8811060 return std::shared_ptr<OpenMLRun>();
8821061}
8831062
0 commit comments