Skip to content

Commit 7ae88db

Browse files
authored
Merge pull request #754 from EducationalTestingService/fix-yaml
Fix calls to yaml.load
2 parents b10ce39 + 6566dbc commit 7ae88db

File tree

4 files changed

+22
-17
lines changed

4 files changed

+22
-17
lines changed

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ select = ["D", "E", "F", "I"]
88
ignore = ["D212"]
99
line-length = 100
1010
target-version = "py38"
11+
fix = true

skll/config/__init__.py

+13-10
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from typing import Any, Dict, List, Optional, Set, Tuple, Union
1919

2020
import numpy as np
21-
import ruamel.yaml as yaml
21+
from ruamel.yaml import YAML
2222

2323
from skll.data.readers import safe_float
2424
from skll.types import ClassMap, FoldMapping, LabelType, PathOrStr
@@ -610,7 +610,9 @@ def parse_config_file(
610610
raise ValueError(
611611
"Configuration file does not contain list of learners " "in [Input] section."
612612
)
613-
learners = yaml.safe_load(fix_json(learners_string))
613+
614+
yaml = YAML(typ="safe", pure=True)
615+
learners = yaml.load(fix_json(learners_string))
614616

615617
if len(learners) == 0:
616618
raise ValueError(
@@ -630,7 +632,7 @@ def parse_config_file(
630632
custom_metric_path = locate_file(config.get("Input", "custom_metric_path"), config_dir)
631633

632634
# get the featuresets
633-
featuresets = yaml.safe_load(config.get("Input", "featuresets"))
635+
featuresets = yaml.load(config.get("Input", "featuresets"))
634636

635637
# ensure that featuresets is either a list of features or a list of lists
636638
# of features
@@ -641,7 +643,7 @@ def parse_config_file(
641643
f"specified: {featuresets}"
642644
)
643645

644-
featureset_names = yaml.safe_load(fix_json(config.get("Input", "featureset_names")))
646+
featureset_names = yaml.load(fix_json(config.get("Input", "featureset_names")))
645647

646648
# ensure that featureset_names is a list of strings, if specified
647649
if featureset_names:
@@ -658,7 +660,7 @@ def parse_config_file(
658660
# learners. If it's not specified, then we just assume
659661
# that we are using 10 folds for each learner.
660662
learning_curve_cv_folds_list_string = config.get("Input", "learning_curve_cv_folds_list")
661-
learning_curve_cv_folds_list = yaml.safe_load(fix_json(learning_curve_cv_folds_list_string))
663+
learning_curve_cv_folds_list = yaml.load(fix_json(learning_curve_cv_folds_list_string))
662664
if len(learning_curve_cv_folds_list) == 0:
663665
learning_curve_cv_folds_list = [10] * len(learners)
664666
else:
@@ -679,7 +681,7 @@ def parse_config_file(
679681
# floats (proportions). If it's not specified, then we just
680682
# assume that we are using np.linspace(0.1, 1.0, 5).
681683
learning_curve_train_sizes_string = config.get("Input", "learning_curve_train_sizes")
682-
learning_curve_train_sizes = yaml.safe_load(fix_json(learning_curve_train_sizes_string))
684+
learning_curve_train_sizes = yaml.load(fix_json(learning_curve_train_sizes_string))
683685
if len(learning_curve_train_sizes) == 0:
684686
learning_curve_train_sizes = np.linspace(0.1, 1.0, 5).tolist()
685687
else:
@@ -698,9 +700,9 @@ def parse_config_file(
698700
# do we need to shuffle the training data
699701
do_shuffle = config.getboolean("Input", "shuffle")
700702

701-
fixed_parameter_list = yaml.safe_load(fix_json(config.get("Input", "fixed_parameters")))
702-
fixed_sampler_parameters = yaml.safe_load(fix_json(config.get("Input", "sampler_parameters")))
703-
param_grid_list = yaml.safe_load(fix_json(config.get("Tuning", "param_grids")))
703+
fixed_parameter_list = yaml.load(fix_json(config.get("Input", "fixed_parameters")))
704+
fixed_sampler_parameters = yaml.load(fix_json(config.get("Input", "sampler_parameters")))
705+
param_grid_list = yaml.load(fix_json(config.get("Tuning", "param_grids")))
704706

705707
# read and normalize the value of `pos_label`
706708
pos_label_string = safe_float(config.get("Tuning", "pos_label"))
@@ -804,7 +806,8 @@ def parse_config_file(
804806

805807
# Get class mapping dictionary if specified
806808
class_map_string = config.get("Input", "class_map")
807-
original_class_map = yaml.safe_load(fix_json(class_map_string))
809+
yaml = YAML(typ="safe", pure=True)
810+
original_class_map = yaml.load(fix_json(class_map_string))
808811
if original_class_map:
809812
# Change class_map to map from originals to replacements instead of
810813
# from replacement to list of originals

skll/config/utils.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from pathlib import Path
1414
from typing import Iterable, List, Union
1515

16-
import ruamel.yaml as yaml
16+
from ruamel.yaml import YAML
1717

1818
from skll.types import FoldMapping, PathOrStr
1919

@@ -186,7 +186,8 @@ def _parse_and_validate_metrics(metrics: str, option_name: str, logger=None) ->
186186

187187
# make sure the given metrics data type is a list
188188
# and parse it correctly
189-
metrics = yaml.safe_load(fix_json(metrics))
189+
yaml = YAML(typ="safe", pure=True)
190+
metrics = yaml.load(fix_json(metrics))
190191
if not isinstance(metrics, list):
191192
raise TypeError(f"{option_name} should be a list, not a " f"{type(metrics)}.")
192193

skll/experiments/output.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222
import matplotlib.pyplot as plt
2323
import numpy as np
2424
import pandas as pd
25-
import ruamel.yaml as yaml
2625
import seaborn as sns
26+
from ruamel.yaml import YAML
2727

2828
from skll.types import FoldMapping, PathOrStr
2929
from skll.utils.logging import get_skll_logger
@@ -638,6 +638,8 @@ def _write_summary_file(result_json_paths: List[str], output_file: IO[str], abla
638638
# Map from feature set names to all features in them
639639
all_features = defaultdict(set)
640640
logger = get_skll_logger("experiment")
641+
yaml = YAML(typ="safe", pure=True)
642+
641643
for json_path_str in result_json_paths:
642644
json_path = Path(json_path_str)
643645
if not json_path.exists():
@@ -654,7 +656,7 @@ def _write_summary_file(result_json_paths: List[str], output_file: IO[str], abla
654656
featureset_name = obj[0]["featureset_name"]
655657
if ablation != 0 and "_minus_" in featureset_name:
656658
parent_set = featureset_name.split("_minus_", 1)[0]
657-
all_features[parent_set].update(yaml.safe_load(obj[0]["featureset"]))
659+
all_features[parent_set].update(yaml.load(obj[0]["featureset"]))
658660
learner_result_dicts.extend(obj)
659661

660662
# Build and write header
@@ -670,9 +672,7 @@ def _write_summary_file(result_json_paths: List[str], output_file: IO[str], abla
670672
featureset_name = lrd["featureset_name"]
671673
if ablation != 0:
672674
parent_set = featureset_name.split("_minus_", 1)[0]
673-
ablated_features = all_features[parent_set].difference(
674-
yaml.safe_load(lrd["featureset"])
675-
)
675+
ablated_features = all_features[parent_set].difference(yaml.load(lrd["featureset"]))
676676
lrd["ablated_features"] = ""
677677
if ablated_features:
678678
lrd["ablated_features"] = json.dumps(sorted(ablated_features))

0 commit comments

Comments (0)